# Wikipedia Analysis

This notebook is NOT a skeleton. It is a sample set of instructions for analysing, in the classroom, the Wikipedia dataset provided by Databricks.

During the class we will fill in cells to implement the PageRank algorithm.

In [0]:
import pandas as pd
import re

In [0]:
from pyspark.sql.types import *
from pyspark.sql.types import ArrayType, StringType,LongType
from pyspark.sql.functions import size, explode, collect_list 

In [0]:
# Turn on Arrow-based transfers between Spark and pandas for this session.
# NOTE(review): Spark 3.x deprecates this key in favour of
# "spark.sql.execution.arrow.pyspark.enabled" — confirm against the cluster's Spark version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [0]:
# Load the English Wikipedia articles (Parquet) from the Databricks sample datasets on DBFS.
wikipediaDF=spark.read.parquet("dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/articles-only-parquet")

In [0]:
# Total number of articles in the full dataset; used as N in the PageRank formula further down.
N=wikipediaDF.count()

In [0]:
# Work on a small sample (fraction 0.0001, fixed seed 0) and mark it for caching,
# since several later cells read it.
PartialWikipediaDF = (
  wikipediaDF
  .sample(fraction=0.0001, seed=0)
  .cache()
)

In [0]:
# Number of articles that ended up in the sample.
PartialWikipediaDF.count()

In [0]:
# Show the sampled articles (display is presumably the Databricks notebook built-in).
display(PartialWikipediaDF)

In [0]:
def parse_links(document_body):
  """Extract the targets of the ``[[...]]`` wiki links found in an article body.

  For piped links (``[[Target|label]]``) only the target is kept, and a section
  anchor (``[[Target#Section]]``) is dropped, so every returned value can be
  compared directly against lower-cased article titles.

  Args:
    document_body: wiki markup of one article. ``None`` or an empty string
      yields an empty list instead of raising.

  Returns:
    A list of lower-cased link targets in order of appearance. Duplicates are
    kept; links whose target is empty after normalisation are skipped.
  """
  if not document_body:
    return []
  links = []
  for raw_link in re.findall(r'\[\[(.+?)\]\]', document_body):
    # "Target|label" -> "Target", then "Target#Section" -> "Target".
    target = raw_link.split('|', 1)[0].split('#', 1)[0].strip().lower()
    if target:
      links.append(target)
  return links

In [0]:
# Sample article body used to try out parse_links. It is written as adjacent string
# literals inside parentheses so the long text can span several source lines; the
# pieces are concatenated into one string without any line breaks.
test = (
  "{{Use Indian English|date=April 2015}} "
  "{{Infobox person | name = Shavez Khan | image = | caption = | birth_date = | birth_place = India "
  "| nationality = India | residence = [[Mumbai]], India | occupation = [[Actor]] "
  "| years_active = present | height = }} "
  "'''Shavez Khan''' is an [[India]]n television [[actor]]. "
  "He has done his roles in various Indian television shows like Shaitaan,"
  "<ref>{{cite web|url=http://www.tellychakkar.com/tv/tv-news/shavez-khan-feature-episodic-of-colors-shaitaan"
  "|title=Shavez Khan to feature in an episodic of Colors' Shaitaan|work=Tellychakkar"
  "|date=11 April 2013|accessdate=24 April 2015}}</ref> "
  "[[Encounter (Indian TV series)|Encounter]], [[Ek Hasina Thi (TV series)|Ek Hasina Thi]], [[Savdhaan India]],"
  "<ref>{{cite web|url=http://www.tellychakkar.com/tv/tv-news/shavez-khan-anshul-singh-and-damini-joshi-episodic-of-savdhan-india-140915"
  "|title=Shavez Khan, Anshul Singh and Damini Joshi in an episodic of Savdhan India|work=Tellychakkar"
  "|date=15 September 2014|accessdate=24 April 2015}}</ref> "
  "[[SuperCops vs Supervillains]],"
  "<ref>{{cite web|url=http://www.tellychakkar.com/tv/tv-news/rituraj-singh-and-shavez-khan-life-oks-shapath-141009"
  "|title=Rituraj Singh and Shavez Khan in Life OK's Shapath|work=Tellychakkar"
  "|date=9 October 2014|accessdate=24 April 2015}}</ref> "
  "Pyaar Ka The End,"
  "<ref>{{cite web|url=http://www.tellychakkar.com/tv/tv-news/shavez-khan-bindass-pyaar-ka-the-end-141029"
  "|title=Shavez Khan in Bindass' Pyaar Ka The End|work=Tellychakkar"
  "|date=29 October 2014|accessdate=24 April 2015}}</ref> "
  "[[Pyaar Kii Ye Ek Kahaani]], [[MTV Fanaah]], [[Crime Patrol (TV series)|Crime Patrol]]. "
  "He has played his recent role in [[Sony Entertainment Television (India)|Sony TV]]'s "
  "[[C.I.D. (Indian TV series)|CID]]."
  "<ref>{{cite web|url=http://www.tellychakkar.com/tv/tv-news/shavez-khan-sony-tvs-cid-150417"
  "|title=Shavez Khan in Sony TV's CID|work=Tellychakkar"
  "|date=17 April 2015|accessdate=24 April 2015}}</ref> "
  "==Television== "
  "*[[Colors (TV channel)|Colors]]'s Shaitaan "
  "*[[Sony Entertainment Television (India)|Sony TV]]'s [[Encounter (Indian TV series)|Encounter]], "
  "[[Crime Patrol (TV series)|Crime Patrol]] & [[C.I.D. (Indian TV series)|CID]] "
  "*[[Star Plus]]'s [[Ek Hasina Thi (TV series)|Ek Hasina Thi]] "
  "*[[Life OK]]'s [[Savdhaan India]] & [[SuperCops vs Supervillains]] "
  "*[[Bindass]]' Pyaar Ka The End "
  "*[[Star One]]'s [[Pyaar Kii Ye Ek Kahaani]] "
  "*[[MTV]]'s [[MTV Fanaah]] "
  "==References== {{Reflist}} ==External links== "
  "{{Persondata | NAME = Khan, Shavez | ALTERNATIVE NAMES = "
  "| SHORT DESCRIPTION = Indian model and television actor "
  "| DATE OF BIRTH = <!--Birth date has been contested. Do not add without providing a reliably "
  "published source with a reputation for editorial oversight--> "
  "| PLACE OF BIRTH = India | DATE OF DEATH = | PLACE OF DEATH = }} "
  "{{DEFAULTSORT:Khan, Shavez}} "
  "[[Category:Living people]] [[Category:Indian male television actors]] "
  "[[Category:Actors in Hindi television]] [[Category:Indian television personalities]]"
)

In [0]:
# Try the parser on the sample article; the cell output is the list of extracted links.
parse_links(test)

In [0]:
# Spark UDF wrapper around parse_links: one article body in, an array of link strings out.
parse_links_udf = udf(
  parse_links,
  returnType=ArrayType(StringType()),
)

In [0]:
# Lower-casing UDF. Null inputs are passed through as null instead of raising
# AttributeError inside the Python worker; the string return type is spelled out.
tolower_udf = udf(lambda x: x.lower() if x is not None else None, StringType())

In [0]:
from pyspark.sql.functions import *

In [0]:
# Lower-cased title plus id for every sampled article.
dataDF1 = PartialWikipediaDF.select(
  lower("title").alias("title"),
  "id",
)

In [0]:
# Lower-cased title plus id for every article of the full dataset
# (used to translate link titles into ids).
title_idDF = wikipediaDF.select(
  lower("title").alias("title"),
  "id",
)

In [0]:
# Bring the whole title/id table to the driver as a pandas DataFrame.
# NOTE(review): this collects one row per article of the full dataset — make sure
# the driver has enough memory for it.
title_idPDF=title_idDF.toPandas()

In [0]:
# For each sampled article keep its title and id, and replace the body by the
# list of link titles extracted from it.
TempForwardDF = PartialWikipediaDF.select(
  "title",
  "id",
  parse_links_udf("text").alias("links"),
)

In [0]:
# Inspect the extracted link titles per article.
display(TempForwardDF)

In [0]:
def titles2id(links, titleidPDF):
  """Translate link titles into the ids of the matching articles.

  Args:
    links: list of link titles to look up.
    titleidPDF: pandas DataFrame with a ``title`` and an ``id`` column.

  Returns:
    A list with the distinct ids of the rows whose ``title`` appears in
    ``links``; an empty list when ``links`` is empty or nothing matches.
  """
  # Nothing to look up: skip the scan over the title table.
  if len(links) == 0:
    return []
  matching_ids = titleidPDF.loc[titleidPDF["title"].isin(links), "id"]
  # Several rows may share a title or an id; keep each id once.
  return list(set(matching_ids.to_list()))

In [0]:
def _links_to_ids(links):
  """Resolve one article's link titles against the driver-side title/id table."""
  return titles2id(links, title_idPDF)

# Spark UDF: array of link titles in, array of (non-null) article ids out.
titles2id_UDF = udf(_links_to_ids, ArrayType(LongType(), False))

In [0]:
# Forward index: article id -> ids of the articles it links to. Marked for caching
# because several later cells read it.
ForwardDF = (
  TempForwardDF
  .select("id", titles2id_UDF("links").alias("links"))
  .cache()
)

In [0]:
# Inspect the forward index (article id and the ids it links to).
display(ForwardDF)

In [0]:
# Number of outgoing links per article (length of its "links" array).
OutgoingsLinksCountersDF = ForwardDF.select(
  "id",
  size("links").alias("counter"),
)

In [0]:
# Inspect the outgoing-link counters.
display(OutgoingsLinksCountersDF)

In [0]:
# One row per (source id, target id) pair: explode flattens each "links" array.
TemporalReverseLinks = ForwardDF.select(
  "id",
  explode("links").alias("t_link"),
)

In [0]:
# Print the first 10 (source id, target id) pairs.
TemporalReverseLinks.show(10)

In [0]:
# Reverse index: target id -> list of the source ids that link to it. Marked for
# caching because it is read again by later cells.
ReverseDF = (
  TemporalReverseLinks
  .groupBy("t_link")
  .agg(collect_list("id").alias("links"))
  .cache()
)

In [0]:
# Inspect the reverse index (target id and the ids linking to it).
display(ReverseDF)

In [0]:
# Driver-side table holding one row per link target; the rank column is added next.
pageRankPDF = (
  ReverseDF
  .select("t_link")
  .toPandas()
)

In [0]:
# Give every link target the same starting rank.
# NOTE(review): PageRank is often initialised with 1/N; 0.85/N is kept as written — confirm the intended start value.
pageRankPDF["PR"]= 0.85/N # N is the total number of documents in the 2015 Wikipedia dataset

In [0]:
# Inspect the initial ranks.
display(pageRankPDF)

In [0]:
def new_pagerank(links, current_pr, counters, damping=0.85, n_docs=None):
  """Compute one PageRank update for a single page.

  Implements ``PR(p) = (1 - d) / N + d * sum(PR(q) / L(q))`` where the sum runs
  over the pages ``q`` that link to ``p`` and ``L(q)`` is the number of outgoing
  links of ``q``.

  Args:
    links: ids of the pages that link to the page being scored. ``None`` or an
      empty list means the page has no incoming links.
    current_pr: mapping with a ``.get`` method (e.g. a dict) from page id to
      its current PageRank. Pages missing from it are given the base rank
      ``(1 - d) / N``.
    counters: mapping with a ``.get`` method from page id to its number of
      outgoing links. Sources with a missing or zero count are skipped, since
      their contribution cannot be computed.
    damping: damping factor ``d``.
    n_docs: total number of documents ``N``. When omitted, the notebook-level
      variable ``N`` is used.

  Returns:
    The new PageRank as a built-in ``float`` (not a numpy scalar), so it can be
    returned from a Spark UDF declared as DoubleType.
  """
  total_docs = N if n_docs is None else n_docs
  base_rank = (1.0 - damping) / total_docs
  incoming = 0.0
  for source in (links or []):
    out_degree = counters.get(source)
    if not out_degree:
      continue
    incoming += current_pr.get(source, base_rank) / out_degree
  return float(base_rank + damping * incoming)

In [0]:
# Current ranks as a plain dict (page id -> PR), so new_pagerank can look up a
# source page's rank by id.
current_ranks = pageRankPDF.set_index("t_link")["PR"].to_dict()

# Python 3 no longer accepts tuple parameters such as `lambda (x, y): ...`, so the
# two columns are taken as two ordinary arguments: x is the list of incoming
# source ids and y is forwarded unchanged as the `counters` argument.
new_pagerank_udf = udf(lambda x, y: new_pagerank(x, current_ranks, y), DoubleType())

In [0]:
# WHILE LOOP WITH SEVERAL EXIT CONDITIONS
# Repeat the PageRank update until either
#   1. MAX_ITERATIONS passes have been made, or
#   2. the ranks have (almost) stopped moving: the summed absolute change of a
#      pass, relative to the summed previous ranks, is at most RELATIVE_TOLERANCE.
MAX_ITERATIONS = 20
RELATIVE_TOLERANCE = 1e-4

# Outgoing-link count per source article, collected to the driver as a dict
# (one entry per row of OutgoingsLinksCountersDF) so the UDF can look it up by id.
outgoing_counts = {row["id"]: row["counter"] for row in OutgoingsLinksCountersDF.collect()}

# Start from the initial ranks without modifying the frame displayed earlier.
pagerankPDF = pageRankPDF.copy()
iteration = 0
relative_change = float("inf")

while iteration < MAX_ITERATIONS and relative_change > RELATIVE_TOLERANCE:
  # Ranks of the previous pass as an id -> PR dict for lookups inside the UDF.
  current_ranks = pagerankPDF.set_index("t_link")["PR"].to_dict()
  new_pagerank_udf = udf(lambda x: new_pagerank(x, current_ranks, outgoing_counts), DoubleType())
  NewPageRankDF = ReverseDF.select(ReverseDF["t_link"], new_pagerank_udf(ReverseDF["links"]).alias("PR"))
  updatedPDF = NewPageRankDF.toPandas()

  # Compare old and new ranks page by page (both series are indexed by t_link).
  previous_ranks = pagerankPDF.set_index("t_link")["PR"]
  updated_ranks = updatedPDF.set_index("t_link")["PR"]
  previous_total = previous_ranks.abs().sum()
  if previous_total:
    relative_change = float((updated_ranks - previous_ranks).abs().sum() / previous_total)
  else:
    # No ranks to compare (empty table): nothing can change, stop iterating.
    relative_change = 0.0

  pagerankPDF = updatedPDF
  iteration += 1