# Calculating Wikipedia page ranks by pageviews

## Define the main functions

In [6]:
# extract pageviews for each title

def to_key_value(line):
    """Parse one dump line into a ``(title, pageviews)`` pair.

    The line is split on whitespace. The first field is taken as the title
    and the second is converted to ``int``; any further fields are ignored.
    """
    fields = line.split()
    title = fields[0]
    pageviews = int(fields[1])
    return (title, pageviews)

# sum title pageviews

def sum_pageviews(item):
    """Return ``(title, total)`` for an item shaped ``(title, view_counts)``.

    Falsy entries in ``view_counts`` (for example ``None``) count as 0.
    """
    total = sum(count or 0 for count in item[1])
    title = item[0]
    return (title, total)

# reduce_by_key wikipedia dump

def reduce_by_key(dump):
    """Return the total pageviews per title for a dump of text lines.

    Every line of ``dump`` is parsed with ``to_key_value`` and the resulting
    ``(title, pageviews)`` pairs are summed per title with ``reduceByKey``.
    """
    title_views = dump.map(to_key_value)
    return title_views.reduceByKey(lambda left, right: left + right)
            
# replace missing (None) view counts with 0

def none_to_zero(x):
    """Flatten ``(title, (year, days90))`` into ``(title, days90, year)``.

    A falsy count (such as the ``None`` that a full outer join leaves for a
    missing side) is replaced by 0.
    """
    counts = x[1]
    year = counts[0] or 0
    days90 = counts[1] or 0
    return (x[0], days90, year)

# format the fields as one space-separated line

def to_line(data):
    """Join the items of ``data`` into a single space-separated line.

    Each item is converted to text with ``unicode`` before joining.
    """
    fields = [unicode(value) for value in data]
    return ' '.join(fields)

# calculate the rank of title

def rank(x):
    """Format one indexed record as an output line carrying its rank.

    ``x`` is ``((title, views_90days, views_year), index)``. The rank is
    ``index + 1`` of the first record seen with a given yearly view count;
    following records with the same count reuse that rank.

    The function keeps its state in the module-level ``prev`` dict
    (keys ``'views_year'`` and ``'rank'``), so it only produces meaningful
    ranks when it is called sequentially on records in which equal yearly
    counts are adjacent.

    Returns the line built by ``to_line`` from title, 90-day views, yearly
    views and rank.
    """
    global prev
    record = x[0]
    index = x[1]
    views_year = record[2]
    # Check for "nothing seen yet" explicitly. A truthiness test would treat
    # a stored yearly count of 0 as unset and give every zero-view title its
    # own rank instead of a shared one.
    if prev['rank'] is None or prev['views_year'] != views_year:
        prev['views_year'] = views_year
        prev['rank'] = index + 1
    return to_line([record[0], record[1], prev['views_year'], prev['rank']])



## Read the files

In [4]:
# We have scraped the needed files from Wikipedia to 2 different folders: 
# 1. For all year (10.2014 - 09.2015)
# 2. For last 90 days (07.2015 - 09.2015)

# Read all files from the dump folder: every file for the yearly data set,
# and only the files matching 20150[7-9] for the 90-day data set.
wikipedia_dump_year, wikipedia_dump_90days = (
    sc.textFile("/vagrant/awo128/dump_month/*"),
    sc.textFile("/vagrant/awo128/dump_month/20150[7-9]"),
)

## Calculating the number of pageviews

In [7]:
# Full outer join of the per-title totals: each element becomes
# (title, (views_year, views_90days)), with None where one side has no entry.

titles_joined = (
    reduce_by_key(wikipedia_dump_year)
    .fullOuterJoin(reduce_by_key(wikipedia_dump_90days))
)
print(titles_joined.take(100))

[(u'The_Deep_End', (2, 1)), (u'Diogo_C%C3%A3o', (4, 1)), (u'Deh_Now-e_Qalandaran', (1, None)), (u'File:Ambert.jpg', (1, 1)), (u'Randy_Adams', (2, None)), (u'The_Heaven_Sword_and_Dragon_Saber', (37, 18)), (u'Roy_Scammell', (2, 2)), (u'File:Wavelets_-_DWT.png', (1, 1)), (u'Amazon.co.UK', (1, None)), (u'Hispanic_(U.S._Census)', (14, 7)), (u"Special:WhatLinksHere/File:Gill's_Dictionary_of_the_Chinook_Jargon_01.jpg", (1, 1)), (u'Freeze_tag/Freeze_tag', (1, None)), (u'A_little_knowledge_is_a_dangerous_thing', (1, None)), (u'Sara_Steelman', (1, 1)), (u'File:Krispy_Kreme_glazed_donut.JPG', (1, 1)), (u'97.1_WXYT', (1, 1)), (u'Talk:Pump-action', (1, 1)), (u'ZiU-9', (1, 1)), (u'File:Personality_systematics.jpg', (1, 1)), (u'naaktje', (1, 1)), (u'Modes_of_a_linear_field', (1, None)), (u'Talk:Kiril_Vajarov', (1, None)), (u'frou-frou', (3, 1)), (u'Dharmesh_Yelande', (172, 154)), (u'xenodochial', (1, 1)), (u'Huasteco_de_oriente', (1, 1)), (u'Prince_William_of_Denmark', (1, 1)), (u'Talk:Fencing_at_the

## Calculating the page rank

In [9]:
# Spot-check the joined counts for the New York, David Beckham and
# Cristiano Ronaldo titles.
test_titles = titles_joined.filter(
    lambda item: item[0] in ('David_Beckham', 'New_York', 'Cristiano_Ronaldo')
)
print(test_titles.take(1000))

[(u'New_York', (551, 208)), (u'David_Beckham', (715, 280)), (u'Cristiano_Ronaldo', (1614, 556))]


In [10]:
# Order the titles by yearly views, highest first, and pair every record with
# its position in that order. Titles with equal views only receive a common
# rank later, when rank() is applied.

sorted_titles = (
    titles_joined
    .map(none_to_zero)
    .sortBy(lambda record: -record[2])
    .zipWithIndex()
)
print(sorted_titles.take(50))

[((u'en', 3907536, 8036991), 0), ((u'Main_Page', 719783, 1839427), 1), ((u'Novak_Djokovic', 166, 91255), 2), ((u'Oscar_Wilde', 92, 53747), 3), ((u'Special:Search', 21828, 43911), 4), ((u'Andy_Murray', 66, 40902), 5), ((u'Special:HideBanners', 36648, 37381), 6), ((u'Angelsberg', 25832, 32311), 7), ((u'Australian_Open', 30, 23283), 8), ((u'Roger_Federer', 105, 21228), 9), ((u'Rafael_Nadal', 145, 18173), 10), ((u'List_of_Grand_Slam_men%27s_singles_champions', 20, 13998), 11), ((u'United_Passions', 13030, 13037), 12), ((u'Chris_Kyle', 3931, 12631), 13), ((u'Ruby_Rose', 11328, 11395), 14), ((u'2015_Australian_Open', 21, 11324), 15), ((u'Jurassic_World', 10845, 11165), 16), ((u'Boris_Becker', 24, 10837), 17), ((u'Langston_Hughes', 31, 8589), 18), ((u'Am%C3%A9lie_Mauresmo', 10, 8227), 19), ((u'List_of_Australian_Open_men%27s_singles_champions', 8, 7599), 20), ((u'Orange_Is_the_New_Black', 6477, 7023), 21), ((u'2010_Sharm_el-Sheikh_shark_attacks', 6817, 6820), 22), ((u'Mortal_Kombat', 6530, 66

## Save results

In [11]:
# Calculate the ranks and save the results.
#
# rank() remembers the previous record in the module-level `prev` dict, so it
# only yields correct ranks when one task sees every record in index order.
# Each Spark task works on its own copy of `prev`: mapping rank() over several
# partitions restarts the tie tracking at every partition boundary, and a
# following coalesce(1, shuffle=True) sends the lines through a shuffle that
# does not promise to keep them in sorted order.
# Sorting by the zipWithIndex position into a single partition first makes
# rank() run sequentially over the ordered records and writes one ordered file.

prev = {'views_year': None, 'rank': None}
(
    sorted_titles
    .sortBy(lambda x: x[1], numPartitions=1)
    .map(rank)
    .saveAsTextFile('/vagrant/awo128/Pages_final.csv')
)