In [None]:
'''
The purpose of this script is to collect basic properties of an article.
'''

In [None]:
import requests
import pandas as pd
import numpy as np
import wikipedia as wiki
import datetime as dt
from datetime import datetime
import requests
import os
import math
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#Specify Article
# These globals are read by the cells below: `lang` selects the Wikipedia
# language subdomain used by query(), `title` is sent as the `titles`
# parameter, and `wikiID` is the key used to index the `pages` mapping of
# each API response.
lang= 'en'
title = 'World War II'
wikiID = "32927" #from Page info of article, e.g. https://en.wikipedia.org/w/index.php?title=World_War_II&action=info

In [None]:
def chunks(l, n):
    """Yield consecutive slices of ``l`` that each hold at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [None]:
def query(request):
    """Run a MediaWiki ``action=query`` request and yield every result batch.

    Requests go to the api.php endpoint of the Wikipedia edition named by the
    module-level ``lang`` variable. The ``continue`` values returned by the
    API are merged into the next request until a response has none.

    Parameters
    ----------
    request : dict
        Query parameters such as ``titles`` and ``prop``. The dict is copied,
        so the caller's object is not modified.

    Yields
    ------
    dict
        The ``query`` member of each response that contains one.

    Raises
    ------
    RuntimeError
        If a response contains an ``error`` member.
    """
    # Work on a copy: the original wrote 'action'/'format' into the caller's dict.
    base_params = dict(request)
    base_params['action'] = 'query'
    base_params['format'] = 'json'

    last_continue = {}
    while True:
        req = dict(base_params)
        req.update(last_continue)
        result = requests.get('https://%s.wikipedia.org/w/api.php' % lang, params=req).json()
        if 'error' in result:
            # `Error` was never defined, so the old code failed with a
            # NameError and hid the API's own error payload.
            raise RuntimeError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            yield result['query']
        if 'continue' not in result:
            break
        last_continue = result['continue']

In [None]:
#GET WIKIDATA ID
# Request the page props of the article and print the Wikidata item stored
# under 'wikibase_item' for the page keyed by wikiID.
for batch in query({'titles': title, 'prop': 'pageprops'}):
    page = batch['pages'][wikiID]
    ID = page['pageprops']['wikibase_item']
    print(ID)

In [None]:
#PageLength
# prop=info reports `length` as the size of the page in bytes, not a word
# count, so the printed unit is 'bytes'.
for page_info in query({'titles': title, 'prop': 'info'}):
    LENGTH = page_info['pages'][wikiID]['length']
    print('The article has',LENGTH,'bytes.')

In [None]:
#GET INTERNAL OUT LINKS
# Fetch every result batch first, then flatten the link titles of the page
# keyed by wikiID into one list.
link_batches = list(query({'titles': title, 'prop': 'links', 'pllimit':'max','plnamespace':'0', 'redirects':''}))
links_list = [
    link['title']
    for batch in link_batches
    for link in batch['pages'][wikiID]['links']
]

links_df = pd.DataFrame(links_list, columns=['Article'])
print('There are ',len(links_df.index),'links.\n')

In [None]:
#GET EXTERNAL OUT LINKS
# Each 'extlinks' entry stores its URL under the '*' key.
extlink_batches = list(query({'titles': title, 'prop': 'extlinks', 'ellimit':'max'}))
exlinks_list = [
    entry['*']
    for batch in extlink_batches
    for entry in batch['pages'][wikiID]['extlinks']
]

exlinks_df = pd.DataFrame(exlinks_list, columns=['Ext Link'])
print('There are ',len(exlinks_df.index),'external links.\n')

In [None]:
#GET COUNT OF LANGUAGE LINKS
# Collect the language code of every language link on the article.
llinks_list = []

for batch in query({'titles': title, 'prop': 'langlinks', 'lllimit':'max'}):
    langlinks = batch['pages'][wikiID]['langlinks']
    llinks_list.extend(entry['lang'] for entry in langlinks)

llinks_df = pd.DataFrame(llinks_list, columns=['Lang Link'])
print(len(llinks_df.index),'other language editions have articles about this concept.\n')

In [None]:
#AGE
# Ask the API for the oldest revision only (rvdir='newer' lists revisions
# oldest-first) instead of downloading the timestamp of every revision and
# keeping whichever one happened to be processed last.
years = None
for batch in query({'titles': title, 'prop': 'revisions', 'rvprop': 'timestamp',
                    'rvlimit': '1', 'rvdir': 'newer'}):
    first = batch['pages'][wikiID]['revisions'][0]['timestamp']
    # The result is not stored as `dt`, which would shadow the
    # `import datetime as dt` module alias.
    created = datetime.strptime(first, "%Y-%m-%dT%H:%M:%S%z")
    # Compare timezone-aware values so that local time is not subtracted
    # from the timestamp's own timezone.
    delta = datetime.now(created.tzinfo) - created
    years = delta.days/365
    # Stop here: continuing the generator would page through the rest of the
    # revision history one entry at a time.
    break

print(years)

In [None]:
#IMAGES
# Collect the title of every image entry listed for the article.
images_list = []

for batch in query({'titles': title, 'prop': 'images', 'imlimit':'max'}):
    image_entries = batch['pages'][wikiID]['images']
    images_list.extend(entry['title'] for entry in image_entries)

images_df = pd.DataFrame(images_list, columns=['Images'])
print('There are',len(images_df.index),'images in this article.\n')

In [None]:
#EDITORS
# Collect the named contributors and the count of anonymous contributors.
editor_list = []
anoneds = 0  # defined up front so the summary below works even with no batches

for batch in query({'titles': title, 'prop': 'contributors', 'pclimit':'max'}):
    page = batch['pages'][wikiID]
    # Read both keys defensively: a batch that lacks one of them keeps the
    # previous anonymous count / adds no names instead of raising KeyError.
    anoneds = page.get('anoncontributors', anoneds)
    editor_list.extend(ed['name'] for ed in page.get('contributors', []))

editor_df = pd.DataFrame(editor_list, columns=['Username'])
edscount = len(editor_df)
print('There are ',edscount,'editors.\n')
print('There are', anoneds,'anonymous editors.\n')
totaleds = anoneds+edscount
print('In total,', totaleds, 'editors have edited this article.\n')
# Avoid ZeroDivisionError when no editors were returned at all.
propanon = anoneds/totaleds if totaleds else 0.0
print('Proportion of anonymous editors:', propanon)

In [None]:
#Count edits per editor
# One entry per revision whose author is visible. Revisions whose username
# has been hidden carry no 'user' key; they are counted separately so they
# still contribute to the total number of edits without raising KeyError.
rev_list = []
hidden_user_edits = 0
for batch in query({'titles': title, 'prop': 'revisions', 'rvprop':'user','rvlimit':'max'}):
    for rev in batch['pages'][wikiID]['revisions']:
        if 'user' in rev:
            rev_list.append(rev['user'])
        else:
            hidden_user_edits += 1

rev_df = pd.DataFrame(rev_list, columns=['Editor'])
totaledits = len(rev_df.index) + hidden_user_edits
# totaleds comes from the EDITORS cell; guard against it being zero.
meanedits = round(totaledits/totaleds) if totaleds else 0

print('This article has been edited',totaledits,'times.\n')
print('On average, this article was edited', meanedits,'times per editor.\n')

In [None]:
# GET SUMMARY STATISTICS ABOUT EDITORS
# Count the revisions attributed to each editor, order the counts from
# largest to smallest, and show descriptive statistics of those counts.
edits_by_editor = rev_df['Editor'].value_counts()
eds_vc = edits_by_editor.sort_values(ascending=False)
eds_vc.describe()

In [None]:
# SHOW MOST ACTIVE EDITORS
# eds_vc is sorted in descending order of edit count, so the first 30 rows
# are the editors with the most edits.
display(eds_vc.head(30))

In [None]:
# VISUALISE DISTRIBUTION OF EDITS PER EDITOR
# Histogram of edits per editor with a logarithmic y axis, written to
# 'editspereditor.png'.
sns.set()
fig, ax = plt.subplots()
hist_ax = sns.distplot(eds_vc, kde=False, rug=False)
hist_ax.set_title('Edits per Editor')
plt.gca().set_ylabel("Count of Editors")
plt.gca().set_xlabel("Count of Edits")
plt.gcf().tight_layout()
#ax.set_xscale('log')
ax.set_yscale('log')
plt.gcf().savefig('editspereditor.png')

In [None]:
# PRODUCE EDIT HISTORY DATAFRAME
# rev_list holds editor names (plain strings), so it cannot supply revision
# dates; fetch the revision timestamps for the current article here.
rev_timestamps = []
for batch in query({'titles': title, 'prop': 'revisions', 'rvprop':'timestamp','rvlimit':'max'}):
    rev_timestamps.extend(batch['pages'][wikiID]['revisions'])

# PRODUCE DATAFRAME OF EDITS PER YEAR
editZ = []
for rev in rev_timestamps:
    stamp = datetime.strptime(rev['timestamp'], "%Y-%m-%dT%H:%M:%S%z")
    editZ.append(stamp.strftime('%Y'))

df = pd.Series(editZ).value_counts()

# PRODUCE CUMULATIVE VALUES
df.index = df.index.astype(int)
df = df.fillna(0)
df = df.to_frame()
df.index.name = 'Date'
df.columns = ['Edits EN']
# Chronological order is required for the running total below.
df = df.sort_index()
df['Cum EN'] = df['Edits EN'].cumsum(axis = 0)

# NORMALISE DATA
# Dividing by the total number of edits turns both columns into proportions.
counts =  df['Edits EN'].sum()
df_norm = df / counts

# NOW WE HAVE 2 DATAFRAMES OF EDITS PER YEAR: df AND df_norm
display(df)
#display(df_norm)

In [None]:
# NOW LET'S COMPARE ARTICLES IN 2 LANGUAGE EDITIONS
# LET'S COMPARE (1) EDIT HISTORIES & (2) HYPERLINKS

In [None]:
# German WW2 Article:
# Rebinds the globals read by query() and the cells below: `lang` changes the
# Wikipedia subdomain that is queried, `title` the requested article, and
# `wikiID` the key used to index the `pages` mapping of the responses.
lang = 'de'
title = 'Zweiter Weltkrieg'
wikiID = '5767'

In [None]:
def query(request):
    """Run a MediaWiki ``action=query`` request and yield every result batch.

    Requests go to the api.php endpoint of the Wikipedia edition named by the
    module-level ``lang`` variable. The ``continue`` values returned by the
    API are merged into the next request until a response has none.

    Parameters
    ----------
    request : dict
        Query parameters such as ``titles`` and ``prop``. The dict is copied,
        so the caller's object is not modified.

    Yields
    ------
    dict
        The ``query`` member of each response that contains one.

    Raises
    ------
    RuntimeError
        If a response contains an ``error`` member.
    """
    # Work on a copy: the original wrote 'action'/'format' into the caller's dict.
    base_params = dict(request)
    base_params['action'] = 'query'
    base_params['format'] = 'json'

    last_continue = {}
    while True:
        req = dict(base_params)
        req.update(last_continue)
        result = requests.get('https://%s.wikipedia.org/w/api.php' % lang, params=req).json()
        if 'error' in result:
            # `Error` was never defined, so the old code failed with a
            # NameError and hid the API's own error payload.
            raise RuntimeError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            yield result['query']
        if 'continue' not in result:
            break
        last_continue = result['continue']

In [None]:
# GET EDIT HISTORY OF SECOND ARTICLE AND CALL IT rev_list2
rev_list2 = []
editZ2 = []
for batch in query({'titles': title, 'prop': 'revisions', 'rvprop':'timestamp','rvlimit':'max'}):
    # The page ID of the second article is held in `wikiID`; the previously
    # used name `wikiID2` is never assigned and raised NameError.
    rev_list2.extend(batch['pages'][wikiID]['revisions'])

# PRODUCE DATAFRAME OF EDITS PER YEAR
for rev in rev_list2:
    stamp = datetime.strptime(rev['timestamp'], "%Y-%m-%dT%H:%M:%S%z")
    editZ2.append(stamp.strftime('%Y'))
df2 = pd.Series(editZ2).value_counts()

# PRODUCE CUMULATIVE VALUES
df2.index = df2.index.astype(int)
df2 = df2.fillna(0)
df2 = df2.to_frame()
df2.index.name = 'Date'
df2.columns = ['Edits DE']
# Chronological order is required for the running total below.
df2 = df2.sort_index()
df2['Cum DE'] = df2['Edits DE'].cumsum(axis = 0)

# NORMALISE DATA
# Dividing by the total number of edits turns both columns into proportions.
counts2 =  df2['Edits DE'].sum()
df2_norm = df2 / counts2

# NOW WE HAVE 2 DATAFRAMES OF EDITS PER YEAR: df2 AND df2_norm
display(df2)
#display(df2_norm)

In [None]:
# MERGE NORMALISED EDIT HISTORIES
# Both frames are indexed by 'Date' (the year), so join them index-to-index.
# Combining on= with left_index= is rejected by pandas with a MergeError.
# The merge is done once and split into the two views needed below.
merged_norm = pd.merge(df_norm, df2_norm, how='inner', left_index=True, right_index=True)

# Per-year proportions only.
edit_df = merged_norm.drop(columns=['Cum EN','Cum DE'])

# Cumulative proportions only.
cum_df = merged_norm.drop(columns=['Edits EN','Edits DE'])

In [None]:
# VISUALISE EDIT HISTORIES
sns.set()

# Draw on a dedicated figure. Previously `ax` still pointed at the Axes of
# the edits-per-editor histogram, so the legend and tick settings were
# applied to that older figure instead of this plot.
fig, ax = plt.subplots()
edit_df.plot(ax=ax)  # pandas adds the axes legend itself
ax.set_xlabel('Date')
# Offer one tick per year covered by the data rather than a fixed
# 2001-2019 range; locator_params then caps how many are shown.
if len(edit_df.index) > 0:
    ax.xaxis.set_ticks(np.arange(edit_df.index.min(), edit_df.index.max() + 1))
ax.locator_params(axis='x', nbins=15)
ax.set_ylabel('Proportion of Edits')
ax.set_title('Edit History of Articles')
fig.savefig('edit_history.png')
plt.show()

In [None]:
# VISUALISE CUMULATIVE EDIT HISTORIES
# Draw on a dedicated figure. Previously `ax` referred to an Axes created by
# an earlier cell, so the legend and tick settings never reached this plot.
fig, ax = plt.subplots()
cum_df.plot(ax=ax)  # pandas adds the axes legend itself
ax.set_xlabel('Date')
# Offer one tick per year covered by the data rather than a fixed
# 2001-2019 range; locator_params then caps how many are shown.
if len(cum_df.index) > 0:
    ax.xaxis.set_ticks(np.arange(cum_df.index.min(), cum_df.index.max() + 1))
ax.locator_params(axis='x', nbins=15)
ax.set_ylabel('Proportion of Edits')
ax.set_title('Cumulative Edit History of Articles')
fig.savefig('cum_edithistory.png')
plt.show()

In [None]:
'''
Now that we have compared the edit histories, let's compare the link similarity of the articles.
To do this, we need to:
(1) collect the hyperlinks
(2) collect WikiData IDs for hyperlinks
(3) compute cosine similarity of IDs
'''

In [None]:
# First reintroduce article specification:

#Specify Article
lang= 'en'
title = 'World War II'
wikiID = "32927" #from Page info of article, e.g. https://en.wikipedia.org/w/index.php?title=World_War_II&action=info

#GET LINKS in LANG1
# Fetch all batches, then flatten the link titles into a single list.
link_batches = list(query({'titles': title, 'prop': 'links', 'pllimit':'max','plnamespace':'0', 'redirects':''}))
links_list = [
    link['title']
    for batch in link_batches
    for link in batch['pages'][wikiID]['links']
]

links_df = pd.DataFrame(links_list, columns=['Article'])

#GET IDS OF LINKS
# Look the titles up 50 at a time and keep the Wikidata item of every page
# that reports page props.
ids = {}
for start in range(0, len(links_list), 50):
    title_batch = links_list[start:start + 50]
    for result in query({'titles':'|'.join(title_batch), 'prop':'pageprops','ppprop':'wikibase_item'}):
        ids.update(
            (page['title'], page['pageprops']['wikibase_item'])
            for page in result['pages'].values()
            if 'pageprops' in page
        )

#Make Dataframe of Articles and IDs
ids_df = (
    pd.DataFrame.from_dict(ids, orient='index', columns=['WikiData ID'])
    .reset_index()
)
ids_df.columns = [lang, 'WikiData ID']
display(ids_df.head())

#Get link summaries
# From here on `ids` and `links` hold row counts rather than collections.
ids = len(ids_df.index)
links = len(links_df.index)
print('In total there are', links,'links in the', lang ,'article. Of these,', ids, 'are unique links.')

In [None]:
# Now specify German article again
lang = 'de'
title = 'Zweiter Weltkrieg'
wikiID = '5767'

# Collect the titles of all links on the German article. The page ID and
# language are read from `wikiID` and `lang` set just above; the names
# `wikiID2` and `lang2` used before are never assigned and raised NameError.
links_list2 = []

for batch in query({'titles': title, 'prop': 'links', 'pllimit':'max','plnamespace':'0', 'redirects':''}):
    links_list2.extend(link['title'] for link in batch['pages'][wikiID]['links'])

links_df2 = pd.DataFrame(links_list2, columns=['Article'])

#GET IDS OF LINKS
Articles2 = list(links_df2['Article'])

# Look the titles up 50 at a time and keep the Wikidata item of every page
# that reports page props.
ids2 = {}
for title_batch in chunks(Articles2, 50):
    for result in query({'titles':'|'.join(title_batch), 'prop':'pageprops','ppprop':'wikibase_item'}):
        for page in result['pages'].values():
            if 'pageprops' in page:
                ids2[page['title']] = page['pageprops']['wikibase_item']

#Make Dataframe of Articles and IDs
ids_df2 = pd.DataFrame.from_dict(ids2,orient='index', columns=['WikiData ID'])
ids_df2 = ids_df2.reset_index()
ids_df2.columns = [lang, 'WikiData ID']
display(ids_df2.head())

#Get link summaries
# From here on `ids2` holds a row count rather than the title -> ID mapping.
ids2 = len(ids_df2.index)
links2 = len(links_df2.index)
print('In total there are', links2,'links in the',lang,'article. Of these,', ids2, 'are unique links.')

In [None]:
#Merge Dataframes and get link overlap
# Inner merge keeps only the Wikidata IDs linked from both articles.
mergeddf = pd.merge(ids_df, ids_df2, on='WikiData ID')
mergeddf = mergeddf.set_index(['WikiData ID'])
common = len(mergeddf)
display(mergeddf.head())

# Links present in only one article. The IDs are compared against the merged
# index: DataFrame.isin(DataFrame) matches on index and column labels, which
# never line up here, and the second filter used to mask ids_df with a
# condition computed from ids_df2.
unmerged = ids_df[~ids_df['WikiData ID'].isin(mergeddf.index)]
unmerged2 = ids_df2[~ids_df2['WikiData ID'].isin(mergeddf.index)]

# Guard against an article with no resolvable links (division by zero).
overlap = "{0:.00%}".format(common / ids if ids else 0)
overlap2 = "{0:.00%}".format(common / ids2 if ids2 else 0)

print(overlap,'of unique links in the EN article overlap with unqiue links in the DE article.')
print(overlap2,'of unique links in the DE article overlap with unqiue links in the EN article.\n')

In [None]:
#Make Dataframe for Cosine Similarity
# Work on copies so ids_df / ids_df2 stay untouched.
EN = ids_df.copy()
DE = ids_df2.copy()
EN['In_DE'] = EN['WikiData ID'].isin(DE['WikiData ID'])
DE['In_EN'] = DE['WikiData ID'].isin(EN['WikiData ID'])
EN, DE = EN.drop(columns =['en']), DE.drop(columns =['de'])

#Combine Dataframes
# One row per distinct Wikidata ID found in either article.
COMB = (
    pd.concat([EN, DE], ignore_index = True)
    .drop(columns=['In_EN','In_DE'])
    .drop_duplicates()
)

#Map values
# 1/0 membership flags: is the ID linked from the EN / DE article?
values = set(EN['WikiData ID'])
values2 = set(DE['WikiData ID'])
COMB = COMB.assign(
    EN=COMB['WikiData ID'].isin(values).astype(int),
    DE=COMB['WikiData ID'].isin(values2).astype(int),
)

#Print Values
for flag_column in ('EN', 'DE'):
    print(COMB[flag_column].value_counts())
#display(COMB)

#vectorise df columns
# cosine_similarity expects 2-D input, so each flag column becomes one row.
cs_en = COMB['EN'].to_numpy().reshape(1,-1)
cs_de = COMB['DE'].to_numpy().reshape(1,-1)

#get cosine simalarity of links
cosine_similarity(cs_en,cs_de)