In [1]:
import pandas as pd
import numpy as np
import wikipedia as wiki
import networkx as nx
import requests
import os
import math
import datetime as dt
from collections import Counter
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Wikipedia language edition to work on. The value is interpolated into the
# API host name and into the names of the edge-list and attribute CSV files.
lang = 'en'
#lang = 'de'

In [None]:
#Get nodes from network
#Import Edgelists
# NOTE(review): 'path' looks like a placeholder; point it at the directory
# that holds '<lang>_edgelist.csv' before running this cell.
os.chdir('path')
DF = pd.read_csv('%s_edgelist.csv' % lang)

#Produce NETWORK
# from_pandas_edgelist is called with its default column names, so the CSV
# is expected to provide 'source' and 'target' columns.
NET = nx.from_pandas_edgelist(DF, create_using=nx.DiGraph())
NET.name = "HYPERLINK NETWORK"
# nx.info() was removed in NetworkX 3.0; build the summary from graph
# methods that exist in every NetworkX version instead.
print('%s: %d nodes, %d edges' % (NET.name, NET.number_of_nodes(), NET.number_of_edges()))

# One row per node; the article title is kept both as index and as a column.
network_df = pd.DataFrame(index=list(NET.nodes))
network_df.index.name = 'Article'
network_df['Article'] = network_df.index

In [3]:
#create list of articles
# Article titles (one per network node) become the index of the attribute
# table; 'Length' starts out as an empty column.
Articles = [title for title in network_df['Article']]
df = pd.DataFrame(columns=['Length'], index=Articles)
display(df)

Unnamed: 0,Length
Zweiter Weltkrieg,
Erster Weltkrieg,
Kalter Krieg,


In [3]:
#Define batching helper
def chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    The slices are produced lazily and in order; the last one may be shorter
    than ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))
            
def query(request):
    """Yield the 'query' payload of every response page for an API request.

    The request is sent to ``https://<lang>.wikipedia.org/w/api.php`` (``lang``
    is read from the notebook globals) with ``action=query`` and
    ``format=json`` added. As long as a response contains a 'continue' entry,
    its contents are merged into the next request and another call is made.

    Parameters
    ----------
    request : dict
        API parameters (e.g. 'titles', 'prop'). The dict is not modified.

    Yields
    ------
    dict
        ``result['query']`` of each response that has a 'query' key.

    Raises
    ------
    RuntimeError
        If a response contains an 'error' entry; the entry is the message.
    """
    # Build the base request on a copy so the caller's dict stays untouched.
    base = dict(request, action='query', format='json')
    lastContinue = {}
    while True:
        req = base.copy()
        req.update(lastContinue)
        result = requests.get('https://%s.wikipedia.org/w/api.php' % lang, params=req).json()
        if 'error' in result:
            # Previously raised the undefined name `Error`, which surfaced as
            # a NameError and hid the actual API error.
            raise RuntimeError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            yield result['query']
        if 'continue' not in result:
            break
        lastContinue = result['continue']

In [None]:
#get page id
# Titles are requested in batches of 50. Each returned page that carries a
# 'pageid' is written into the 'page' column under the title the API reports
# (a title that is not yet in the index is added as a new row by .loc).
for title_batch in chunks(Articles, 50):
    info_request = {'titles': '|'.join(title_batch), 'prop': 'info'}
    for payload in query(info_request):
        for page in payload['pages'].values():
            if 'pageid' not in page.keys():
                continue
            df.loc[page['title'], 'page'] = page['pageid']

In [6]:
#Age of Article
# One title per request. Every response page overwrites the stored timestamp
# with that of the last revision in its list, so after all continuation pages
# have been read the entry holds the last revision of the last page.
# NOTE(review): this yields the article's first revision only if the API
# returns revisions newest-first (its default ordering) - confirm.
first_timestamp = {}
for title_batch in chunks(Articles, 1):
    revision_request = {'titles': '|'.join(title_batch), 'prop': 'revisions',
                        'rvprop': 'timestamp', 'rvlimit': 'max'}
    for payload in query(revision_request):
        for page in payload['pages'].values():
            if 'revisions' in page:
                # Read the timestamp by key instead of taking whichever
                # value happens to come last in the revision dict.
                first_timestamp[page['title']] = page['revisions'][-1]['timestamp']

# Age in years (days / 365). "now" is taken once, in UTC, so that it is
# comparable with the timezone-aware timestamps parsed via %z. The `dt` alias
# from the import cell is used rather than a `datetime` name leaked in by
# %pylab.
now_utc = dt.datetime.now(dt.timezone.utc)
age = {}
for title, stamp in first_timestamp.items():
    created = dt.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S%z")
    age[title] = (now_utc - created).days / 365

# Map once, after the dict is complete (previously re-run on every iteration).
df['Age'] = df.index.to_series().map(age)

In [None]:
# Title of the matching article in another language edition, per article.
translations = {}

for title_batch in chunks(Articles, 50):
    tparams = {
        'titles': '|'.join(title_batch),
        'prop': 'langlinks',
        # NOTE(review): this is the language to link *to*; when `lang` is
        # 'en' it presumably has to be switched to 'de' - confirm.
        'lllang': 'en',  #or 'de'
    }

    # query() accepts only the request dict (the host language comes from the
    # global `lang`); passing `lang` as a second argument raised TypeError.
    for payload in query(tparams):
        for page in payload['pages'].values():
            if 'langlinks' in page:
                translations[page['title']] = page['langlinks'][0]['*']

network_df['Translation'] = network_df['Article'].map(translations)
# Articles without a language link, kept under a name for inspection (the
# original bare expression in the middle of the cell had no effect).
untranslated = network_df.loc[network_df['Translation'].isna(), ['Article', 'Translation']]

network_df['Linked EN'] = network_df['Translation'].notna()
print(network_df['Linked EN'].value_counts(normalize=True))

In [None]:
#GET Length
# Same 'info' request as used for the page ids, in batches of 50 titles;
# here the 'length' field of each returned page fills the 'Length' column.
for title_batch in chunks(Articles, 50):
    joined_titles = '|'.join(title_batch)
    for payload in query({'titles': joined_titles, 'prop': 'info'}):
        pages_with_length = [p for p in payload['pages'].values() if 'length' in p.keys()]
        for page in pages_with_length:
            df.loc[page['title'], 'Length'] = page['length']

In [None]:
#EDITORS
# ed_dic:   title -> list of contributor records, later reduced to a count
# anon_dic: title -> value of the page's 'anoncontributors' field
ed_dic = {}
anon_dic = {}

for title_batch in chunks(Articles, 50):
    contributor_request = {'titles': '|'.join(title_batch), 'prop': 'contributors', 'pclimit': 'max'}
    for payload in query(contributor_request):
        for page in payload['pages'].values():
            if 'contributors' in page:
                # A page's contributors can arrive spread over several
                # responses; collect them in a list owned by ed_dic so the
                # API response objects are never mutated.
                ed_dic.setdefault(page['title'], []).extend(page['contributors'])
            if 'anoncontributors' in page:
                anon_dic[page['title']] = page['anoncontributors']

# Reduce the collected records to counts.
ed_dic = {title: len(records) for title, records in ed_dic.items()}

# Assign each column once, outside any loop. Previously the mapping was
# re-run for every dict entry, and the column was never created when the
# dict was empty, which broke the later 'Total-Editors' computation.
titles = df.index.to_series()
df['Reg-Editors'] = titles.map(ed_dic)
df['Anon-Editors'] = titles.map(anon_dic)

In [None]:
#EDITS
# rev_dic: title -> number of revisions returned for the article
rev_dic = {}

for title_batch in chunks(Articles, 1):
    revision_request = {'titles': '|'.join(title_batch), 'prop': 'revisions',
                        'rvprop': 'user|size', 'rvlimit': 'max'}
    for payload in query(revision_request):
        for page in payload['pages'].values():
            if 'revisions' in page:
                # Only the count is needed, so add up the size of every
                # response page instead of keeping all revision records.
                rev_dic[page['title']] = rev_dic.get(page['title'], 0) + len(page['revisions'])

# Map once after the counts are complete (previously re-run per entry, and
# skipped entirely when no revisions were returned).
df['Edits'] = df.index.to_series().map(rev_dic)

In [None]:
# CREATE NEW COLUMNS
# Missing values are replaced by 0 first; the derived columns are then the
# editor total, the anonymous share of that total, and edits per editor.
df = df.fillna(0)
df['Total-Editors'] = df['Reg-Editors'].add(df['Anon-Editors'])
df['Prop-Anonymous'] = df['Anon-Editors'].div(df['Total-Editors'])
df['Edits/Editors'] = df['Edits'].div(df['Total-Editors'])

In [4]:
#scrape in-text citations

#Define batching helper
def chunks(l, n):
    """Generate successive pieces of ``l`` with a maximum size of ``n``."""
    for offset in range(0, len(l), n):
        piece = l[offset:offset + n]
        yield piece
            
#Define query
def query(request, max_retries=3):
    """Yield the 'parse' payload of every response for a parse request.

    The request is sent to ``https://<lang>.wikipedia.org/w/api.php`` (``lang``
    is read from the notebook globals) with ``action=parse`` and
    ``format=json`` added. This definition replaces any earlier ``query`` in
    the kernel once its cell has been run.

    Parameters
    ----------
    request : dict
        API parameters (e.g. 'page', 'prop'). The dict is not modified.
    max_retries : int, optional
        Number of times a failed HTTP call or undecodable response is
        retried before the generator gives up.

    Yields
    ------
    dict
        ``result['parse']`` of each response that has a 'parse' key.

    Notes
    -----
    Best-effort: an API 'error' response, or running out of retries, ends the
    generator without raising. Only request/decoding failures are retried;
    the previous bare ``except: pass`` retried forever and also swallowed
    KeyboardInterrupt, so the cell could not be interrupted.
    """
    # Kept global so the last raw response stays inspectable from other cells.
    global result
    base = dict(request, action='parse', format='json')
    lastContinue = {}
    failures = 0
    while True:
        req = base.copy()
        req.update(lastContinue)
        try:
            result = requests.get('https://%s.wikipedia.org/w/api.php' % lang, params=req).json()
        except (requests.exceptions.RequestException, ValueError) as exc:
            # Transport problems and invalid JSON are treated as transient.
            failures += 1
            if failures > max_retries:
                print('Giving up on %r after %d failed attempts: %s' % (request, failures, exc))
                return
            continue
        failures = 0
        if 'error' in result:
            return
        # NOTE(review): this tests for a JSON key named 'IncompleteRead';
        # kept as in the original.
        if 'IncompleteRead' in result:
            return
        if 'warnings' in result:
            print(result['warnings'])
        if 'parse' in result:
            yield result['parse']
        if 'continue' not in result:
            break
        lastContinue = result['continue']

In [38]:
#GET CITATIONS for EN NET
# NOTE(review): `reg1` is not defined anywhere in the visible notebook. It
# must be a compiled regular expression matching one in-text citation in the
# rendered article HTML; define it in an earlier cell before running this one.
cits_dic = {}

# The article HTML is cut at the first occurrence of each marker, in this
# order, so that only the text before these sections is searched.
#rename edit sections according to language
SECTION_MARKERS = (
    "Edit section: Footnotes",
    "Edit section: Citations",
    "Edit section: References",
    "Edit section: Bibliography",
    "Edit section: See also",
)

for n, title_batch in enumerate(chunks(Articles, 1)):
    print('Getting references ',round(100*n/len(Articles),3),'%')
    for parsed in query({'page': '|'.join(title_batch), 'prop': 'text'}):
        for html in parsed['text'].values():
            for marker in SECTION_MARKERS:
                html = html.partition(marker)[0]
            cits_dic[parsed['title']] = len(reg1.findall(html))

# Map once after all articles are processed (previously re-run inside a
# loop that only re-assigned every dict entry to itself).
df['Citations'] = df.index.to_series().map(cits_dic)

#CREATE COLUMN: citations per word
# Citations divided by length, matching the column name; the original
# computed Length / Citations, i.e. the inverse ratio.
# NOTE(review): 'Length' is the API's page 'length' value, presumably bytes
# and not words - confirm the intended unit.
df['CitePerWord'] = df['Citations'] / df['Length']
display(df)

Unnamed: 0,Length,Citations
Zweiter Weltkrieg,,355
Erster Weltkrieg,,364
Kalter Krieg,,39


In [None]:
#EXPORT DATA
# Change into the output directory and write the attribute table as
# '<lang>_Net_Attributes.csv'.
# NOTE(review): 'path' looks like a placeholder - set the real directory.
os.chdir('path')
output_name = '%s_Net_Attributes.csv' % lang
df.to_csv(output_name, encoding='utf_8_sig')