In [None]:
%matplotlib inline  
import glob
import zipfile
#import multiprocessing as mp
import gc
#from collections import defaultdict
import csv
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats


In [None]:
# Load ./data/wikitrafficrank_ISO.txt (tab-separated).  DataFrame.from_csv no
# longer exists in current pandas; read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first column becomes the index).
# NOTE(review): `wikistat` is rebound by the Allwiki load a few cells below
# before any visible cell reads it, so this load looks unused -- confirm.
wikistat = pd.read_csv(
    "./data/wikitrafficrank_ISO.txt", sep="\t", index_col=0, parse_dates=True
)

In [None]:
# Load the tab-separated CIA table.  read_csv(index_col=0, parse_dates=True)
# matches the defaults of the removed DataFrame.from_csv; reset_index() then
# turns that first column back into an ordinary column so it can be merged on.
ciastat = pd.read_csv(
    "./data/CIA_LANGUAGE_MODIFIED.txt", sep="\t", index_col=0, parse_dates=True
).reset_index()

In [None]:
# Load the tab-separated primary-country-per-language table.  The first column
# becomes the index, as it did with the removed DataFrame.from_csv.
primlang = pd.read_csv(
    "./data/primary_country_for_lang.txt", sep="\t", index_col=0, parse_dates=True
)

In [None]:
# OECD GDP export, read with read_csv defaults (comma-separated, no index column).
# NOTE(review): no cell visible in this notebook section reads GDP_OECD.
GDP_OECD = pd.read_csv(
    "./OECD data/Economy/Domestic Product/Gross domestic product(GDP).csv"
)

In [None]:
# Inner-join the CIA rows with the primary-language table on ISO2, then keep
# only the language key, the country identifiers and the indicator columns.
ciastat_filtered = pd.merge(ciastat, primlang, on="ISO2")[
    [
        'prim_lang_code',
        'COUNTRY',
        'ISO2',
        'ISO3',
        'POPULATION',
        'INTERNET_USER',
        'GDP(PPA)',
    ]
]

In [None]:
# Load the comma-separated per-wiki statistics table; this rebinds `wikistat`.
# read_csv(index_col=0, parse_dates=True) matches the defaults of the removed
# DataFrame.from_csv (first column becomes the index).
wikistat = pd.read_csv(
    "./data/Allwiki_output_modify_langinfo.txt", sep=",", index_col=0, parse_dates=True
)

In [None]:
# Attach country data to every wiki row whose lang_code equals a country's
# primary-language code (inner join).  The right-hand key carries the same
# values as lang_code on every joined row, so it is dropped afterwards.
# NOTE: this rebinds `wikistat` from itself; re-run the Allwiki load cell
# before re-running this one.
wikistat = pd.merge(
    wikistat,
    ciastat_filtered,
    left_on="lang_code",
    right_on="prim_lang_code",
).drop("prim_lang_code", axis=1)

In [None]:
# Load the tab-separated applicant table.  read_csv(index_col=0,
# parse_dates=True) matches the defaults of the removed DataFrame.from_csv.
patent_applicant = pd.read_csv(
    "./data/sup_r001_gini_applicant.csv", sep="\t", index_col=0, parse_dates=True
)

In [None]:
# Inner-join the wiki/country rows with the applicant table: a row is kept
# when the country's ISO2 code equals the applicant table's appln_auth.
applnstat = pd.merge(
    wikistat,
    patent_applicant,
    left_on="ISO2",
    right_on="appln_auth",
)

In [None]:
# Project codes as they appear in the "type" column, in plotting order.
types = [
    'wiki',
    'wiktionary',
    'wikibooks',
    'wikiquote',
    'wikisource',
    'wikinews',
    'wikivoyage',
    'wikiversity',
]

# Project code -> display label used on the x axis (labels listed in the same
# order as `types`).
types_map = dict(zip(types, [
    'Wikipedia',
    'Wiktionary',
    'Wikibooks',
    'Wikiquote',
    'Wikisource',
    'Wikinews',
    'Wikivoyage',
    'Wikiversity',
]))

In [None]:
# FIG_SUP_017: per wiki project type, correlate four wiki activity columns
# with one patent-side column.  The 2x3 grid has one column of panels per
# patent-side column and one row per statistic (top: Pearson, bottom:
# Kendall tau).
# NOTE(review): person_id / appln_id are used as numeric per-country values
# (presumably counts, given the $N_{..}$ axis labels) -- confirm against the
# source CSVs.

# (legend label, wiki column) pairs; list order is the bar order in each group.
wiki_metrics = [
    ("$N_p$", "4:Num_Editor"),
    ("$N_a$", "6:Num_Article"),
    ("$N_e$", "3:Num_Edit"),
    ("$S$", "7:Wiki_Size"),
]

# read_csv(index_col=0, parse_dates=True) matches the defaults of the removed
# DataFrame.from_csv.
# NOTE(review): unlike the other inputs, this file is read from the working
# directory rather than ./data/ -- confirm the path is intended.
patent_inventor = pd.read_csv(
    "sup_r001_gini_inventor.csv", sep="\t", index_col=0, parse_dates=True
)
invstat = wikistat.merge(patent_inventor, left_on="ISO2", right_on="appln_auth")

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# One entry per panel: (axes, merged table, patent-side column, statistic, y label).
panels = [
    (axes[0, 0], applnstat, "person_id", scipy.stats.pearsonr, "Pearson correlation for $N_{ap}$"),
    (axes[1, 0], applnstat, "person_id", scipy.stats.kendalltau, "Kendall-tau for $N_{ap}$"),
    (axes[0, 1], applnstat, "appln_id", scipy.stats.pearsonr, "Pearson correlation for $N_{pt}$"),
    (axes[1, 1], applnstat, "appln_id", scipy.stats.kendalltau, "Kendall-tau correlation for $N_{pt}$"),
    (axes[0, 2], invstat, "person_id", scipy.stats.pearsonr, "Pearson correlation for $N_{iv}$"),
    (axes[1, 2], invstat, "person_id", scipy.stats.kendalltau, "Kendall-tau correlation for $N_{iv}$"),
]

for ax, table, target, corr, ylabel in panels:
    rows = []
    for tp in types:
        # Rows of this project type that have a language name and a value in
        # the patent-side column.
        subset = table[
            (table["type"] == tp)
            & (table["lang_name"] != "")
            & (table[target].notnull())
        ]
        # Both scipy functions return (statistic, p-value); only the
        # statistic is plotted.
        rows.append([corr(subset[target], subset[column])[0] for _, column in wiki_metrics])

    # Build the frame in one go with a float dtype so it is always plottable.
    graph = pd.DataFrame(
        rows,
        index=[types_map[tp] for tp in types],
        columns=[label for label, _ in wiki_metrics],
        dtype=float,
    )
    ax.set_ylabel(ylabel)
    graph.plot.bar(rot=40, width=0.8, ax=ax)

# Common cosmetics; fig.get_axes() yields the panels in creation (row-major)
# order, which is the order the panel letters are assigned in.
for letter, ax in zip("abcdefgh", fig.get_axes()):
    ax.get_yaxis().set_label_coords(-0.1, 0.5)
    ax.yaxis.label.set_fontsize(14)
    ax.tick_params(axis='x', labelsize=14)
    for tick in ax.xaxis.get_majorticklabels():
        tick.set_horizontalalignment("right")
    ax.legend(loc=2, ncol=4)
    # Same y range on every panel so bar heights are comparable across panels.
    ax.set_ylim(-0.3, 1.2)
    ax.axhline(0, color='black', lw=1)
    ax.text(-0.15, 0.95, letter, fontsize=14, weight='bold', transform=ax.transAxes)

plt.tight_layout()
plt.savefig("FIG_SUP_017.pdf")

plt.show()

In [None]:
# FIG_SUP_018: per wiki project type, correlate four wiki activity columns
# with one paper-side column.  The 2x2 grid has one column of panels per
# paper-side column and one row per statistic (top: Pearson, bottom:
# Kendall tau).
# NOTE(review): auid / eid are used as numeric per-country values (presumably
# counts, given the $N_{..}$ axis labels) -- confirm against the source CSV.

# (legend label, wiki column) pairs; list order is the bar order in each group.
wiki_metrics = [
    ("$N_p$", "4:Num_Editor"),
    ("$N_a$", "6:Num_Article"),
    ("$N_e$", "3:Num_Edit"),
    ("$S$", "7:Wiki_Size"),
]

# read_csv(index_col=0, parse_dates=True) matches the defaults of the removed
# DataFrame.from_csv.
paper_author = pd.read_csv(
    "./data/sup_r002_gini_scopus_paper.csv", sep="\t", index_col=0, parse_dates=True
)
paperstat = wikistat.merge(paper_author, left_on="ISO3", right_on="af_country")

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

# One entry per panel: (axes, paper-side column, statistic, y label).
panels = [
    (axes[0, 0], "auid", scipy.stats.pearsonr, "Pearson correlation for $N_{au}$"),
    (axes[1, 0], "auid", scipy.stats.kendalltau, "Kendall-tau correlation for $N_{au}$"),
    (axes[0, 1], "eid", scipy.stats.pearsonr, "Pearson correlation for $N_{ar}$"),
    (axes[1, 1], "eid", scipy.stats.kendalltau, "Kendall-tau correlation for $N_{ar}$"),
]

for ax, target, corr, ylabel in panels:
    rows = []
    for tp in types:
        # Rows of this project type that have a language name and a value in
        # the paper-side column.
        subset = paperstat[
            (paperstat["type"] == tp)
            & (paperstat["lang_name"] != "")
            & (paperstat[target].notnull())
        ]
        # Both scipy functions return (statistic, p-value); only the
        # statistic is plotted.
        rows.append([corr(subset[target], subset[column])[0] for _, column in wiki_metrics])

    # Build the frame in one go with a float dtype so it is always plottable.
    graph = pd.DataFrame(
        rows,
        index=[types_map[tp] for tp in types],
        columns=[label for label, _ in wiki_metrics],
        dtype=float,
    )
    ax.set_ylabel(ylabel)
    graph.plot.bar(rot=40, width=0.8, ax=ax)

# Common cosmetics; fig.get_axes() yields the panels in creation (row-major)
# order, which is the order the panel letters are assigned in.
for letter, ax in zip("abcdefgh", fig.get_axes()):
    ax.get_yaxis().set_label_coords(-0.1, 0.5)
    ax.yaxis.label.set_fontsize(14)
    ax.tick_params(axis='x', labelsize=14)
    for tick in ax.xaxis.get_majorticklabels():
        tick.set_horizontalalignment("right")
    ax.legend(loc=2, ncol=4)
    # Same y range on every panel so bar heights are comparable across panels.
    ax.set_ylim(-0.3, 1.2)
    ax.axhline(0, color='black', lw=1)
    ax.text(-0.15, 0.95, letter, fontsize=14, weight='bold', transform=ax.transAxes)

plt.tight_layout()
plt.savefig("FIG_SUP_018.pdf")
plt.show()
