In [4]:
# Analysis stack used by the cells below.
import pandas as pd
import numpy as np
# NOTE(review): `np` and `CitationNetwork` are not referenced in the cells visible here —
# confirm nothing else needs them before removing.
from MAG_network import CitationNetwork

from matplotlib import pyplot as plt

import sys

# Extend the module search path before the project import below; keep this line ahead of it.
# NOTE(review): the inserted directory is .../CentralityFairness/Evaluations, while the import
# uses the dotted path `Evaluations.Evaluator` — presumably that resolves through the working
# directory rather than through this entry; verify.
sys.path.insert(0,"/home/laal/MAG/CentralityFairness/Evaluations")

from Evaluations.Evaluator import Evaluator


# Apply matplotlib's 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')

In [5]:
# Single place to change when the data lives somewhere else.
NETWORKS_DIR = "/home/laal/MAG/DATA/NETWORKS"


def load_gendered_centrality(filename):
    """Read one tab-separated centrality file and drop rows whose Gender is -1.

    Parameters
    ----------
    filename : str
        Name of a file inside NETWORKS_DIR.

    Returns
    -------
    pandas.DataFrame
        The rows of the file with ``Gender != -1``; row labels are the ones
        assigned by ``read_csv`` (no re-indexing after the filter).
    """
    # NOTE(review): -1 presumably marks authors with unknown gender — confirm against
    # the script that produced these files.
    full_table = pd.read_csv("{}/{}".format(NETWORKS_DIR, filename), sep="\t")
    return full_table.query("Gender != -1")


econ_centrality = load_gendered_centrality("SimpleWeightEconomics2020CentralityGendered.csv")
psych_centrality = load_gendered_centrality("SimpleWeightPsychology2020CentralityGendered.csv")

In [7]:
# Metric names handed to Evaluator.run_evaluations on every call to evaluate().
METRICS = ['rND']
# Centrality names of interest.
# NOTE(review): not referenced by any cell visible here.
CENTRALITIES = ['PageRank', 'PageRank05', 'InDegreeStrength', 'Rank']
# Value shown in the plot title and embedded in the saved figure's file name.
# NOTE(review): never passed to Evaluator in the visible code — confirm it matches the
# cutpoint the evaluator actually uses.
cutpoint = 10


def evaluate(centrality, data):
    """Run the configured METRICS for one centrality on one data set.

    Parameters
    ----------
    centrality : str
        Forwarded to ``Evaluator`` as its ``centrality`` argument.
    data : object
        Forwarded to ``Evaluator`` as its ``data`` argument (the notebook
        passes pandas DataFrames).

    Returns
    -------
    Whatever ``Evaluator.run_evaluations(METRICS)`` returns; the plotting
    cell indexes the result with ``['rND']``.
    """
    # Named `evaluator` rather than `eval` so the builtin eval() is not shadowed.
    evaluator = Evaluator(centrality=centrality, data=data)
    return evaluator.run_evaluations(METRICS)

In [None]:
# Smallest sample size, and the increment between consecutive sample sizes.
SAMPLE_STEP = 20000

# Both fields are sampled at every size, and DataFrame.sample draws without replacement by
# default, so the sizes must stay below the row count of the SMALLER frame. Bounding by the
# economics frame alone would raise ValueError as soon as psychology had fewer rows.
max_sample = min(econ_centrality.shape[0], psych_centrality.shape[0])
sample_sizes = range(SAMPLE_STEP, max_sample, SAMPLE_STEP)

score_records_econ = []
score_records_psych = []

for samplen in sample_sizes:
    # The sample size doubles as the random seed, which makes every draw repeatable.
    scores_econ = evaluate('PageRank', econ_centrality.sample(samplen, random_state=samplen))
    score_records_econ.append(scores_econ)

    scores_psych = evaluate('PageRank', psych_centrality.sample(samplen, random_state=samplen))
    score_records_psych.append(scores_psych)

# One row per sample size, in increasing order of size.
econ_scores = pd.DataFrame.from_records(score_records_econ)
psych_scores = pd.DataFrame.from_records(score_records_psych)

gender counts normalized:
1    0.7274
0    0.2726
Name: Gender, dtype: float64
gender counts:
1    14548
0     5452
Name: Gender, dtype: int64
sorted data.head():
              item      rank  PageRank05  InDegreeStrength  InDegree  \
760842  2397423007  0.001323    0.000483         33472.990   24635.0   
335001  2099607353  0.000858    0.000280         12258.514   12545.0   
422325  1970476475  0.000748    0.000344         25434.830   26085.0   
133206   621481202  0.000544    0.000231          8662.692    8800.0   
205359  2187985802  0.000509    0.000226          7085.694    8056.0   

        OutDegreeStrength  OutDegree  Gender   Rank  protected  
760842         4898.27440     1682.0       1  11754          0  
335001          947.90753      656.0       1  12087          0  
422325         5402.52100     3400.0       1  11959          0  
133206          955.72620      585.0       1  12753          0  
205359            0.00000        0.0       1  13121          0  
gender counts 

gender counts normalized:
1    0.50042
0    0.49958
Name: Gender, dtype: float64
gender counts:
1    50042
0    49958
Name: Gender, dtype: int64
sorted data.head():
               item      rank  PageRank05  InDegreeStrength  InDegree  \
1813673  2132289742  0.000860    0.000515         77976.390  155403.0   
1975234  2116067072  0.000502    0.000239         35619.027  107652.0   
164460    182094527  0.000289    0.000141         23576.057   44515.0   
1887623  2022872097  0.000285    0.000183         20053.947   27887.0   
2058547  2131203610  0.000263    0.000126         28869.295   47503.0   

         OutDegreeStrength  OutDegree  Gender   Rank  protected  
1813673          217.08562     1378.0       0  11958          1  
1975234         3024.47100     9136.0       1  11410          0  
164460          1392.47010     4293.0       1  12472          0  
1887623            8.00000       12.0       1  11861          0  
2058547         5146.71630     7204.0       1  12406          0  


In [None]:
# Reference scores computed on the complete frame of each field.
econ_global = evaluate('PageRank', econ_centrality)
psych_global = evaluate('PageRank', psych_centrality)


def _sample_size_index(n_rows, step=20000):
    """Return the sample sizes ``step, 2*step, ...`` for the first ``n_rows`` sampling rounds."""
    return [step * (round_no + 1) for round_no in range(n_rows)]


# Label each score row with the sample size that produced it. The labels are derived from the
# number of rows actually recorded, so each frame is indexed from its own length instead of
# re-deriving the range from the economics frame for both.
econ_scores.index = _sample_size_index(len(econ_scores))
psych_scores.index = _sample_size_index(len(psych_scores))

In [None]:
# One figure, one Axes; everything below draws on `ax` explicitly.
fig, ax = plt.subplots(figsize=(12, 8))

# rND per sample size, one line per field.
econ_scores['rND'].plot(style='-o', ax=ax, label="Economics")
psych_scores['rND'].plot(style='-o', ax=ax, label="Psychology")

# Dashed horizontal references at the rND obtained on each complete frame.
ax.axhline(y=econ_global['rND'], label="Economics global", linestyle='--', alpha=0.4, color="orangered")
ax.axhline(y=psych_global['rND'], label="Psychology global", linestyle='--', alpha=1, color="lightblue")

# Fixed y-range: values outside 0.08-0.14 will not be visible.
ax.set_ylim(0.08, 0.14)
ax.set_title("rND with increasing sample sizes (PageRank). Cutpoint = {}".format(cutpoint))
ax.set_ylabel('rND')
ax.set_xlabel("Sample size")
ax.legend()

# Write the file before show(), while the figure is still open.
fig.savefig("rnd_sample_sizes_cutpoint_{}.png".format(cutpoint))
plt.show()