## Longitudinal analysis 

This notebook constructs the WACNs and computes the centrality measures used in the longitudinal analysis described in Section 3 of the thesis.

In [2]:
# Project-local modules (MAG_network, MAGspark, matching) plus the third-party
# libraries used by the notebook.
from MAG_network import CitationNetwork
import os 
from matplotlib import pyplot as plt
import pandas as pd
import findspark
import MAGspark 

# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation and runtime warnings that might be relevant.
import warnings
warnings.filterwarnings("ignore")

from matching import Matcher
import sys

# set environment variables
# NOTE: these are absolute, machine-specific paths; adjust them before running
# the notebook on another host. They are set here, before the Spark connection
# is requested in the next cell.
# SPARK_LOCAL_DIRS: presumably the scratch directory Spark uses for temporary data.
os.environ["SPARK_LOCAL_DIRS"] = "/home/laal/MAG/TMP"
# JAVA_HOME / SPARK_HOME: locations of the Java runtime and the Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.242.b08-0.el7_7.x86_64"
os.environ['SPARK_HOME'] = "/home/laal/MAG/spark-3.0.2-bin-hadoop2.7"

In [3]:
# Connect to the Spark cluster running under job id 45236 and obtain the
# project's MAG handle together with the Spark handle. The job id is specific
# to one cluster allocation and must be updated for every new allocation.
mag, spark = MAGspark.get_mag_with_cluster_connection(
    jobid=45236,
    memory_per_executor=14000,
    data_folderpath="/home/laal/MAG/DATA/",
)

['NAME STATE JOBID', 'train-emb-concatenated-shuffled PENDING 45207', 'baseline-sgd-tweets-shuffled-batch_size-2 PENDING 45210', 'baseline-bilstm-language-en-hi-ro PENDING 45227', 'baseline-bilstm-language-ar-arz PENDING 45226', 'baseline-bilstm-language-en-ne-ro PENDING 45228', 'piano_ppl RUNNING 45007', 'guitar_ppl RUNNING 45025', 'bass_ppl RUNNING 45026', 'sparkcluster RUNNING 45236', 'baseline-sgd-tweets-shuffled-batch_size-1 RUNNING 45209', 'baseline-sgd-tweets-shuffled-batch_size-0 RUNNING 45208', 'ant_bullet RUNNING 45234', 'ant_bullet RUNNING 45220', 'baseline-sgd-tweets-shuffled-units-1 RUNNING 45205', 'baseline-sgd-tweets-shuffled-units-2 RUNNING 45206', 'baseline-sgd-tweets-shuffled-units-0 RUNNING 45204', 'baseline-sgd-tweets-shuffled-architecture-0 RUNNING 45193', 'train-gpu RUNNING 45037', 'jupyter RUNNING 45235', 'train-gpu RUNNING 45188', 'train-gpu RUNNING 45187', 'lil_bobby RUNNING 45229', 'lil_bobby RUNNING 45230', 'lil_bobby RUNNING 45231', 'lil_bobby RUNNING 45232'

In [4]:
# Bare expression: shows the object returned above as the cell's output
# (presumably the Spark session summary -- confirm against MAGspark).
spark

### Projection of homogeneous slices of paper references into WACNs

In [3]:
def _compute_slice_windows(num_references, max_number):
    """Return the overlapping row-number windows used to slice the references.

    Windows start at row number 1 and advance by half of ``max_number``. Each
    window ends at ``start_index + max_number``, clipped to ``num_references``;
    generation stops as soon as a window reaches ``num_references``.

    Parameters
    ----------
    num_references : int
        Total number of ranked reference rows.
    max_number : int
        Window span (the largest number of references produced in one year).

    Returns
    -------
    list of dict
        One dict per window with the keys ``slice_index``, ``start_index``
        and ``end_index``. Empty when ``num_references`` is less than 2.
    """
    # A span of 1 would give a step of 0, which range() rejects; advance by at
    # least one row so the degenerate case still terminates.
    step = max(1, max_number // 2)

    windows = []
    for slice_idx, start_index in enumerate(range(1, num_references, step)):
        end_index = min(start_index + max_number, num_references)
        windows.append({
            'slice_index': slice_idx,
            'start_index': start_index,
            'end_index': end_index
        })
        if end_index == num_references:
            break

    return windows


def build_wacn_slices(fos_id, fos_name, root_data_folder, csv_filepath):
    """Define date-ordered reference slices for a field and export one network per slice.

    If ``csv_filepath`` does not exist yet, the field's paper references are
    ranked by publication date (row numbers starting at 1), the year with the
    most references determines the window span, and overlapping windows are
    written to ``csv_filepath`` together with the references found at each
    window border. The slice table is then read back from ``csv_filepath`` and
    ``network.save_author_network`` is called once per slice.

    Parameters
    ----------
    fos_id : int
        Field-of-study identifier passed to ``CitationNetwork``.
    fos_name : str
        Field-of-study name; used in log messages and in the network names.
    root_data_folder : str
        Data folder passed to ``CitationNetwork``.
    csv_filepath : str
        Location of the slice definition CSV (created when missing).

    Returns
    -------
    None
        Returns early, without writing the CSV, when no dated references exist
        or when too few exist to form a single slice.
    """
    network = CitationNetwork(mag, fos_id=fos_id, fos_name=fos_name, root_data_folder=root_data_folder)
    network.check_references_and_citations(overwrite=False)

    print("Extracting ordered references")

    # The returned dataframes are not used directly. The calls are kept because
    # they presumably make the tables available to the SQL below -- TODO confirm.
    network.mag.getDataframe('Papers')
    network.mag.getDataframe(network.paper_references_name)

    if not os.path.exists(csv_filepath):
        # NOTE(review): the ordering has no tie-breaker, so references sharing a
        # date may receive different row numbers between runs. Confirm that
        # save_author_network ranks rows the same way before changing it.
        query = """
            SELECT pr.PaperId, pr.PaperReferenceId,
            p.Date as pubDate, 
            row_number() over (order by p.Date ASC) rownum
            FROM {} pr
            INNER JOIN Papers p ON pr.PaperId = p.PaperId
            WHERE p.Date is not null and p.Date <= '2020-12-31'
        """.format(network.paper_references_name)

        prefs_with_rownum = network.mag.query_sql(query)
        prefs_with_rownum.createOrReplaceTempView('PaperReferencesRownum')

        num_references = prefs_with_rownum.count()

        if num_references == 0:
            # Without rows the per-year table below is empty and cannot be indexed.
            print("No dated references found for {}; no slices created".format(fos_name))
            return

        query = """
            SELECT 
            YEAR(pubDate), COUNT(*) as numReferences
            FROM PaperReferencesRownum 
            GROUP BY Year(pubDate)
            ORDER BY COUNT(*) DESC
        """
        year_counts = network.mag.query_sql(query).toPandas()
        # Rows are sorted by descending count, so the first row is the busiest year.
        busiest_year = year_counts.iloc[0]
        max_number = int(busiest_year.numReferences)

        print("The year {} had the maximum number of references produced: {}"
             .format(int(busiest_year['year(pubDate)']), max_number))

        print("\nCreating slices")

        slice_records = _compute_slice_windows(num_references, max_number)

        if not slice_records:
            # An empty list would produce a frame without columns and an
            # invalid "IN ()" clause in the border query below.
            print("Too few references ({}) to create slices for {}".format(num_references, fos_name))
            return

        slices_df = pd.DataFrame.from_records(slice_records)

        # Fetch the reference sitting at every window border in a single query.
        all_indexes = list(slices_df.start_index.values) + list(slices_df.end_index.values)
        all_indexes_string = "(" + ",".join([str(x) for x in all_indexes]) + ")"

        query = """
            SELECT * FROM PaperReferencesRownum 
            WHERE rownum IN {}
        """.format(all_indexes_string)

        border_references_df = network.mag.query_sql(query).toPandas()

        # Attach the border rows twice: once for the window start and once for
        # the window end. The second merge suffixes the clashing columns with
        # _lower (start) and _upper (end).
        slice_df = pd.merge(slices_df, border_references_df, how='left', left_on="start_index", right_on="rownum")
        slice_df = pd.merge(slice_df, border_references_df, how='left', left_on="end_index", right_on="rownum",
                            suffixes=('_lower', '_upper'))

        slice_df.to_csv(csv_filepath, index=False)
        print("{} slices identified. Saved to {}".format(len(slice_df), csv_filepath))

    # Always work from the file on disk so cached and fresh runs behave the same.
    slice_df = pd.read_csv(csv_filepath)

    # loop over rows and create networks
    for record in slice_df.to_dict('records'):

        print("Extracting {} slice index {} / {}".format(fos_name, record['slice_index'], len(slice_df)))

        network_name = "SimpleWeight{}2020Slice{}".format(fos_name, record['slice_index'])
        network.save_author_network(network_name,
                                    mindate='1800-01-01',
                                    maxdate='2020-12-31',
                                    min_rownum=record['start_index'],
                                    max_rownum=record['end_index'])

In [4]:
# Create (or reuse) the Economics slice table and export one network per slice.
build_wacn_slices(
    fos_id=162324750,
    fos_name='Economics',
    root_data_folder="/home/laal/MAG/DATA",
    csv_filepath="/home/laal/MAG/CentralityFairness/SLICES/Economics2020.csv",
)

Paper references and citations available for Economics
Extracting ordered references
Extracting Economics slice index 0 / 30
Paper references and citations available for Economics
Network SimpleWeightEconomics2020Slice0 saved to /home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2020Slice0.txt
Extracting Economics slice index 1 / 30
Paper references and citations available for Economics
Network SimpleWeightEconomics2020Slice1 saved to /home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2020Slice1.txt
Extracting Economics slice index 2 / 30
Paper references and citations available for Economics
Network SimpleWeightEconomics2020Slice2 saved to /home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2020Slice2.txt
Extracting Economics slice index 3 / 30
Paper references and citations available for Economics
Network SimpleWeightEconomics2020Slice3 saved to /home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2020Slice3.txt
Extracting Economics slice index 4 / 30
Paper references and citations availabl

## Computing centrality measures on longitudinal WACNs

In [None]:
def build_wacn_graphs(fos_id, fos_name, root_data_folder, csv_filepath):
    """Compute centrality measures for every slice network of a field.

    Reads the slice table from ``csv_filepath``. For each slice it loads the
    network named ``SimpleWeight<fos_name>2020Slice<slice_index>``, builds the
    graph, computes centralities with a PageRank damping of 0.85, tags the
    result with the slice index, and writes it to a tab-separated, header-less
    master file ``<root_data_folder>/NETWORKS/SimpleWeight<fos_name>2020SliceMasterCentrality.csv``.
    Slice 0 (re)creates the master file; every other slice appends to it.

    Parameters
    ----------
    fos_id : int
        Field-of-study identifier passed to ``CitationNetwork``.
    fos_name : str
        Field-of-study name; used in log messages and file names.
    root_data_folder : str
        Data folder passed to ``CitationNetwork``; outputs are written to its
        ``NETWORKS`` sub-folder.
    csv_filepath : str
        Location of the slice definition CSV.

    Returns
    -------
    None
        Returns early, after printing a message, when ``csv_filepath`` is missing.
    """
    network = CitationNetwork(mag, fos_id=fos_id, fos_name=fos_name, root_data_folder=root_data_folder)
    network.check_references_and_citations(overwrite=False)

    print("Extracting ordered references")

    if not os.path.exists(csv_filepath):
        # Say why nothing happens instead of returning silently.
        print("Slice file {} not found; nothing to compute".format(csv_filepath))
        return

    slice_df = pd.read_csv(csv_filepath)

    # Derive the output locations from root_data_folder instead of a fixed
    # absolute path, so the function also works for other data folders.
    networks_folder = os.path.join(root_data_folder, "NETWORKS")
    master_path = os.path.join(networks_folder,
                               "SimpleWeight{}2020SliceMasterCentrality.csv".format(fos_name))

    # loop over rows and compute centralities
    for record in slice_df.to_dict('records'):

        network_name = "SimpleWeight{}2020Slice{}".format(fos_name, record['slice_index'])

        # NOTE(review): nothing in this function writes this per-slice file.
        # Confirm it is produced elsewhere; otherwise this check never skips.
        if os.path.exists(os.path.join(networks_folder, network_name + "Centrality.csv")):
            continue

        print("Extracting {} slice index {} / {}".format(fos_name, record['slice_index'], len(slice_df)))

        network.load_author_author_network(network_name)
        graph, node_mapping, eweight = network.build_graph()
        df = network.compute_centralities(graph, node_mapping, eweight, pr_damping=0.85)
        df['sliceid'] = record['slice_index']

        # Slice 0 starts the master file afresh; later slices are appended.
        write_mode = 'w' if record['slice_index'] == 0 else 'a'
        df.to_csv(master_path, index=False, header=False, mode=write_mode, sep="\t")
            

In [None]:
# Compute the centrality measures for every Chemistry slice network.
build_wacn_graphs(
    fos_id=185592680,
    fos_name='Chemistry',
    root_data_folder="/home/laal/MAG/DATA",
    csv_filepath="/home/laal/MAG/CentralityFairness/SLICES/Chemistry2020.csv",
)