In [1]:
import pandas as pd
import MAGspark
import os

In [2]:
# Connect to an already-running cluster job and keep both returned handles.
# NOTE(review): `jobid` is hard-coded and presumably has to match the current
# `sparkcluster` job in the scheduler listing this call prints -- confirm and
# update it whenever the cluster job is restarted.
mag, spark = MAGspark.get_mag_with_cluster_connection(
    jobid=45069,
    memory_per_executor=14000,
    data_folderpath="/home/laal/MAG/DATA/",
)

['NAME STATE JOBID', 'piano_ppl RUNNING 45007', 'guitar_ppl RUNNING 45025', 'bass_ppl RUNNING 45026', 'sparkcluster RUNNING 45069', 'train-emb-concatenated-shuffled RUNNING 45190', 'ant_bullet RUNNING 44949', 'ant_bullet RUNNING 44948', 'ant_bullet RUNNING 44947', 'ant_bullet RUNNING 44946', 'ant_bullet RUNNING 44945', 'train-gpu RUNNING 45037', 'train-gpu RUNNING 45188', 'train-gpu RUNNING 45187', 'jupyter RUNNING 45070', '']


In [3]:
# Field name -> path of that field's ranking CSV.
# All four files follow one naming template, so the mapping is generated from
# the list of field names (insertion order is the order of the tuple).
field_mapping = {
    field_name: "/home/laal/MAG/DATA/NETWORKS/SimpleWeight{}2020CentralityGendered.csv".format(field_name)
    for field_name in ("Economics", "Psychology", "Mathematics", "Chemistry")
}

In [5]:
def append_name(field_mapping, base_folder="/home/laal/MAG/CentralityFairness/", top_n=1000,
                data_folder="/home/laal/MAG/DATA/"):
    """Add MAG author metadata to each field's ranking and export the top rows.

    For every ``field -> path`` entry in ``field_mapping`` this function:

    1. reads the tab-separated ranking at ``path``;
    2. writes its ``AuthorId`` column to a temporary file and registers that
       file as the ``RankingAuthors`` stream on the notebook-level ``mag``
       object;
    3. inner-joins ``RankingAuthors`` with ``Authors`` via ``mag.query_sql``
       to fetch ``PaperCount``, ``PaperFamilyCount``, ``CitationCount``,
       ``CreatedDate`` and ``DisplayName``;
    4. merges those columns back onto the ranking and writes the result,
       tab-separated, next to the input as ``<input stem>Named.csv``;
    5. sorts by ``PageRank`` (descending) and writes the first ``top_n`` rows,
       comma-separated, to ``<base_folder>/TOP<top_n>/<field>_top<top_n>.csv``.

    Parameters
    ----------
    field_mapping : dict
        Maps a field name to the path of its ranking file. Each file must
        contain at least the columns ``AuthorId`` and ``PageRank``.
    base_folder : str
        Directory under which the ``TOP<top_n>`` folder is created. A trailing
        path separator is optional.
    top_n : int
        Number of highest-PageRank rows to export per field.
    data_folder : str
        Directory the temporary ``TMP_ranking_authors.txt`` file is written to.
        NOTE(review): this presumably has to be the same folder ``mag`` was
        created with (``data_folderpath``), since the stream is registered by
        bare file name -- confirm against MAGspark.

    Returns
    -------
    None
        Results are written to disk; progress is printed.

    Notes
    -----
    Relies on the global ``mag`` object created earlier in the notebook.
    """
    tmp_filename = 'TMP_ranking_authors.txt'
    # os.path.join keeps the paths correct whether or not the folder arguments
    # end with a separator (plain string concatenation did not).
    tmp_path = os.path.join(data_folder, tmp_filename)
    topn_dir = os.path.join(base_folder, 'TOP{}'.format(top_n))

    # The returned dataframe is not used here. The call is kept because it
    # presumably makes `Authors` available to the SQL query below -- TODO confirm.
    mag.getDataframe('Authors')

    for field, fpath in field_mapping.items():

        print("Extracting author names for {}".format(field))

        df = pd.read_csv(fpath, sep="\t")

        # Expose this field's author ids to the SQL layer through a temp file.
        mag.streams['RankingAuthors'] = (tmp_filename, ['AuthorId:long'])
        df[['AuthorId']].to_csv(tmp_path, index=False, sep="\t", header=False)
        # Return value unused; kept for the same (assumed) registration side effect.
        mag.getDataframe('RankingAuthors')

        query = """
            SELECT r.*, a.PaperCount, a.PaperFamilyCount, a.CitationCount, a.CreatedDate, a.DisplayName
            FROM RankingAuthors r
            INNER JOIN Authors a ON r.AuthorId = a.AuthorId
        """

        ranking_named_df = mag.query_sql(query).toPandas()

        named_df = pd.merge(df, ranking_named_df, on='AuthorId', how='inner')

        # Strip only a trailing ".csv". Splitting on the first ".csv" anywhere
        # in the path would cut paths such as ".../run.csv_exports/rank.csv"
        # in the middle of a directory name.
        stem = fpath[:-len('.csv')] if fpath.endswith('.csv') else fpath
        new_path = stem + 'Named.csv'
        named_df.to_csv(new_path, sep="\t", index=False)

        print("Named ranking stored to {}".format(new_path))

        # Creates missing parents too and is a no-op when the folder exists,
        # avoiding the check-then-create race of exists() + mkdir().
        os.makedirs(topn_dir, exist_ok=True)

        head = named_df.sort_values(by='PageRank', ascending=False).head(top_n)

        topn_path = os.path.join(topn_dir, '{}_top{}.csv'.format(field, top_n))
        head.to_csv(topn_path, index=False)

        print("Top {} stored to {}\n\n".format(top_n, topn_path))

In [6]:
# Produce the named rankings for every configured field and export the
# 10000 highest-PageRank authors of each.
append_name(
    field_mapping,
    base_folder="/home/laal/MAG/CentralityFairness/",
    top_n=10000,
)

Extracting author names for Chemistry
Named ranking stored to /home/laal/MAG/DATA/NETWORKS/SimpleWeightChemistry2020CentralityGenderedNamed.csv
Top 10000 stored to /home/laal/MAG/CentralityFairness/TOP10000/Chemistry_top10000.csv


