# Data Processing
Creating a coauthor-to-coauthor edge list from the LitCovid author dataset.

### Import the author dataset CSV as a pandas DataFrame

In [58]:
import pandas as pd, itertools

# Load the LitCovid author table and discard its first CSV column
# (presumably a row index that was saved with the file -- confirm against the CSV).
author_df = pd.read_csv('data/LitCovid_author_list.csv').iloc[:, 1:]

author_df

Unnamed: 0,Author ID,Full Name,First Name,Middle Name,Last Name,Country,Covid-19 Publications,Other Coronavirus Publications,Total Publications on Coronavirus,betweenness,centrality
0,88687508,"Antinori, A",A,,Antinori,Italy,32950740,,1,0.000000,0.180207
1,88673946,"Vivaldi, Caterina",Caterina,,Vivaldi,Italy,33060148,,1,0.000000,0.169635
2,88666710,"van de Veerdonk, Frank",Frank,,van de Veerdonk,Netherlands,32876697,,1,0.000000,0.194202
3,88642229,"Gupta, Kavita",Kavita,,Gupta,India,33085120,,1,0.000000,0.511628
4,88639284,"Li, Philip",Philip,,Li,China,33026775,,1,0.000000,0.124367
...,...,...,...,...,...,...,...,...,...,...,...
300639,88698228,"Abou-Samra, Abdul-Badi",Abdul-Badi,,Abou-Samra,Qatar,32919838;32949777;33033033;33126120,,4,0.000000,0.114880
300640,88698239,"Ganesh, Aravind",Aravind,,Ganesh,Canada,32817352;33128573,,2,58193.903170,0.152032
300641,88662237,"Heaton, Nicholas",Nicholas,S,Heaton,United States,32353252;32843534;33128895,,3,12429.000000,0.177032
300642,88698398,"Andr�, Nicolas",Nicolas,,Andr�,France,32383827;32618416;32893961;33129040,,4,706.150465,0.177841


### Iterate through author dataframe to create publication dataframe

In [59]:
# pub_dict maps publication ID -> {author ID (as str) -> author attributes}.
pub_dict = {}
# Number of 'nan' publication tokens encountered (authors without COVID-19 publications).
skip_count = 0

# To try this on a small sample first, loop over author_df.head(20000).iterrows() instead.
for _, author_row in author_df.iterrows():
    # The publications cell holds one or more IDs separated by ';'.
    # A missing value stringifies to 'nan' and is counted as skipped below.
    publication_ids = str(author_row["Covid-19 Publications"]).split(';')

    for pub_id in publication_ids:
        if pub_id == 'nan':
            skip_count += 1
            continue

        # Create the publication's author mapping on first sight, then
        # register this author under it with a fresh attribute dictionary.
        authors_of_pub = pub_dict.setdefault(pub_id, {})
        authors_of_pub[str(author_row["Author ID"])] = {
            'Name': author_row["Full Name"],
            'Country': author_row["Country"],
            'AuthorPubCount': author_row["Total Publications on Coronavirus"],
        }

# One row per publication: its ID and the nested dictionary of its authors.
pub_df = pd.DataFrame({
    'Publication': list(pub_dict.keys()),
    'Authors': list(pub_dict.values()),
})

# Uncomment to persist the publication table:
#pub_df.to_csv('data/pub_df.csv')

display(pub_df)

Unnamed: 0,Publication,Authors
0,32950740,"{'88687508': {'Name': 'Antinori, A', 'Country'..."
1,33060148,"{'88673946': {'Name': 'Vivaldi, Caterina', 'Co..."
2,32876697,"{'88666710': {'Name': 'van de Veerdonk, Frank'..."
3,33085120,"{'88642229': {'Name': 'Gupta, Kavita', 'Countr..."
4,33026775,"{'88639284': {'Name': 'Li, Philip', 'Country':..."
...,...,...
66396,33121609,"{'88694864': {'Name': 'Mark, Anita', 'Country'..."
66397,32997860,"{'88695250': {'Name': 'Magoon, Rohan', 'Countr..."
66398,33012159,"{'88695250': {'Name': 'Magoon, Rohan', 'Countr..."
66399,33128606,"{'88695250': {'Name': 'Magoon, Rohan', 'Countr..."


### Iterate through the publication dataframe to create an edge-list dataframe of coauthor-to-coauthor collaboration
(and create the final node dataframe of authors in the network)

In [63]:
# Node table source: author ID -> {'Name', 'Country', 'Author Publication Count'}.
author_dict = {}
# Edge table source: one record per ordered pair of distinct authors on a publication.
author_list = []

# Only the 'Authors' column is needed, so iterate it directly instead of
# building a full row object per publication with iterrows().
for coauthor_dict in pub_df['Authors']:
    # Team size of the publication; stored on every edge as 'Publication Coauthors'.
    coauthortotal = len(coauthor_dict)

    # Publications with a single author yield no edges and add no nodes.
    if coauthortotal < 2:
        continue

    for coauthor_id, coauthor_info in coauthor_dict.items():
        name = coauthor_info.get('Name')
        country = coauthor_info.get('Country')
        pubtotal = coauthor_info.get('AuthorPubCount')

        # Register the node once per team member. Every member of a
        # multi-author publication has at least one partner, so this matches
        # writing the same entry again for each pair.
        author_dict[coauthor_id] = {'Name': name, 'Country': country,
                                    'Author Publication Count': pubtotal}

        # Pair this member with every other member of the team (never with
        # themselves). Both directions of each pair are emitted because the
        # outer loop visits every member in turn.
        # NOTE(review): 'Name', 'Country' and 'Author Publication Count' on an
        # edge describe the author in 'Coauthor ID', not the one in
        # 'Author ID'. The column layout is kept as-is so the written CSV does
        # not change -- confirm that downstream consumers expect this.
        author_list.extend(
            {'Author ID': author_id, 'Name': name,
             'Country': country, 'Author Publication Count': pubtotal,
             'Publication Coauthors': coauthortotal, 'Coauthor ID': coauthor_id}
            for author_id in coauthor_dict
            if author_id != coauthor_id
        )

# NOTE(review): this rebinds `author_df`, replacing the raw author table loaded
# at the top of the notebook with the node table. The publication-building cell
# reads the "Covid-19 Publications" column from `author_df`, so it cannot be
# re-run after this cell without reloading the CSV first. The name is kept
# because the following cell displays `author_df`.
author_df = pd.DataFrame.from_dict(author_dict, orient='index')
author_df.to_csv('data/author_node_list.csv', index=True)

coauthor_edgelist_df = pd.DataFrame(author_list)
coauthor_edgelist_df.to_csv('data/coauthor_edge_list.csv', index=False)

display(coauthor_edgelist_df)

Unnamed: 0,Author ID,Name,Country,Author Publication Count,Publication Coauthors,Coauthor ID
0,88660066,"Antinori, A",Italy,1,13,88687508
1,88693999,"Antinori, A",Italy,1,13,88687508
2,88696885,"Antinori, A",Italy,1,13,88687508
3,88696886,"Antinori, A",Italy,1,13,88687508
4,88693998,"Antinori, A",Italy,1,13,88687508
...,...,...,...,...,...,...
4615043,88694600,"Wijeratne, Tissa",Sri Lanka,3,2,88698076
4615044,88696152,"Banerjee, Manidipa",India,2,2,88697601
4615045,88697601,"Borkotoky, Subhomoi",India,2,2,88696152
4615046,88698378,"Sass, Nelson",Brazil,1,2,88698475


In [64]:
# Show the author node table that the edge-list cell assigned to `author_df`
# (indexed by author ID; columns Name, Country, Author Publication Count).
display(author_df)

Unnamed: 0,Name,Country,Author Publication Count
88687508,"Antinori, A",Italy,1
88660066,"Notari, S",Italy,1
88693999,"Cicalini, Stefania",Italy,2
88696885,"Agrati, Chiara",Italy,12
88696886,"Campioni, Paolo",Italy,2
...,...,...,...
88698404,"Richardson, Candice",Canada,1
88698369,"Phillips, Suzanne",Canada,1
88693867,"Patton, Megan",Canada,1
88698475,"Sass, Nelson",Brazil,1
