# Network Analysis of Publications on Studies of Parkinson Disease

In [2]:
import pandas as pd
import ast
from ast import literal_eval
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib import rcParams
warnings.filterwarnings('ignore')

In [3]:
def read_data(path, sep):
    """Read a delimited file and parse its list-valued columns.

    The 'AA' and 'W' columns are stored in the file as Python-literal strings;
    every cell of these columns is converted with ``ast.literal_eval``.

    Parameters
    ----------
    path : str or path-like
        Location of the file, passed straight to ``pd.read_csv``.
    sep : str
        Field delimiter used in the file.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with 'AA' and 'W' holding the parsed Python objects.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid literal.
    """
    df = pd.read_csv(path, sep=sep)
    for column in ('AA', 'W'):
        # Plain column assignment replaces the column outright. Writing through
        # `df.loc[:, column] = ...` instead targets the existing column, which
        # newer pandas versions may reject when the parsed objects do not fit
        # the column's original (string) dtype.
        df[column] = df[column].apply(ast.literal_eval)
    return df

In [4]:
# Load the ';'-separated source table; read_data also parses the 'AA' and 'W' columns.
with_doi_annotation_journals = read_data('with_doi_annotation_journals.csv', sep = ';')

In [5]:
# Missing 'RId' values become the literal '[]', then every entry is parsed from
# its string form into a Python object.
# The column is reassigned rather than calling fillna(..., inplace=True) on
# `df['RId']`: that chained in-place call acts on a temporary under pandas
# Copy-on-Write and would leave the frame unchanged, so literal_eval would then
# be fed NaN.
with_doi_annotation_journals['RId'] = (
    with_doi_annotation_journals['RId']
    .fillna('[]')
    .apply(ast.literal_eval)
)

## Preprocessing

A more detailed analysis found 115 papers about Danon disease and one of its symptoms, Wolff-Parkinson-White (WPW) syndrome. These articles were excluded from the data.

In [6]:
def _mentions_wpw_or_danon(words):
    """Return True if `words` contains a Danon / Wolff-Parkinson-White marker.

    Only the `in` operator is used, so `words` may be any container that
    supports it (a list gives exact-token matching, a string gives substring
    matching).
    """
    return (('wolff' in words and 'parkinson' in words)
            or 'danon' in words
            or 'wpw' in words
            or 'wpws' in words)


# Positional indices of the papers to exclude. The 'AW' and 'W' columns are
# walked once each instead of building a full row Series for every lookup.
danon_ind = [
    pos
    for pos, (aw, w) in enumerate(zip(with_doi_annotation_journals['AW'],
                                      with_doi_annotation_journals['W']))
    if _mentions_wpw_or_danon(aw) or _mentions_wpw_or_danon(w)
]

# Positions of the remaining papers; sorted so the original row order is
# guaranteed rather than left to set iteration order.
ind = sorted(set(range(with_doi_annotation_journals.shape[0])) - set(danon_ind))

In [31]:
# `ind` holds the positions of the papers kept after the WPW/Danon filter, so its
# length is the number of retained articles. `dataset` is only assigned in a
# later cell, so reading it here would fail on a fresh top-to-bottom run.
print('The number of articles without WPW-syndrome:', len(ind))

The number of articles without WPW-syndrome: 45825


In [7]:
# Working set: every paper that was not flagged by the Danon / WPW filter.
dataset = with_doi_annotation_journals.iloc[ind]
dataset.to_csv('dataset.csv', sep=';')

# The flagged papers are written to their own file.
with_doi_annotation_journals.iloc[danon_ind].to_csv('wpw.csv', sep=';')

## Network

In [8]:
# Citation edge list: one (paper Id, referenced Id) row per entry of a paper's
# 'RId' list; papers with an empty list contribute no rows.
# The two columns are iterated directly instead of building a row Series per
# lookup, and the column names are set at construction so 'ID1'/'ID2' exist
# even when no edge is produced.
net = pd.DataFrame(
    [
        (paper_id, ref)
        for paper_id, refs in zip(dataset['Id'], dataset['RId'])
        for ref in refs
    ],
    columns=['ID1', 'ID2'],
)

In [9]:
# Display the raw edge list: ID1 is the paper, ID2 one of the ids in its 'RId' list.
net

Unnamed: 0,ID1,ID2
0,2754967293,2159011576
1,2754967293,2751884637
2,2754967293,2125065061
3,2754967293,2168630917
4,2754967293,2614986146
...,...,...
2488338,2359940301,2083278876
2488339,2359940301,2025616544
2488340,2359940301,2049088811
2488341,2359940301,1978301967


In [15]:
# Persist the unfiltered edge list (';'-separated, without the index column).
net.to_csv('net.csv', sep=';', index=False)

In [39]:
# A vertex is any id that occurs at either end of an edge.
print('The number of vertices:',
      len(set(net['ID1'].values).union(net['ID2'].values)))

The number of vertices: 780796


Excluding citations (edges) that point to papers outside our final set

In [40]:
ids = pd.unique(dataset['Id'])  # all Id
# Retain only the edges whose target (ID2) is itself a paper of `dataset`.
# The copy is taken after filtering: copying the full edge list first is wasted
# work, and copying the result makes `excluded_net` an independent frame that
# can be modified later without touching `net`.
excluded_net = net.loc[net['ID2'].isin(ids)].copy()

In [41]:
# Renumber the surviving edges 0..n-1 (the old index is discarded), then show them.
excluded_net = excluded_net.reset_index(drop=True)
excluded_net

Unnamed: 0,ID1,ID2
0,2754967293,2901519529
1,2754967293,2339791932
2,2112455323,1920030402
3,2177834950,2123627348
4,2558041282,1934236512
...,...,...
312039,2895767795,1967224666
312040,2895767795,2274550138
312041,2895767795,2617488731
312042,2619593042,2545724250


In [42]:
# Vertices of the filtered network: every id found at either end of a retained edge.
vert_filtered = set(excluded_net['ID1'].values).union(excluded_net['ID2'].values)
print('The number of vertices:', len(vert_filtered))

The number of vertices: 39825


The publication year of the citing paper (ID1) must not be earlier than the publication year of the cited paper (ID2). Edges that violate this condition were also deleted from the network.

In [43]:
# Attach the publication year of both endpoints to every edge:
# Y1 is the year ('Y') of the paper ID1, Y2 the year of the paper ID2.
net_year = (
    excluded_net
    .merge(dataset[['Id', 'Y']], how='left', left_on='ID1', right_on='Id')
    [['ID1', 'ID2', 'Y']]
    .rename(columns={'Y': 'Y1'})
    .merge(dataset[['Id', 'Y']], how='left', left_on='ID2', right_on='Id')
    [['ID1', 'ID2', 'Y1', 'Y']]
    .rename(columns={'Y': 'Y2'})
)

In [44]:
# Display the edges together with the years of both endpoints (Y1 for ID1, Y2 for ID2).
net_year

Unnamed: 0,ID1,ID2,Y1,Y2
0,2754967293,2901519529,2017,2018
1,2754967293,2339791932,2017,2016
2,2112455323,1920030402,2015,2015
3,2177834950,2123627348,2015,2015
4,2558041282,1934236512,2016,2015
...,...,...,...,...
312039,2895767795,1967224666,2019,2015
312040,2895767795,2274550138,2019,2015
312041,2895767795,2617488731,2019,2017
312042,2619593042,2545724250,2017,2016


In [45]:
# Positions of the edges that go from the past to the future, i.e. where the
# year of ID1 (Y1) is smaller than the year of ID2 (Y2).
# The comparison runs on whole columns instead of performing two row lookups
# per edge; the result is still a plain list of integer positions.
drop_ind = np.flatnonzero((net_year['Y1'] < net_year['Y2']).to_numpy()).tolist()

In [46]:
# Number of edges flagged for removal (Y1 < Y2).
len(drop_ind)

1215

In [47]:
# Positions of the edges to keep (everything not listed in `drop_ind`).
# Sorted explicitly so the original edge order is guaranteed instead of
# depending on set iteration order.
ind = sorted(set(range(len(net_year))) - set(drop_ind))

In [48]:
# Edges from past to future that are dropped from the network; saved for reference
# together with a constant weight of 1 and the year ('Y') of the paper ID1.
# `assign` adds the weight column on a new frame instead of writing into a
# slice of `net_year` (which triggers SettingWithCopy behaviour).
edges_to_drop = (
    net_year.iloc[drop_ind][['ID1', 'ID2']]
    .assign(weight=1)
    .merge(dataset, how='left', left_on='ID1', right_on='Id')
    [['ID1', 'ID2', 'weight', 'Y']]
)
edges_to_drop.to_csv('edges_to_drop.csv', sep=';', index=False)

In [49]:
# Filtered net: the edges kept after removing those listed in `drop_ind`, with a
# constant weight of 1 and the year ('Y') of the paper ID1.
# `assign` adds the weight column on a new frame instead of writing into a
# slice of `net_year` (which triggers SettingWithCopy behaviour).
new_excluded_net = (
    net_year.iloc[ind][['ID1', 'ID2']]
    .assign(weight=1)
    .merge(dataset, how='left', left_on='ID1', right_on='Id')
    [['ID1', 'ID2', 'weight', 'Y']]
)
# Vertices that still occur at either end of a retained edge.
new_vert_filtered = set(new_excluded_net['ID1'].values) | set(new_excluded_net['ID2'].values)

In [50]:
# From here on the year-consistent network replaces the previous one.
excluded_net = new_excluded_net
vert_filtered = new_vert_filtered

# Persist the result: first the vertex list, then the edge list.
vertices_frame = pd.DataFrame(list(vert_filtered))
vertices_frame.to_csv('vert_filtered.csv', index=False)  # vertices
excluded_net.to_csv('excluded_net.csv', sep=';', index=False)  # net

In [51]:
# Information about vertices: the rows of `dataset` whose Id is a vertex of the network.
filtered_data = dataset.loc[dataset['Id'].isin(list(vert_filtered))]
filtered_data.to_csv('filtered_data.csv', sep=';', index=False)

In [52]:
# Display the final edge list (ID1, ID2, constant weight, year 'Y' of ID1).
excluded_net

Unnamed: 0,ID1,ID2,weight,Y
0,2754967293,2339791932,1,2017
1,2112455323,1920030402,1,2015
2,2177834950,2123627348,1,2015
3,2558041282,1934236512,1,2016
4,2558041282,1947901277,1,2016
...,...,...,...,...
310824,2895767795,1967224666,1,2019
310825,2895767795,2274550138,1,2019
310826,2895767795,2617488731,1,2019
310827,2619593042,2545724250,1,2017


In [53]:
# Vertex count of the network after the past-to-future edges were removed.
print('The number of vertices:', len(vert_filtered))

The number of vertices: 39811
