In [1]:
import pandas as pd
import numpy as np

In [2]:
cancer = pd.read_csv('../data/raw/string_cancer.tsv', sep='\t')
pre = pd.read_csv('../data/raw/string_preeclampsia.tsv', sep='\t')


rename_dict = {'#node1': 'node1', 'node1_string_id':'node1_id', 'node2_string_id':'node2_id',}
cancer = cancer.rename(columns=rename_dict)
pre = pre.rename(columns=rename_dict)

In [3]:
# load a xls file as pandas and get a specific tab
xls = pd.ExcelFile('../data/raw/proteomica.xlsx')
xls_cancer = pd.read_excel(xls, 'Cancer-Rat')
xls_pre = pd.read_excel(xls, 'Preeclampsia-Rat')
xls_cancer

Unnamed: 0,Query protein,Gene name,FC
0,Q6AYZ1,3,-04403701467
1,F1LZK3,10090,1206086919
2,Q6AY56,A1bg,-07781455102
3,Q4FZY3,A2m,-07111270661
4,G3V7Q7,Abcb1b,-05573217453
...,...,...,...
173,D4ABI6,Uchl4,-06349663524
174,Q91Y81,Vapa,-08387873963
175,D4A7L6,Vat1,-05113878815
176,A0A0G2K1E2,Vcl,-03677909569


In [4]:
xls_cancer.columns, xls_pre.columns

(Index(['Query protein', 'Gene name', 'FC'], dtype='object'),
 Index(['Protein (UNIPROT)', 'protein_firstname', 'Protein description',
        'Gene name', 'P value', 'Regulated', 'Fold change', 'Obs.'],
       dtype='object'))

In [5]:
xls_cancer = xls_cancer.rename(columns={'Gene name': 'gene', 'FC': 'fc'})
xls_cancer = xls_cancer.drop('Query protein', axis=1)
xls_cancer['fc'] = xls_cancer['fc'].str.replace(',', '.').astype(float)
xls_cancer['regulated'] = np.where(xls_cancer['fc'] > 0, 'up', 'down')
xls_cancer['cancer'] = 1
xls_cancer.columns

Index(['gene', 'fc', 'regulated', 'cancer'], dtype='object')

In [6]:
xls_pre = xls_pre.drop(['Protein (UNIPROT)', 'protein_firstname', 'Protein description','P value','Obs.'], axis=1)
xls_pre = xls_pre.rename(columns={'Gene name': 'gene', 'Fold change': 'fc', 'Regulated': 'regulated'})
xls_pre['preeclampsia'] = 1
xls_pre['regulated'] = xls_pre['regulated'].str.lower()
# xls_pre['fc'] = xls_pre['fc'].str.replace(',', '.').astype(float)
xls_pre.columns

Index(['gene', 'regulated', 'fc', 'preeclampsia'], dtype='object')

Number of unique proteins in node1 and node2

In [7]:
cancer.node1.nunique(), cancer.node2.nunique()

(79, 80)

Genes that are present only in node1 or node2

In [8]:
diff = set(cancer.node1.unique()).symmetric_difference(set(cancer.node2.unique()))
len(diff)

63

Number of unique proteins(gene?)

In [9]:
len(set(cancer.node1.unique().tolist() + cancer.node2.unique().tolist()))

111

In [10]:
len(set(pre.node1.unique().tolist() + pre.node2.unique().tolist()))

103

In [11]:
f_df = xls_cancer.merge(xls_pre, on='gene', suffixes=('_cancer', '_pre'), how='outer')
f_df.cancer = f_df.cancer.fillna(0)
f_df.preeclampsia = f_df.preeclampsia.fillna(0)
f_df.to_csv('../data/processed/genes.csv', index=False)
f_df

Unnamed: 0,gene,fc_cancer,regulated_cancer,cancer,regulated_pre,fc_pre,preeclampsia
0,3,-0.440370,down,1.0,,,0.0
1,10090,1.206087,up,1.0,,,0.0
2,A1bg,-0.778146,down,1.0,,,0.0
3,A2m,-0.711127,down,1.0,,,0.0
4,Abcb1b,-0.557322,down,1.0,,,0.0
...,...,...,...,...,...,...,...
487,Casp6,,,0.0,up,0.559243,1.0
488,Gar1,,,0.0,up,0.580970,1.0
489,,,,0.0,up,0.763275,1.0
490,Pappa2,,,0.0,up,0.811140,1.0


In [12]:
df_no_nans = f_df.dropna()
print(len(df_no_nans))
df_no_nans.sort_values('regulated_cancer', ascending=False)

12


Unnamed: 0,gene,fc_cancer,regulated_cancer,cancer,regulated_pre,fc_pre,preeclampsia
10,Actn1,0.240738,up,1.0,up,0.263762,1.0
53,Ehd4,0.576265,up,1.0,down,-0.293523,1.0
77,Hpx,0.44436,up,1.0,up,0.555551,1.0
171,Uba6,0.192763,up,1.0,down,-0.34656,1.0
49,Dnajb11,-0.428988,down,1.0,up,0.442867,1.0
52,Ehd1,-0.450791,down,1.0,down,-0.294619,1.0
61,F2,-0.553859,down,1.0,up,0.293442,1.0
90,Lgals1,-0.287505,down,1.0,up,0.315815,1.0
119,Prdx5,-0.360268,down,1.0,down,-0.353523,1.0
143,Ruvbl2,-0.296228,down,1.0,up,0.217608,1.0


In [13]:
cancer.to_csv('../data/processed/string_cancer.csv', index=False)
pre.to_csv('../data/processed/string_pre.csv', index=False)