In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Loading the two outputs generated by GENIE3 and GRNBoost2

In [6]:
genie3_df    = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/GENIE3_Ranking_TF_TG.csv', sep=',')

##2. Printing dimensions and top ranked genes of each produced output

### i. GENIE3

In [None]:
genie3_df = genie3_df.iloc[:, 3:6]
score_col = genie3_df.pop('score')
genie3_df.insert(2, 'score', round(score_col,5))

In [None]:
genie3_df.head(10)

Unnamed: 0,TF,TG,score
0,dmrt93B,CG14932,0.85471
1,dmrt93B,Gr63a,0.84326
2,dmrt93B,CG14955,0.83637
3,dmrt93B,Hsp70Aa,0.83425
4,dmrt93B,Hsp70Bc,0.83026
5,GATAe,PH4alphaPV,0.82495
6,fkh,MRE23,0.82274
7,shn,CG14471,0.81887
8,dmrt93B,Hsp70Bbb,0.81283
9,CG13510,CG13511,0.79432


In [None]:
genie3_df = genie3_df[(genie3_df != 0).all(1)]

In [None]:
genie3_df_filtered = genie3_df[(genie3_df[['score']]>=0.50).all(axis=1)]

In [None]:
print(len(genie3_df_filtered))
print(genie3_df_filtered['TF'].value_counts())

210
TF
GATAe      45
dmrt93B    44
CG30431    18
srp         9
trh         9
           ..
CG13894     1
cnc         1
maf-S       1
Rel         1
Cf2         1
Name: count, Length: 62, dtype: int64


In [None]:
len(genie3_df_filtered)

210

In [None]:
genie3_df_filtered.head(15)

Unnamed: 0,TF,TG,score
0,dmrt93B,CG14932,0.85471
1,dmrt93B,Gr63a,0.84326
2,dmrt93B,CG14955,0.83637
3,dmrt93B,Hsp70Aa,0.83425
4,dmrt93B,Hsp70Bc,0.83026
5,GATAe,PH4alphaPV,0.82495
6,fkh,MRE23,0.82274
7,shn,CG14471,0.81887
8,dmrt93B,Hsp70Bbb,0.81283
9,CG13510,CG13511,0.79432


In [None]:
genie3_df_filtered.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/genie3_GRN_40.csv')

In [None]:
genie3_df_filtered.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/genie3_GRN_50.csv')

### Transcription Factors with most regulators

i. within

### ii. GRNBOOST2

In [7]:
GRNBoost2_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/GRNBoost2_network.tsv', sep='\t', names=['TF', 'TG', 'Score'], header = None)

In [None]:
GRNBoost2_df['Score'] = GRNBoost2_df['Score']/100.0

In [None]:
GRNBoost2_df.head(10)

Unnamed: 0,TF,TG,Score
0,TFAM,exo70,0.833288
1,Tsf2,CG9634,0.67176
2,Atf-2,CG32815,0.663
3,her,CG8613,0.647054
4,cnc,CG15099,0.612437
5,ems,Hsp60,0.612328
6,CG1024,vir,0.611803
7,CG12219,wdn,0.610983
8,MBD-like,CG9797,0.600319
9,B-H2,so,0.598897


In [None]:
grn_filtered = GRNBoost2_df[(GRNBoost2_df[['Score']]>=0.50).all(axis=1)]

In [None]:
print(len(grn_filtered))
print(grn_filtered['TF'].value_counts())

138
TF
TFAM       5
CG2116     5
CG11902    4
CG10979    3
crol       3
          ..
CG32772    1
Dad        1
awd        1
CG13510    1
CG2199     1
Name: count, Length: 103, dtype: int64


In [None]:
grn_filtered

Unnamed: 0,TF,TG,Score
0,TFAM,exo70,0.833288
1,Tsf2,CG9634,0.671760
2,Atf-2,CG32815,0.663000
3,her,CG8613,0.647054
4,cnc,CG15099,0.612437
...,...,...,...
9813,l(2)37Cc,CG6020,0.250020
9814,Jarid2,SRPK,0.250017
9815,mld,CG42671,0.250017
9816,CG9215,Taf8,0.250013


In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

grn_filtered_count = grn_filtered.groupby(['TF'])['TF'].count().reset_index(name='count')
grn_filtered_count = grn_filtered_count.sort_values(by=['count'],ascending=False).reset_index()
grn_filtered_count.head()

Unnamed: 0,index,TF,count
0,23,CG2116,5
1,63,TFAM,5
2,12,CG11902,4
3,79,crol,3
4,10,CG10979,3


In [None]:
fig = px.pie(grn_filtered_count[:5], values='count', names='TF', title='Most connected Transcription Factors')
fig.show()

## 3. Gene ranking dataset created from overlapping links between GENIE3 and GRNBoost2 outputs

- The resulting dataset will be termed as 'BioGRN'

In [8]:
grn_final = pd.merge(genie3_df, GRNBoost2_df, how='inner', on=['TF','TG'])

In [None]:
grn_final_filtered = pd.merge(genie3_df_filtered, grn_filtered, how='inner', on=['TF','TG'])

In [9]:
grn_final = grn_final.rename(columns={"score": "genie3_score", "Score": "GRNBoost2_score"})

In [10]:
grn_final.head(10)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,rank,genie3_score,TF,TG,GRNBoost2_score
0,dmrt93B,CG14932,1.0,0.854713,dmrt93B,CG14932,35.982913
1,dmrt93B,Gr63a,2.0,0.843259,dmrt93B,Gr63a,42.867362
2,dmrt93B,CG14955,3.0,0.836367,dmrt93B,CG14955,26.452926
3,dmrt93B,Hsp70Aa,4.0,0.834249,dmrt93B,Hsp70Aa,35.178831
4,dmrt93B,Hsp70Bc,5.0,0.830264,dmrt93B,Hsp70Bc,34.699198
5,GATAe,PH4alphaPV,6.0,0.824947,GATAe,PH4alphaPV,24.080847
6,fkh,MRE23,7.0,0.822737,fkh,MRE23,40.866506
7,shn,CG14471,8.0,0.81887,shn,CG14471,41.174992
8,dmrt93B,Hsp70Bbb,9.0,0.812825,dmrt93B,Hsp70Bbb,24.960379
9,CG13510,CG13511,10.0,0.794322,CG13510,CG13511,55.063971


In [11]:
grn_final['Average Score'] = (grn_final['genie3_score']+grn_final['GRNBoost2_score'])/2
grn_final['Score difference'] = abs(grn_final['genie3_score']-grn_final['GRNBoost2_score'])

In [12]:
grn_final.head(100)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,rank,genie3_score,TF,TG,GRNBoost2_score,Average Score,Score difference
0,dmrt93B,CG14932,1.0,0.854713,dmrt93B,CG14932,35.982913,18.418813,35.128199
1,dmrt93B,Gr63a,2.0,0.843259,dmrt93B,Gr63a,42.867362,21.855310,42.024103
2,dmrt93B,CG14955,3.0,0.836367,dmrt93B,CG14955,26.452926,13.644647,25.616559
3,dmrt93B,Hsp70Aa,4.0,0.834249,dmrt93B,Hsp70Aa,35.178831,18.006540,34.344581
4,dmrt93B,Hsp70Bc,5.0,0.830264,dmrt93B,Hsp70Bc,34.699198,17.764731,33.868934
...,...,...,...,...,...,...,...,...,...
95,dmrt93B,Hsp68,96.0,0.593958,dmrt93B,Hsp68,30.602970,15.598464,30.009012
96,sage,CG43084,97.0,0.593677,sage,CG43084,18.570494,9.582086,17.976817
97,GATAe,CG15818,98.0,0.593651,GATAe,CG15818,20.187727,10.390689,19.594076
98,tx,CG9194,99.0,0.592643,tx,CG9194,11.156582,5.874612,10.563939


In [None]:
grn_final_sort = grn_final.sort_values(by=['Average Score'],ascending=False).reset_index()
grn_final_sort.head(10)

Unnamed: 0,index,TF,TG,genie3_score,GRNBoost2_score,Average Score,Score difference
0,9,CG13510,CG13511,0.79432,0.55064,0.67248,0.24368
1,22,dmrt93B,CG13138,0.72797,0.588344,0.658157,0.139626
2,19,salm,salr,0.74189,0.540694,0.641292,0.201196
3,1,dmrt93B,Gr63a,0.84326,0.428674,0.635967,0.414586
4,6,fkh,MRE23,0.82274,0.408665,0.615703,0.414075
5,7,shn,CG14471,0.81887,0.41175,0.61531,0.40712
6,0,dmrt93B,CG14932,0.85471,0.359829,0.60727,0.494881
7,36,Jra,CG14207,0.6848,0.524429,0.604615,0.160371
8,10,srp,CG30046,0.78697,0.399444,0.593207,0.387526
9,3,dmrt93B,Hsp70Aa,0.83425,0.351788,0.593019,0.482462


In [None]:
grn_final_filtered = grn_final[(grn_final[['Average Score']]>=0.50).all(axis=1)]

In [None]:
grn_final_filtered

Unnamed: 0,TF,TG,genie3_score,GRNBoost2_score,Average Score,Score difference
0,dmrt93B,CG14932,0.85471,0.359829,0.60727,0.24744
1,dmrt93B,Gr63a,0.84326,0.428674,0.635967,0.207293
2,dmrt93B,CG14955,0.83637,0.264529,0.55045,0.28592
3,dmrt93B,Hsp70Aa,0.83425,0.351788,0.593019,0.241231
4,dmrt93B,Hsp70Bc,0.83026,0.346992,0.588626,0.241634
5,GATAe,PH4alphaPV,0.82495,0.240808,0.532879,0.292071
6,fkh,MRE23,0.82274,0.408665,0.615703,0.207037
7,shn,CG14471,0.81887,0.41175,0.61531,0.20356
8,dmrt93B,Hsp70Bbb,0.81283,0.249604,0.531217,0.281613
9,CG13510,CG13511,0.79432,0.55064,0.67248,0.12184


In [None]:
grn_final_filtered.head()

Unnamed: 0,TF,TG,score,Score
0,dmrt93B,Gr63a,0.84326,42.867362
1,fkh,MRE23,0.82274,40.866506
2,shn,CG14471,0.81887,41.174992
3,CG13510,CG13511,0.79432,55.063971
4,salm,salr,0.74189,54.069407


In [None]:
grn_final.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/BioGRN_All.csv')

In [None]:
grn_filtered.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/GRNBoost_50.csv')

In [None]:
grn_final_filtered.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/grn_final_50.csv')

In [None]:
print(len(grn_final_filtered))
print(grn_final_filtered['TF'].value_counts())

30
TF
dmrt93B    12
GATAe       3
CG13510     2
fkh         1
shn         1
srp         1
salm        1
kn          1
sage        1
bap         1
CG30269     1
Jra         1
tll         1
CG3281      1
Zif         1
trh         1
Name: count, dtype: int64


### Loading TFLink dataset and comparing it with the bioGRN dataset for rough accuracy

- Accuracy: The number of correctly predicted interaction links between each gene
 A caveat of this comparison is that many predictions that dont exist in TFLink may or may not be valid, undiscovered links as well. More literature an research is definitely needed to study regulatory gene-gene interactions.

In [13]:
bioGRN    = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/BioGRN_All.csv', sep=',')
tfLink_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/TFLINK_Drosophila.tsv', sep='\t')

In [14]:
tfLink_df.head()

Unnamed: 0,UniprotID.TF,UniprotID.Target,NCBI.GeneID.TF,NCBI.GeneID.Target,Name.TF,Name.Target,Detection.method,PubmedID,Organism,Source.database,Small-scale.evidence,TF.TFLink.ortho,TF.nonTFLink.ortho,Target.TFLink.ortho,Target.nonTFLink.ortho
0,P02836,P02836,36240,36240,en,en,DNase I footprinting;chromatin immunoprecipita...,2895896;26578589;20965965;27924024;2573829,Drosophila melanogaster,GTRD;ORegAnno;REDfly,Yes,-,-,-,-
1,O61735,P49021,38872,33571,Clk,tim,chromatin immunoprecipitation assay;experiment...,9616122;20965965;27924024,Drosophila melanogaster,GTRD;REDfly,Yes,Dr:Q5RIV1;Rn:F1LRL5,-,-,-
2,Q9VN10,P39770,40549,34569,hkb,salm,DNase I footprinting;inferred by curator,20965965;9376314;26578589,Drosophila melanogaster,ORegAnno;REDfly,Yes,-,-,Hs:Q9BXA9,Dr:A0A1D5NSE4;Mm:A0A5F8MPC9
3,Q9XTP7,A1Z877,41265,36089,jumu,Ndg,electrophoretic mobility shift assay,20965965;22378636,Drosophila melanogaster,REDfly,Yes,-,-,Hs:Q14112;Hs:P14543;Ce:Q93791;Dr:A0A0G2L836;Mm...,Dr:X1WC52;Dr:Q5RI93
4,P02836,P07713,36240,33432,en,dpp,DNase I footprinting;chromatin immunoprecipita...,7713429;20965965;26578589;27924024,Drosophila melanogaster,GTRD;ORegAnno;REDfly,Yes,-,-,Dr:O93369,-


In [16]:
tfLink_df2 = tfLink_df.iloc[:,4:6]

In [17]:
tfLink_df2.shape

(368629, 2)

In [18]:
bioGRN.shape

(534843, 7)

In [19]:
tfLink_df2 = tfLink_df2.rename(columns={"Name.TF": "TF", "Name.Target": "TG"})

In [20]:
tfLink_df2.head()

Unnamed: 0,TF,TG
0,en,en
1,Clk,tim
2,hkb,salm
3,jumu,Ndg
4,en,dpp


In [21]:
'trh' in tfLink_df2['TF']

False

In [22]:
grn_validate = pd.merge(bioGRN, tfLink_df2, how='inner', on=['TF','TG'])
grn_validate = grn_validate.iloc[:, 1:]

In [23]:
bioGRN['validation']= bioGRN[['TF','TG']].apply(tuple, axis=1)\
                  .isin(tfLink_df2[['TF','TG']].apply(tuple, axis=1))
bioGRN.head()

Unnamed: 0.1,Unnamed: 0,TF,TG,genie3_score,GRNBoost2_score,Average Score,Score difference,validation
0,0,dmrt93B,CG14932,0.85471,0.359829,0.60727,0.494881,False
1,1,dmrt93B,Gr63a,0.84326,0.428674,0.635967,0.414586,False
2,2,dmrt93B,CG14955,0.83637,0.264529,0.55045,0.571841,False
3,3,dmrt93B,Hsp70Aa,0.83425,0.351788,0.593019,0.482462,False
4,4,dmrt93B,Hsp70Bc,0.83026,0.346992,0.588626,0.483268,False


In [26]:
grn_validate

Unnamed: 0,TF,TG,genie3_score,GRNBoost2_score,Average Score,Score difference
0,trh,btl,0.53280,0.179085,0.355942,0.353715
1,srp,eater,0.42477,0.341123,0.382946,0.083647
2,grh,ed,0.40212,0.390755,0.396437,0.011365
3,grh,Cad86C,0.37534,0.263124,0.319232,0.112216
4,ato,sens,0.35251,0.325354,0.338932,0.027156
...,...,...,...,...,...,...
3698,pho,pnr,0.00002,0.000213,0.000116,0.000193
3699,grh,yellow-e2,0.00002,0.002221,0.001121,0.002201
3700,dl,Ubx,0.00002,0.000855,0.000438,0.000835
3701,pho,MED7,0.00001,0.000698,0.000354,0.000688


In [27]:
grn_val_sort = grn_validate.sort_values(by=['Average Score'],ascending=False).reset_index()
grn_val_sort.head(10)

Unnamed: 0,index,TF,TG,genie3_score,GRNBoost2_score,Average Score,Score difference
0,2,grh,ed,0.40212,0.390755,0.396437,0.011365
1,9,Dref,mts,0.31779,0.450414,0.384102,0.132624
2,1,srp,eater,0.42477,0.341123,0.382946,0.083647
3,6,ab,ko,0.3426,0.418111,0.380356,0.075511
4,0,trh,btl,0.5328,0.179085,0.355942,0.353715
5,15,pho,Tsp96F,0.27011,0.440703,0.355407,0.170593
6,5,grh,uif,0.34386,0.365074,0.354467,0.021214
7,4,ato,sens,0.35251,0.325354,0.338932,0.027156
8,7,so,betaTub60D,0.33838,0.338161,0.338271,0.000219
9,48,so,Mmp2,0.15982,0.481669,0.320744,0.321849


In [28]:
len(grn_val_sort)

3703

In [None]:
## GENIE3 and TFLINK validation
genie3_val = pd.merge(genie3_df, tfLink_df2, how='inner', on=['TF','TG'])

In [None]:
genie3_val_sort = genie3_val.sort_values(by=['score'],ascending=False).reset_index()
genie3_val_sort.head(10)

Unnamed: 0,index,TF,TG,score
0,0,trh,btl,0.5328
1,1,srp,eater,0.42477
2,2,grh,ed,0.40212
3,3,grh,Cad86C,0.37534
4,4,ato,sens,0.35251
5,5,grh,uif,0.34386
6,6,ab,ko,0.3426
7,7,so,betaTub60D,0.33838
8,8,twi,meso18E,0.32094
9,9,Dref,mts,0.31779


In [None]:
## GRNBoost2 TFLink Validation
grnBoost2_val = pd.merge(GRNBoost2_df, tfLink_df2, how='inner', on=['TF','TG'])
grnBoost2_val_sort = grnBoost2_val.sort_values(by=['Score'],ascending=False).reset_index()

In [None]:
grnBoost2_val_sort.head(10)

Unnamed: 0,index,TF,TG,Score
0,0,so,fw,0.502905
1,1,so,Mmp2,0.481669
2,2,Cp190,Su(z)12,0.465833
3,3,chn,px,0.451328
4,4,Dref,mts,0.450414
5,5,pho,Tsp96F,0.440703
6,6,Dref,Tap42,0.438787
7,7,Dref,sina,0.421007
8,8,Cp190,ssx,0.420298
9,9,ab,ko,0.418111


### Comparison between TFLink and BioGRN datasets

In [None]:
print('BioGRN Validated TFs: ', len(grn_val_sort['TF'].value_counts()))
print('BioGRN Validated Links: ', len(grn_val_sort))
print('Total BioGRN generated predictions: ', len(bioGRN))

biogrn_accuracy = round(len(grn_val_sort)/len(bioGRN) * 100, 2)
print('Validation Accuracy: ', biogrn_accuracy, ' %')

BioGRN Validated TFs:  131
BioGRN Validated Links:  3703
Total BioGRN generated predictions:  534843
Validation Accuracy:  0.69  %


In [None]:
print('GENIE3 Validated TFs: ', len(genie3_val_sort['TF'].value_counts()))
print('GENIE3 Validated Links: ', len(genie3_val_sort))
print('Total GENIE3 generated predictions: ', len(genie3_df))

genie3_accuracy = round(len(genie3_val_sort)/len(genie3_df) * 100, 2)
print('Validation Accuracy: ', genie3_accuracy, ' %')

GENIE3 Validated TFs:  132
GENIE3 Validated Links:  3846
Total GENIE3 generated predictions:  579908
Validation Accuracy:  0.66  %


In [None]:
print('GRNBoost2 Validated TFs: ', len(grnBoost2_val_sort['TF'].value_counts()))
print('GRNBoost2 Validated Links: ', len(grnBoost2_val_sort))
print('Total GRNBoost2 generated predictions: ', len(GRNBoost2_df))

grnBoost2_accuracy = round(len(grnBoost2_val_sort)/len(GRNBoost2_df) * 100, 2)
print('Validation Accuracy: ', grnBoost2_accuracy, ' %')

GRNBoost2 Validated TFs:  211
GRNBoost2 Validated Links:  28957
Total GRNBoost2 generated predictions:  4879738
Validation Accuracy:  0.59  %


In [None]:
# grn_val_sort.iloc[0:21,:].to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/grn_val_sort_20.csv')

In [31]:
# grn_val_sort.to_excel('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/validated_regulatory_links.xlsx')

In [None]:
# genie3_val_sort.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/genie3_val.csv')
# grnBoost2_val_sort.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/grnBoost2_val.csv')

In [None]:
# genie3_val_sort.iloc[0:21,:].to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/genie3_val_20.csv')
# grnBoost2_val_sort.iloc[0:21,:].to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/grnBoost2_val_20.csv')

In [None]:
# bioGRN.to_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/bioGRN_validate.csv')

In [None]:
# bioGRN    = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/BioGRN_All.csv', sep=',')
# tfLink_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Final_Datasets/Outputs/TFLINK_Drosophila.tsv', sep='\t')
# tfLink_df2 = tfLink_df.iloc[:,4:6]