In [1]:
import numpy as np
import pandas as pd
import scipy

from IPython.display import clear_output

import sys
sys.path.append('../../../../Documents/GitHub/gustav/src/')

from gustav import ebi, ncbi, nlm, biogrid, nih
from gustav import publications
from gustav import github
from gustav import access_framework
from gustav import mapper

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../general/src/')
from manuscript import export
from manuscript import inout
from manuscript import datasets
from manuscript import tools

from sklearn.metrics import auc
from scipy.stats import fisher_exact
# Notebook-wide pandas display settings: three decimals of precision, no
# multi-line wrapping of wide frames, and at most 20 columns shown.
pd.set_option('display.precision', 3)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', 20)



In [2]:
# Taxon and gene-set flavor that select the reference gene list.
taxon = 9606
gene_flavor = 'ou'  # NOTE(review): meaning of 'ou' is defined by manuscript.datasets — confirm
ref_genes = datasets.reference_genes(taxon, gene_flavor)
# Gene annotation table; later cells read its 'gene_ncbi', 'type_of_gene'
# and 'nomenclature_status' columns.
gene_info = ncbi.gene_info(taxon, mode='unambiguous_ensembl')

  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)
  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)


In [3]:
# Reference publications for this taxon; used in the next cell to restrict
# gene2lit to known PubMed IDs.
ref_literature = datasets.reference_publications(taxon)

In [4]:
agg = []
literature_flavor = 'pubtator_title_or_abstract_in_any_gene2pubmed_paper'

# Gene-to-publication links, restricted to reference genes and reference
# publications.
gene2lit = datasets.reference_gene2lit(taxon, literature_flavor)
is_ref_gene = gene2lit['gene_ncbi'].isin(ref_genes)
is_ref_paper = gene2lit['pubmed_id'].isin(ref_literature)
gene2lit = gene2lit.loc[is_ref_gene & is_ref_paper].copy()

In [5]:
%%time
gene2pubmed = gene2lit #ncbi.gene2pubmed(taxon_ncbi=9606)
large_pubs = gene2pubmed['pubmed_id'].value_counts()[gene2pubmed['pubmed_id'].value_counts() >= 100].index.values
#gene2pubmed = gene2pubmed[~gene2pubmed['pubmed_id'].isin(large_pubs)]

gene2pubmed_dict = gene2pubmed[~gene2pubmed['pubmed_id'].isin(large_pubs)].value_counts('gene_ncbi').to_dict()

CPU times: total: 141 ms
Wall time: 137 ms


# EBI-GXA

In [6]:
%%time

taxon = 9606

gene_info = ncbi.gene_info(taxon, mode='unambiguous_ensembl')

ref_genes = gene_info[
    (gene_info['type_of_gene']=='protein-coding') &
    (gene_info['nomenclature_status']=='O')   # Carries an official gene symbol
]['gene_ncbi']

de = ebi.gxa(
    'de_{}_entrez'.format(taxon),
    columns=['comparison_key', 'gene_ncbi', 'p-value', 'log2foldchange']
)

de = de.drop_duplicates(subset=['comparison_key', 'gene_ncbi'], keep='first')

contrasts = ebi.gxa(
    'contrasts',
    columns=['comparison_key', 'experiment']
)

studies = ebi.gxa(
    'studies',
    columns=['experiment', 'qualifier', 'value']
)

study_types = studies[
    studies['qualifier'].str.contains('[AEExperimentType]', regex=False)
].drop_duplicates()
study_types['value'] = study_types['value'].str.strip(' ').str.strip('"')
study_types = study_types[['experiment', 'value']].rename(columns={'value': 'type'})
study_types = study_types.drop_duplicates()

study_types = study_types.drop_duplicates('experiment', keep=False)

study_types = study_types[
    study_types['type'].isin(
        [
           'transcription profiling by array',
           'RNA-seq of coding RNA' 
        ]
    )
].replace(
    {
       'transcription profiling by array': 'array',
       'RNA-seq of coding RNA': 'rna_seq' 
    }
)

main = pd.merge(
    de,
    pd.merge(
        contrasts,
        study_types
    )[['comparison_key', 'type']]
)

main = main[main['gene_ncbi'].isin(ref_genes)]

  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)


CPU times: total: 21.8 s
Wall time: 22.1 s


In [7]:
# Harmonise the different spellings of the PubMed qualifier into one label
# (this edits `studies` in place).
pubmed_qualifier_spellings = [
    'PubMed ID',
    'Pubmed ID',
    'pubmedID',
    'PubMed ID       ',
    '"Pubmed ID"',
    'PubMedID',
    'PubMed Id',
    'PubmedID',
    'PubMed ID ',
]
is_pubmed_qualifier = studies['qualifier'].isin(pubmed_qualifier_spellings)
studies.loc[is_pubmed_qualifier, ['qualifier']] = 'pubmed_id'

# Attach the PubMed ID of each experiment to its contrasts; contrasts without
# a PubMed row are kept (left join) with missing values.
pubmed_rows = studies[studies['qualifier'] == 'pubmed_id']
pubmed_ids_df = contrasts.merge(pubmed_rows, how='left').rename(columns={'value': 'pubmed_id'})

In [8]:
# total records: rows of pubmed_ids_df (contrasts) per PubMed ID, across all
# experiments; contrasts without a PubMed ID are not counted by value_counts
pubmed_ids_df['pubmed_id'].value_counts()

28892060    115
24885658     65
31806758     63
17394647     60
23180784     52
           ... 
26891258      1
23327667      1
17420462      1
22383093      1
18156220      1
Name: pubmed_id, Length: 2550, dtype: int64

In [9]:
# human experiments: same count, restricted to contrasts present in `main`
in_main_contrasts = pubmed_ids_df['comparison_key'].isin(np.unique(main['comparison_key']))
pubmed_ids_df.loc[in_main_contrasts, 'pubmed_id'].value_counts()

28892060    110
24885658     65
23935999     31
25314013     31
16478798     24
           ... 
25397880      1
25488971      1
23254957      1
21810943      1
21928115      1
Name: pubmed_id, Length: 976, dtype: int64

In [10]:
%%time
pubmed_ids_df = pubmed_ids_df[(pubmed_ids_df['comparison_key'].isin(np.unique(main['comparison_key']))) & 
              (pubmed_ids_df['pubmed_id'].isin(gene2pubmed['pubmed_id'].astype(str)))].dropna()\

pubmed_ids_df['pubmed_id'] = pubmed_ids_df['pubmed_id'].astype(int)

CPU times: total: 1.02 s
Wall time: 1.02 s


In [11]:
# genes in title/abstract: contrasts per PubMed ID after restricting to
# publications that occur in gene2pubmed
pubmed_ids_df['pubmed_id'].value_counts()

22833572    22
25968456    10
25475719    10
24971610     8
22970192     7
            ..
20179017     1
23922103     1
23610125     1
23267084     1
24086395     1
Name: pubmed_id, Length: 252, dtype: int64

In [12]:
%%time
fdr_thresh = 0.05
de_cond = (main['p-value'] < fdr_thresh)
new_main = pd.merge(main[de_cond], pubmed_ids_df[['comparison_key', 'pubmed_id']])

CPU times: total: 609 ms
Wall time: 618 ms


In [13]:
# hits in study: PubMed IDs that keep at least one row in new_main, i.e. at
# least one gene below the p-value cut-off in a linked contrast
new_main['pubmed_id'].drop_duplicates()

0          22991305
6560       23024189
6700       23707782
8316       23824327
9988       23871666
             ...   
1407653    24971610
1415694    26313692
1424187    25648896
1428406    26735015
1432477    23825313
Name: pubmed_id, Length: 250, dtype: int32

In [14]:
#new_main['pubmed_id'].drop_duplicates().to_csv('../data/transcriptomics_pubmed_ids.csv', index=False)

In [15]:
%%time
# Citation table; get_genes below reads its 'referenced' and 'citing' columns.
icite = nih.icite(dataset='citations')

CPU times: total: 20.3 s
Wall time: 23.4 s


In [16]:
def get_genes(main, fdr_thresh = 0.05):
    """Tabulate, per transcriptomics article, which differentially expressed genes are mentioned.

    Rows of ``main`` with ``'p-value' <= fdr_thresh`` are joined to the PubMed
    ID of their contrast. For every article, the protein-coding genes among
    those rows ("hits") are flagged by whether they are linked in
    ``gene2pubmed`` to the article itself and to any article citing it.
    Articles in which none of the hits is linked to the article itself are
    dropped from the result.

    Reads the notebook-level variables ``pubmed_ids_df``, ``gene_info``,
    ``icite`` and ``gene2pubmed``.

    Parameters
    ----------
    main : pandas.DataFrame
        Must provide the columns 'p-value', 'comparison_key' and 'gene_ncbi'.
    fdr_thresh : float, default 0.05
        Inclusive upper bound applied to the 'p-value' column.
        NOTE(review): the cell that builds ``new_main`` uses a strict ``<``
        for the same cut-off — confirm which comparison is intended.

    Returns
    -------
    pandas.DataFrame
        One row per (article, hit gene) with the columns 'gene_ncbi',
        'pubmed_id', 'mentioned', 'mentioned_citations' and 'hit' (always
        True). The index is not reset and repeats across articles.

    Side effects
    ------------
    Prints four summary counts, writes the result to
    '../data/transcriptomics_all_hits_<fdr_thresh>.csv', and reseeds the
    global NumPy random generator.
    """
    # apply filters and p-value threshold here
    de_cond = (main['p-value'] <= fdr_thresh)
    de_hits = pd.merge(main[de_cond], pubmed_ids_df[['comparison_key', 'pubmed_id']])

    prot_genes = set(gene_info[gene_info['type_of_gene'] == 'protein-coding']['gene_ncbi'])

    print(str(len(set(de_hits['pubmed_id'].values))) + ' transcriptomics articles')

    # Citations pointing at the transcriptomics articles, joined to the genes
    # linked to each citing article.
    icite_slice = icite[icite['referenced'].isin(de_hits['pubmed_id'])].copy()
    icite_slice = pd.merge(icite_slice, gene2pubmed, left_on='citing', right_on='pubmed_id')
    print(str(icite_slice['citing'].nunique()) + ' citing articles')

    # Per-article gene sets: differentially expressed, linked to the article
    # itself, and linked to any article citing it.
    de_dict = de_hits.groupby('pubmed_id')['gene_ncbi'].apply(set).to_dict()
    de_dict_mentioned_genes = (
        gene2pubmed[gene2pubmed['pubmed_id'].isin(de_hits['pubmed_id'])]
        .groupby('pubmed_id')['gene_ncbi']
        .apply(set).to_dict()
    )
    de_dict_citations_mentioned_genes = (
        icite_slice.groupby('referenced')['gene_ncbi'].apply(set).to_dict()
    )

    np.random.seed(49)
    n_samp = 100
    result_df_array = []
    citing_articles_array = []
    for article_id in np.unique(de_hits['pubmed_id'].values):

        ### Collect all sets of unique genes
        de_list = de_dict.get(article_id) & prot_genes
        de_pool = list(de_list)  # built once; reused for the draws and the frame

        # A missing key means no linked genes; intersecting an empty set
        # yields an empty set, same as the explicit branch it replaces.
        de_list_mentioned_genes = de_dict_mentioned_genes.get(article_id, set()) & de_list
        de_list_citations_mentioned_genes = (
            de_dict_citations_mentioned_genes.get(article_id, set()) & de_list
        )

        # Citing articles that are linked to at least one hit of this article.
        cites_this_article = icite_slice['referenced'] == article_id
        cites_a_hit = icite_slice['gene_ncbi'].isin(de_list)
        citing_articles_array.extend(
            icite_slice.loc[cites_this_article & cites_a_hit, 'citing'].values
        )

        # NOTE(review): these random draws are not used by anything this
        # function returns, prints or writes. They are kept only so that the
        # global NumPy RNG is left in the same state as before; consider
        # removing them together with the seed above.
        for _ in range(n_samp):
            np.random.choice(de_pool, replace=False, size=len(de_list_mentioned_genes))

        ### Flag every hit of this article
        result_df = pd.DataFrame()
        result_df['gene_ncbi'] = de_pool
        result_df['comparison'] = article_id
        result_df['mentioned'] = result_df['gene_ncbi'].isin(de_list_mentioned_genes)
        result_df['mentioned_citations'] = result_df['gene_ncbi'].isin(de_list_citations_mentioned_genes)
        result_df_array.append(result_df)

    ### Collect data for each set
    if result_df_array:
        result_df = pd.concat(result_df_array)
    else:
        # pd.concat raises on an empty list; fall back to an empty table so a
        # threshold that leaves no article still produces a (header-only) file.
        result_df = pd.DataFrame(
            columns=['gene_ncbi', 'comparison', 'mentioned', 'mentioned_citations']
        )

    # Drop articles in which none of the hits is linked to the article itself.
    mentioned_per_article = result_df.groupby('comparison')['mentioned'].sum()
    prohib_ids = mentioned_per_article[mentioned_per_article == 0].index.values
    result_df = result_df[~result_df['comparison'].isin(prohib_ids)].copy()
    result_df['hit'] = True
    result_df = result_df.rename(columns={'comparison':'pubmed_id'})
    print(str(len(set(result_df['pubmed_id'].values))) + ' valid transcriptomics articles')
    # NOTE(review): this count is collected over every article in the loop,
    # including articles dropped just above — confirm that is intended.
    print(str(len(np.unique(citing_articles_array))) + ' valid citing articles')

    result_df.to_csv('../data/transcriptomics_all_hits_' + str(fdr_thresh) + '.csv', index=False)
    return result_df

In [17]:
# Only the 0.05 run is kept in `result_df`; the stricter runs are executed for
# their printed counts and the CSV each one writes.
result_df = get_genes(main, fdr_thresh=0.05)
for stricter_thresh in (0.01, 0.001, 0.0001):
    get_genes(main, fdr_thresh=stricter_thresh)
# Last expression of the cell: the table displayed below is the 0.00001 run.
get_genes(main, fdr_thresh=0.00001)

250 transcriptomics articles
3482 citing articles
148 valid transcriptomics articles
1678 valid citing articles
235 transcriptomics articles
3224 citing articles
128 valid transcriptomics articles
1389 valid citing articles
219 transcriptomics articles
3077 citing articles
107 valid transcriptomics articles
1099 valid citing articles
192 transcriptomics articles
2579 citing articles
80 valid transcriptomics articles
886 valid citing articles
171 transcriptomics articles
2329 citing articles
68 valid transcriptomics articles
732 valid citing articles


Unnamed: 0,gene_ncbi,pubmed_id,mentioned,mentioned_citations,hit
0,2048,17452456,False,False,True
1,55297,17452456,False,False,True
2,6146,17452456,False,False,True
3,79875,17452456,False,False,True
4,79877,17452456,False,False,True
...,...,...,...,...,...
1842,81875,29844126,False,False,True
1843,131034,29844126,False,False,True
1844,8175,29844126,False,False,True
1845,90102,29844126,False,False,True


In [18]:
# Number of distinct genes that are hits, that are mentioned in the article
# itself, and that are mentioned by a citing article (printed in that order).
for flag_column in ('hit', 'mentioned', 'mentioned_citations'):
    print(result_df.loc[result_df[flag_column], 'gene_ncbi'].nunique())

18295
161
692
