In [1]:
import numpy as np
import pandas as pd
import scipy

from IPython.display import clear_output

import sys
sys.path.append('../../../../Documents/GitHub/gustav/src/')

from gustav import ebi, ncbi, nlm, biogrid, nih
from gustav import publications
from gustav import github
from gustav import access_framework
from gustav import mapper

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../general/src/')
from manuscript import export
from manuscript import inout
from manuscript import datasets
from manuscript import tools

from sklearn.metrics import auc
from scipy.stats import fisher_exact
# Notebook-wide pandas display settings: 3 digits of precision, no wrapping of
# wide frames across several blocks, and at most 20 columns shown.
pd.set_option('display.precision', 3)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', 20)



In [2]:
# NCBI taxonomy ID used for every lookup in this notebook (9606 = Homo sapiens).
taxon = 9606
# Gene-set flavor handed to datasets.reference_genes; the meaning of 'ou' is
# defined in manuscript.datasets — TODO confirm and spell it out here.
gene_flavor = 'ou'
# Reference gene set; later used with .isin() against 'gene_ncbi' columns.
ref_genes = datasets.reference_genes(taxon, gene_flavor)
# Gene annotation table; the 'gene_ncbi' and 'type_of_gene' columns are used below.
gene_info = ncbi.gene_info(taxon, mode='unambiguous_ensembl')

  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)
  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)


In [3]:
# Reference publication set for this taxon; used below to restrict gene2lit by 'pubmed_id'.
ref_literature = datasets.reference_publications(taxon)

In [4]:
# Which gene-to-literature mapping to load from manuscript.datasets.
literature_flavor = 'pubtator_title_or_abstract_in_any_gene2pubmed_paper'
agg = []

# Load the gene-publication pairs, then keep only the pairs whose gene is in
# the reference gene set and whose publication is in the reference literature.
gene2lit = datasets.reference_gene2lit(taxon, literature_flavor)
gene2lit = gene2lit.loc[
    gene2lit['gene_ncbi'].isin(ref_genes)
    & gene2lit['pubmed_id'].isin(ref_literature)
].copy()

In [5]:
%%time
# The filtered gene-literature table is used under the name `gene2pubmed` from
# here on (alternative source that was considered: ncbi.gene2pubmed(taxon_ncbi=9606)).
gene2pubmed = gene2lit

# Publications that appear in 100 or more rows of gene2pubmed. They stay in
# gene2pubmed itself and are only left out of the per-gene counts below.
large_pubs = (
    gene2pubmed['pubmed_id']
    .value_counts()
    .loc[lambda counts: counts >= 100]
    .index.values
)

# gene_ncbi -> number of gene2pubmed rows outside the large publications.
gene2pubmed_dict = (
    gene2pubmed.loc[~gene2pubmed['pubmed_id'].isin(large_pubs)]
    .value_counts('gene_ncbi')
    .to_dict()
)

CPU times: total: 156 ms
Wall time: 131 ms


# BioGRID

In [6]:
# Load the BioGRID interaction table through gustav. The 'all' argument
# presumably selects the full, unfiltered dataset — TODO confirm in gustav.biogrid.
biogrid_df = biogrid.biogrid('all')

In [7]:
# all records: number of BioGRID rows per publication, before any filtering
biogrid_df['publication_source'].value_counts()

PUBMED:27708008    326623
PUBMED:20093466     68044
PUBMED:28514442     56431
PUBMED:32296183     52408
PUBMED:22681890     40846
                    ...  
PUBMED:25429067         1
PUBMED:21912673         1
PUBMED:19330023         1
PUBMED:23298890         1
PUBMED:1527169          1
Name: publication_source, Length: 62154, dtype: int64

In [8]:
# Step 1: keep only high-throughput records from affinity capture-MS
# experiments, then report the remaining rows per publication.
biogrid_df = biogrid_df.loc[
    biogrid_df['throughput'].eq('High Throughput')
    & biogrid_df['experimental_system'].eq('Affinity Capture-MS')
]
print(biogrid_df['publication_source'].value_counts())

# Step 2: keep only rows whose interactor B is a gene listed in gene_info
# (loaded for taxon 9606, i.e. human), then report the counts again.
# NOTE(review): the original comment here said "Bait proteins are human", but
# the filter is applied to interactor B — confirm which side is the bait in
# gustav's BioGRID table.
biogrid_df = biogrid_df.loc[
    biogrid_df['gene_ncbi_interactor_b'].isin(gene_info['gene_ncbi'])
]
print(biogrid_df['publication_source'].value_counts())

PUBMED:28514442    56431
PUBMED:26496610    29089
PUBMED:26186194    23669
PUBMED:22036573    10917
PUBMED:16429126     7592
                   ...  
PUBMED:25139236        1
PUBMED:23913922        1
PUBMED:26718004        1
PUBMED:16222229        1
PUBMED:24277934        1
Name: publication_source, Length: 1235, dtype: int64
PUBMED:28514442    55822
PUBMED:26496610    28952
PUBMED:26186194    23379
PUBMED:27173435     4885
PUBMED:21145461     4487
                   ...  
PUBMED:31159502        1
PUBMED:20388777        1
PUBMED:24067369        1
PUBMED:29228602        1
PUBMED:19786094        1
Name: publication_source, Length: 683, dtype: int64


In [9]:
# Numeric PubMed ID: the part after the first colon of 'PUBMED:<id>', as an
# integer column (explicit int64 so the dtype does not depend on the platform).
biogrid_df['pubmed_id'] = (
    biogrid_df['publication_source'].str.split(':').str[1].astype('int64')
)

In [10]:
# Keep only interactions whose publication also occurs in the gene2pubmed table.
biogrid_df = biogrid_df[biogrid_df['pubmed_id'].isin(gene2pubmed['pubmed_id'])]

In [11]:
# Gene in title/abstract: rows remaining per publication after restricting to
# publications that are present in gene2pubmed.
biogrid_df['pubmed_id'].value_counts()

27684187    3484
28515276    2785
31091453    2761
29845934    2217
29128334    1964
            ... 
30760284       1
26975375       1
29434030       1
30675521       1
29426904       1
Name: pubmed_id, Length: 461, dtype: int64

In [12]:
# One row per (publication, interactor-B gene): repeated rows for the same gene
# within the same publication are collapsed to the first occurrence.
biogrid_df = biogrid_df.drop_duplicates(['pubmed_id', 'gene_ncbi_interactor_b']).copy()

In [13]:
# Expose interactor B under the generic 'gene_ncbi' column name that get_genes groups on.
biogrid_df['gene_ncbi'] = biogrid_df['gene_ncbi_interactor_b'].copy()

In [14]:
#biogrid_df[['pubmed_id']].drop_duplicates().to_csv('../data/aff_ms_pubmed_ids.csv', index=False)

In [15]:
%%time
# Load the iCite citation table through gustav (about 20 s wall time in the
# recorded run). get_genes reads its 'referenced' and 'citing' columns.
icite = nih.icite(dataset='citations')

CPU times: total: 19.1 s
Wall time: 20.5 s


In [16]:
def _genes_by_key(frame, key):
    """Map each value of ``frame[key]`` to the set of ``gene_ncbi`` values in its rows."""
    return {group_key: set(genes) for group_key, genes in frame.groupby(key)['gene_ncbi']}


def get_genes(gwas, out_path='../data/aff_ms_all_hits.csv'):
    """Build the per-article table of hit genes and whether each one is mentioned.

    Reads the notebook-level tables ``gene_info``, ``gene2pubmed`` and ``icite``.

    For every article in ``gwas`` that also occurs in ``gene2pubmed``, the
    protein-coding hit genes are listed and flagged with

    * ``mentioned``: the gene is linked to the article itself in ``gene2pubmed``;
    * ``mentioned_citations``: the gene is linked in ``gene2pubmed`` to at least
      one article that cites it according to ``icite``.

    Articles without a single ``mentioned`` hit gene are dropped from the result.
    Four progress counts are printed along the way.

    Parameters
    ----------
    gwas : pandas.DataFrame
        Hit table with at least the columns ``pubmed_id`` and ``gene_ncbi``.
        (The parameter name is historical; in this notebook the table holds
        affinity capture-MS hits from BioGRID.)
    out_path : str or path-like or None, optional
        Where the result is written as CSV (without index). ``None`` skips the
        write. Defaults to the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_ncbi``, ``pubmed_id``, ``mentioned``,
        ``mentioned_citations`` and ``hit`` (always True). Empty, with the same
        columns, when no article survives the filters; the previous version
        raised ``ValueError`` from ``pd.concat`` in that case.
    """
    prot_genes = set(
        gene_info.loc[gene_info['type_of_gene'] == 'protein-coding', 'gene_ncbi'])

    # Only articles covered by the gene-literature table can be evaluated.
    gwas = gwas[gwas['pubmed_id'].isin(gene2pubmed['pubmed_id'])].copy()
    print(str(len(set(gwas['pubmed_id'].values))) + ' Aff-MS articles')

    # Citation pairs pointing at those articles, joined to the genes linked to
    # each citing paper (inner join: citing papers without genes drop out).
    icite_slice = icite[icite['referenced'].isin(gwas['pubmed_id'])].copy()
    icite_slice = pd.merge(icite_slice, gene2pubmed, left_on='citing', right_on='pubmed_id')
    print(str(icite_slice['citing'].nunique()) + ' citing articles')

    hits_by_article = _genes_by_key(gwas, 'pubmed_id')
    mentioned_by_article = _genes_by_key(
        gene2pubmed[gene2pubmed['pubmed_id'].isin(gwas['pubmed_id'])], 'pubmed_id')
    cited_by_article = _genes_by_key(icite_slice, 'referenced')
    # Grouped once up front so the loop does not re-scan icite_slice per article.
    citing_rows_by_article = {
        referenced: rows for referenced, rows in icite_slice.groupby('referenced')}

    np.random.seed(49)
    n_samp = 100
    frames = []
    valid_citing = set()
    for comparison_key in np.unique(gwas['pubmed_id'].values):
        # Protein-coding hits of this article, and the subsets that are
        # mentioned by the article itself / by the papers citing it.
        de_list = hits_by_article.get(comparison_key, set()) & prot_genes
        mentioned = mentioned_by_article.get(comparison_key, set()) & de_list
        cited = cited_by_article.get(comparison_key, set()) & de_list

        # Citing papers that mention at least one hit gene of this article.
        citing_rows = citing_rows_by_article.get(comparison_key)
        if citing_rows is not None:
            valid_citing.update(
                citing_rows.loc[citing_rows['gene_ncbi'].isin(de_list), 'citing'])

        pool = list(de_list)
        # NOTE(review): these null draws do not feed into the returned table or
        # the CSV. They are retained only so that the global NumPy RNG is left
        # in the same state as before; drop them together with the seed once it
        # is confirmed that nothing downstream relies on that random stream.
        for _ in range(n_samp):
            np.random.choice(pool, replace=False, size=len(mentioned))

        if not pool:
            # No protein-coding hits: the article contributes no rows.
            continue

        frame = pd.DataFrame({'gene_ncbi': pool})
        frame['comparison'] = comparison_key
        frame['mentioned'] = frame['gene_ncbi'].isin(mentioned)
        frame['mentioned_citations'] = frame['gene_ncbi'].isin(cited)
        frames.append(frame)

    if frames:
        result_df = pd.concat(frames)
    else:
        # pd.concat raises on an empty list; fall back to a typed empty table.
        result_df = pd.DataFrame({
            'gene_ncbi': pd.Series(dtype='int64'),
            'comparison': pd.Series(dtype='int64'),
            'mentioned': pd.Series(dtype=bool),
            'mentioned_citations': pd.Series(dtype=bool),
        })

    # Drop articles in which none of the hit genes is mentioned by the article.
    mentioned_per_article = result_df.groupby('comparison')['mentioned'].sum()
    prohib_ids = mentioned_per_article[mentioned_per_article == 0].index.values
    result_df = (
        result_df[~result_df['comparison'].isin(prohib_ids)]
        .assign(hit=True)
        .rename(columns={'comparison': 'pubmed_id'})
    )
    print(str(len(set(result_df['pubmed_id'].values))) + ' valid Aff-MS articles')
    # NOTE(review): this count also includes citing papers of articles that were
    # just dropped above — confirm whether it should be limited to the valid ones.
    print(str(len(valid_citing)) + ' valid citing articles')

    if out_path is not None:
        result_df.to_csv(out_path, index=False)
    return result_df

In [17]:
# Build the per-article hit table from the filtered BioGRID records
# (get_genes also prints four summary counts and writes the table to CSV).
result_df = get_genes(biogrid_df)

461 Aff-MS articles
3239 citing articles
296 valid Aff-MS articles
1320 valid citing articles


In [18]:
# Number of distinct genes that are (1) hits, (2) mentioned by the article
# itself, and (3) mentioned by an article citing it — printed in that order.
for flag_column in ('hit', 'mentioned', 'mentioned_citations'):
    print(result_df.loc[result_df[flag_column], 'gene_ncbi'].nunique())

7919
311
407
