In [1]:
import numpy as np
import pandas as pd
import scipy

from IPython.display import clear_output

import sys
sys.path.append('../../../../Documents/GitHub/gustav/src/')

from gustav import ebi, ncbi, nlm, biogrid, nih
from gustav import publications
from gustav import github
from gustav import access_framework
from gustav import mapper

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../general/src/')
from manuscript import export
from manuscript import inout
from manuscript import datasets
from manuscript import tools

from sklearn.metrics import auc
from scipy.stats import fisher_exact
# Notebook-wide pandas display settings: 3 decimal places, no wrapping of wide
# frames across several text blocks, and at most 20 columns shown.
pd.options.display.precision = 3
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = 20

Number of publications, P(DE), selected vs not selected for title/abstract

In [2]:
%%time

# NCBI taxonomy ID for the primary analysis (9606 is the human taxon, see
# `human_taxid` further down).
taxon = 9606

# Gene annotation table; `gene_ncbi`, `type_of_gene` and `nomenclature_status`
# are used here, further columns are used later in the notebook.
gene_info = ncbi.gene_info(taxon, mode='unambiguous_ensembl')

# Reference genes: protein-coding entries whose nomenclature_status is 'O'.
ref_genes = gene_info[
    (gene_info['type_of_gene']=='protein-coding') &
    (gene_info['nomenclature_status']=='O')   # Carries an official gene symbol
]['gene_ncbi']

# Differential-expression records keyed by comparison and gene.
de = ebi.gxa(
    'de_{}_entrez'.format(taxon),
    columns=['comparison_key', 'gene_ncbi', 'p-value', 'log2foldchange']
)

# Keep exactly one row per (comparison, gene); the first occurrence wins.
de = de.drop_duplicates(subset=['comparison_key', 'gene_ncbi'], keep='first')

# Mapping comparison -> experiment.
contrasts = ebi.gxa(
    'contrasts',
    columns=['comparison_key', 'experiment']
)

# Long-format experiment metadata: one (qualifier, value) pair per row.
studies = ebi.gxa(
    'studies',
    columns=['experiment', 'qualifier', 'value']
)

# Rows whose qualifier contains the literal text '[AEExperimentType]'
# (regex=False, so the brackets are not a character class).
study_types = studies[
    studies['qualifier'].str.contains('[AEExperimentType]', regex=False)
].drop_duplicates()
# Remove surrounding spaces, then surrounding double quotes, from the value.
study_types['value'] = study_types['value'].str.strip(' ').str.strip('"')
study_types = study_types[['experiment', 'value']].rename(columns={'value': 'type'})
study_types = study_types.drop_duplicates()

# keep=False drops every experiment that still has more than one distinct
# type, so only unambiguously typed experiments remain.
study_types = study_types.drop_duplicates('experiment', keep=False)

# Restrict to the two supported experiment types and shorten their labels.
# Note that .replace() is applied to the whole frame, not only to 'type'.
study_types = study_types[
    study_types['type'].isin(
        [
           'transcription profiling by array',
           'RNA-seq of coding RNA' 
        ]
    )
].replace(
    {
       'transcription profiling by array': 'array',
       'RNA-seq of coding RNA': 'rna_seq' 
    }
)

# Attach the experiment type ('array' / 'rna_seq') to every DE record.
# Both merges are inner joins on the columns the frames share, so comparisons
# whose experiment has no unambiguous supported type are dropped.
main = pd.merge(
    de,
    pd.merge(
        contrasts,
        study_types
    )[['comparison_key', 'type']]
)

# Restrict to the reference genes. The explicit copy makes `main` an
# independent frame: later cells add columns to it, and assigning to the
# result of a boolean filter can otherwise trigger SettingWithCopyWarning.
main = main[main['gene_ncbi'].isin(ref_genes)].copy()

  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)


CPU times: total: 23.1 s
Wall time: 23.6 s


In [3]:
# Master per-gene table, indexed by NCBI gene ID; the cells below add one
# feature column at a time, aligned on this index.
table = gene_info.set_index('gene_ncbi')

In [4]:
# Replaces the `ref_genes` defined in the first analysis cell with the
# project's reference gene set for flavor 'ou'.
# NOTE(review): the meaning of 'ou' is defined in manuscript.datasets - confirm there.
gene_flavor = 'ou'
ref_genes = datasets.reference_genes(taxon, gene_flavor)

  df_with_delimiter.drop(column, 1).reset_index(),
  joined = joined.drop('helper_index', 1)


In [5]:
# Reference publication set for the current taxon; used below to filter
# gene-to-literature links by pubmed_id.
ref_literature = datasets.reference_publications(taxon)

In [6]:
agg = []
literature_flavor = 'pubtator_title_or_abstract_in_any_gene2pubmed_paper'

# Gene-to-publication links for the chosen literature flavor, kept only where
# both the gene and the publication belong to the reference sets.
gene2lit = datasets.reference_gene2lit(taxon, literature_flavor)
in_ref_genes = gene2lit['gene_ncbi'].isin(ref_genes)
in_ref_literature = gene2lit['pubmed_id'].isin(ref_literature)
gene2lit = gene2lit.loc[in_ref_genes & in_ref_literature].copy()

In [7]:
# Spelling variants of the PubMed-ID qualifier that occur in `studies`;
# all of them are rewritten in place to the single label 'pubmed_id'.
pubmed_qualifier_variants = [
    'PubMed ID',
    'Pubmed ID',
    'pubmedID',
    'PubMed ID       ',
    '"Pubmed ID"',
    'PubMedID',
    'PubMed Id',
    'PubmedID',
    'PubMed ID ',
]
is_pubmed_qualifier = studies['qualifier'].isin(pubmed_qualifier_variants)
studies.loc[is_pubmed_qualifier, 'qualifier'] = 'pubmed_id'

# One row per (comparison, PubMed ID); the left join keeps comparisons whose
# experiment has no PubMed ID (their pubmed_id is missing).
pubmed_ids_df = studies.loc[studies['qualifier'] == 'pubmed_id']
pubmed_ids_df = (
    pd.merge(contrasts, pubmed_ids_df, how='left')
    .rename(columns={'value': 'pubmed_id'})
)

In [8]:
%%time
gene2pubmed = gene2lit #ncbi.gene2pubmed(taxon_ncbi=9606)

# Publications linked to 100 or more genes are treated as "large".
genes_per_publication = gene2pubmed['pubmed_id'].value_counts()
large_pubs = genes_per_publication[genes_per_publication >= 100].index.values
#gene2pubmed = gene2pubmed[~gene2pubmed['pubmed_id'].isin(large_pubs)]

# gene_ncbi -> number of linked publications, large publications excluded.
is_large_pub = gene2pubmed['pubmed_id'].isin(large_pubs)
gene2pubmed_dict = gene2pubmed[~is_large_pub].value_counts('gene_ncbi').to_dict()

CPU times: total: 109 ms
Wall time: 119 ms


In [9]:
# pubmed_id -> set of gene_ncbi linked to that publication (large
# publications are NOT excluded here, unlike in gene2pubmed_dict).
gene2pubmed_get_genes_dict = gene2pubmed.groupby('pubmed_id')['gene_ncbi'].apply(set).to_dict()

In [10]:
%%time
# Keep rows whose comparison is part of `main` and whose PubMed ID (still a
# string at this point) occurs in gene2pubmed; then drop rows with missing values.
in_analysed_comparison = pubmed_ids_df['comparison_key'].isin(np.unique(main['comparison_key']))
in_gene2pubmed = pubmed_ids_df['pubmed_id'].isin(gene2pubmed['pubmed_id'].astype(str))
pubmed_ids_df = pubmed_ids_df[in_analysed_comparison & in_gene2pubmed].dropna()

# After this cast the string comparison above no longer matches, so this cell
# is not safe to re-run without re-running the cell that builds pubmed_ids_df.
pubmed_ids_df['pubmed_id'] = pubmed_ids_df['pubmed_id'].astype(int)

CPU times: total: 922 ms
Wall time: 920 ms


In [11]:
# pubmed_id -> set of comparison keys reported by that publication.
pubmed_id_to_comparison_dict = pubmed_ids_df.groupby('pubmed_id')['comparison_key'].apply(set).to_dict()

In [12]:
%%time
# Records with a p-value below this cutoff count as differentially expressed.
fdr_thresh = 0.05
de_cond = (main['p-value'] < fdr_thresh)

main['abs_log2foldchange'] = np.abs(main['log2foldchange'])

# All differentially expressed (comparison, gene) records.
de_hits = main[de_cond]

# Select the column BEFORE aggregating: `main` also holds the string column
# 'type', and a whole-frame groupby().median() raises TypeError on pandas >= 2.0.
median_abs_log2foldchange = (
    de_hits.groupby('comparison_key')['abs_log2foldchange'].median()
)

# DE records restricted to genes that are mentioned (per gene2pubmed) in the
# publication attached to the same comparison. Computed once and reused for
# both the medians and the gene sets below.
main_slice = pd.merge(
    pd.merge(pubmed_ids_df, gene2pubmed),
    de_hits,
    on=['comparison_key', 'gene_ncbi'],
    how='inner'
)
median_abs_log2foldchange_mentioned_genes = (
    main_slice.groupby('comparison_key')['abs_log2foldchange'].median()
)

# One row per comparison: median |log2FC| over all DE genes and over the
# mentioned DE genes (NaN where a comparison has no mentioned DE gene).
comparison_by_comparison_df = pd.DataFrame({
    'median_abs_log2foldchange_all_genes': median_abs_log2foldchange,
    'median_abs_log2foldchange_mentioned_genes': median_abs_log2foldchange_mentioned_genes,
})

comparison_by_comparison_df = pd.merge(
    comparison_by_comparison_df.reset_index(),
    main[['comparison_key', 'type']].drop_duplicates()
)

# comparison_key -> set of DE genes (all / mentioned in the publication).
de_dict = de_hits.groupby('comparison_key')['gene_ncbi'].apply(set).to_dict()
de_dict_mentioned_genes = main_slice.groupby('comparison_key')['gene_ncbi'].apply(set).to_dict()

CPU times: total: 11.4 s
Wall time: 11.4 s


In [13]:
def de_dict_get(comparison_key):
    """Return the DE gene set stored for ``comparison_key``.

    Looks the key up in the notebook-level ``de_dict``.

    Parameters
    ----------
    comparison_key : hashable
        Key of the comparison to look up.

    Returns
    -------
    set
        The value stored in ``de_dict`` for the key, or a new empty set when
        the key is absent.
    """
    # A single hashed lookup; the previous version built a list of all keys
    # and scanned it linearly on every call.
    return de_dict.get(comparison_key, set())

In [14]:
prot_genes = set(gene_info.loc[gene_info['type_of_gene'] == 'protein-coding', 'gene_ncbi'])

# Line styles for box-plot elements (three independent dicts with equal content).
medianprops = {'linestyle': 'solid', 'linewidth': 2, 'color': 'k'}
whiskerprops = dict(medianprops)
capprops = dict(medianprops)

pad = 0.2

# both
# Comparisons that have a median entry AND an attached publication.
keys_with_medians = set(comparison_by_comparison_df['comparison_key'].values)
keys_with_publication = set(pubmed_ids_df['comparison_key'].values)
comparison_keys = list(keys_with_medians & keys_with_publication)

# NOTE(review): the labels say "RNA-seq", but no filter on experiment type is
# applied in this cell - confirm the intended wording.
print('N RNA-seq comparisons : ' + str(len(comparison_keys)))
studies_of_selected = pubmed_ids_df[pubmed_ids_df['comparison_key'].isin(comparison_keys)]
print('N RNA-seq studies : ' + str(len(studies_of_selected['pubmed_id'].unique())))

n_samp = 100
de_sets = []
de_mentioned_sets = []
#de_mentioned_sets_null = []
de_citations_mentioned_sets = []
citations_mentioned_sets = []
for comparison_key in comparison_keys:
    de_list = de_dict.get(comparison_key)
    # Missing or empty entries both become a fresh empty set.
    de_list_mentioned_genes = de_dict_mentioned_genes.get(comparison_key) or set()

    #for n_n in range(n_samp):
    # de_mentioned_sets_null.append(set(np.random.choice(list(de_list),
    #                                                       replace=False, size=len(de_list_mentioned_genes))))

    de_sets.append(de_list)
    de_mentioned_sets.append(de_list_mentioned_genes)

# Union over all selected comparisons, restricted to protein-coding genes.
de_list = set().union(*de_sets) & prot_genes
de_list_mentioned_genes = set().union(*de_mentioned_sets) & prot_genes
#de_list_mentioned_genes_null = set().union(*de_mentioned_sets_null)

N RNA-seq comparisons : 501
N RNA-seq studies : 250


In [15]:
len(de_list)

18408

In [16]:
len(de_list_mentioned_genes)

161

In [17]:
def make_pivot(main_df, type_exp='array'):
    """Pivot long-format DE calls into a gene x comparison matrix.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Long-format table with the columns ``gene_ncbi``, ``comparison_key``,
        ``differentially_expressed`` and (when ``type_exp`` is given) ``type``.
    type_exp : str or None, default 'array'
        Only rows whose ``type`` equals this value are pivoted. ``None``
        disables the filter and pivots every row.

    Returns
    -------
    pandas.DataFrame
        Rows are ``gene_ncbi``, columns are ``comparison_key``, and cells hold
        the mean of ``differentially_expressed`` (pivot_table's default
        aggregation); combinations without data are NaN.
    """
    # Use the frame that was passed in; the previous version ignored
    # `main_df` and silently read the notebook-level `main` instead.
    if type_exp is not None:
        main_df = main_df[main_df['type'] == type_exp]

    return main_df.pivot_table(index='gene_ncbi',
                               columns='comparison_key',
                               values='differentially_expressed')
        
# Boolean DE flag per record; 0.05 is the same cutoff as `fdr_thresh` above.
main['differentially_expressed'] = main['p-value'] < 0.05

# Gene x comparison matrix of DE flags for the RNA-seq experiments.
pivot = make_pivot(main, type_exp='rna_seq')

In [18]:
# Per gene: sum of the DE flags divided by the number of comparisons in which
# the gene has a value (NaN cells are excluded from the denominator).
p_de = pivot.sum(axis=1)/(len(pivot.columns) - pivot.isna().sum(axis=1))

In [19]:
# Aligned on the gene_ncbi index; genes absent from `pivot` get NaN.
table['p_de'] = p_de

In [20]:
# Publication count per gene; genes missing from gene2pubmed_dict yield None
# here and are set to 0.0 on the next line.
table['n_pubs'] = [gene2pubmed_dict.get(x) for x in table.index.values]
table['n_pubs'] = table['n_pubs'].fillna(0.0)

In [21]:
# True when the nomenclature-authority symbol is anything other than '-'.
table['defined_hugo'] = table['symbol_from_nomenclature_authority'] != '-'

In [22]:
# Number of '|'-separated entries in the synonyms field.
table['n_synonyms'] = table['synonyms'].apply(lambda x: len(x.split('|')))

In [23]:
# '-' is the placeholder for "no synonyms"; it was counted as 1 above.
table.loc[table['synonyms'] == '-', 'n_synonyms'] = 0

In [24]:
#table.to_csv('../data/main_table_221115.csv')

N mouse phenotypes

In [25]:
homologene = ncbi.homologene()

# One row per HomoloGene group with the mouse (10090) and human (9606) gene;
# only the first gene per (group, taxon) is kept and groups lacking either
# species are dropped.
mouse_homology_table = (
    homologene[homologene['taxon_ncbi'].isin([10090, 9606])]
    .drop_duplicates(subset=['homologene_group', 'taxon_ncbi'])
    .pivot(index='homologene_group', columns='taxon_ncbi', values='gene_ncbi')
    .dropna()
)

# Map each phenotype hit from its mouse gene to the human homolog.
impc = ebi.impc(dataset='phenotype_hits_per_gene')
impc = pd.merge(impc, mouse_homology_table, left_on='gene_ncbi', right_on=10090)[['phenotype', 9606]]
impc.columns = ['phenotype', 'gene_ncbi']

# Count hits per human gene. The filter must compare against gene IDs:
# `gene_ncbi` is a column of gene_info, so the previous
# `isin(gene_info.index.values)` matched row labels instead of gene IDs.
known_genes = gene_info['gene_ncbi']
mouse_pheno_dict = impc[impc['gene_ncbi'].isin(known_genes)]['gene_ncbi'].value_counts().to_dict()
table['n_mouse_pheno'] = [mouse_pheno_dict.get(x) for x in table.index.values]
table['n_mouse_pheno'] = table['n_mouse_pheno'].fillna(0.0)
table['mouse_pheno'] = table['n_mouse_pheno'] > 0

N GWAS hits

In [26]:
gwas = ebi.gwas(dataset='associations')

# Collect every gene ID referenced by an association. `snp_gene_ids` cells
# hold one or more IDs separated by ', '; non-string cells (missing values)
# are skipped. str.split(', ') returns the whole string as a single item when
# the separator is absent, which covers the single-ID case.
gwas_gene_ids = [
    gene_id
    for cell in gwas['snp_gene_ids'].values
    if isinstance(cell, str)
    for gene_id in cell.split(', ')
]
gwas_gene_ids.extend(gwas['upstream_gene_id'].dropna().values)
gwas_gene_ids.extend(gwas['downstream_gene_id'].dropna().values)

# Occurrences per Ensembl gene ID. Naming the counts column through
# reset_index(name=...) works on every pandas version; the previous
# rename(columns={0: 'n_gwas'}) silently does nothing on pandas >= 2.0,
# where the counts column is called 'count'.
gwas_df = (
    pd.DataFrame({'gene_ensembl': gwas_gene_ids})
    .value_counts()
    .reset_index(name='n_gwas')
)
gwas_df = pd.merge(gwas_df, gene_info[['gene_ensembl', 'gene_ncbi']])

In [27]:
# Aligned on gene_ncbi; genes without any GWAS entry get 0.0.
# NOTE(review): assumes gene_ncbi is unique in gwas_df - confirm.
table['n_gwas'] = gwas_df.set_index('gene_ncbi')['n_gwas']
table['n_gwas'] = table['n_gwas'].fillna(0.0)

RNA expression-based measures

In [28]:
# Tissue expression table; the 'Gene' column is matched against Ensembl
# gene IDs to attach gene_ncbi (inner join, unmatched rows are dropped).
hpa_tissue = pd.read_csv('../data/rna_tissue_consensus.tsv', sep='\t')
hpa_tissue = pd.merge(hpa_tissue, 
                      gene_info[['gene_ensembl','gene_ncbi']], 
                      left_on='Gene', right_on='gene_ensembl')
# A row counts as "detectable" when nTPM is at least 1.
hpa_tissue['detectable'] = hpa_tissue['nTPM'] >= 1

In [29]:
# Fraction of rows (tissues) per gene with nTPM >= 1. The column is selected
# before aggregating: hpa_tissue also holds string columns, and a whole-frame
# groupby().mean() raises TypeError on pandas >= 2.0.
table['detectable_portion'] = hpa_tissue.groupby('gene_ncbi')['detectable'].mean()

In [30]:
# Median nTPM per gene across the rows of hpa_tissue. The column is selected
# before aggregating: hpa_tissue also holds string columns, and a whole-frame
# groupby().median() raises TypeError on pandas >= 2.0.
table['tissue_median'] = hpa_tissue.groupby('gene_ncbi')['nTPM'].median()

In [31]:
# Cell-line expression with gene_ncbi attached via the Ensembl gene ID.
hpa_cell = pd.read_csv('../data/rna_celline.tsv', sep='\t').merge(
    gene_info[['gene_ensembl', 'gene_ncbi']],
    left_on='Gene',
    right_on='gene_ensembl',
)

# Only the HeLa rows are used; their nTPM becomes the per-gene feature.
hpa_cell = hpa_cell.loc[hpa_cell['Cell line'] == 'HeLa']
table['hela_expression'] = hpa_cell.set_index('gene_ncbi')['nTPM']

Mendelian inheritance

In [32]:
# Gene lists for autosomal dominant and autosomal recessive inheritance, with
# the Entrez ID column renamed to the notebook's `gene_ncbi` convention.
entrez_to_gene_ncbi = {'GENE_ENTREZ_ID': 'gene_ncbi'}
auto_dom_df = pd.read_csv('../data/HPO_autosomal_dominant_genes_220920.csv').rename(
    columns=entrez_to_gene_ncbi
)
auto_rec_df = pd.read_csv('../data/HPO_autosomal_recessive_genes_220920.csv').rename(
    columns=entrez_to_gene_ncbi
)

# Stacked list of genes with either inheritance mode.
mendel_df = pd.concat([auto_dom_df, auto_rec_df])

In [33]:
# True for genes listed in either the dominant or the recessive gene file.
table['mendelian_inheritance'] = table.index.isin(mendel_df['gene_ncbi'])

Model organisms & primate specificity

In [34]:
# Reloads the HomoloGene table (already loaded in the mouse-phenotype cell),
# giving this section a fresh, unmodified copy.
homologene = ncbi.homologene()

In [35]:
#homologene['taxon_ncbi'] = homologene['taxon_ncbi'].replace(559292, 4932)

In [36]:
# NCBI taxonomy IDs of human and the model organisms considered below.
human_taxid = 9606
mouse_taxid = 10090
rat_taxid = 10116
celegans_taxid = 6239
dmelanogaster_taxid = 7227
yeast_taxid = 559292
drerio_taxid = 7955

# Group x taxon table: after keeping one row per (group, taxon), the pivot
# holds the remaining homologene columns per taxon; `> 0` turns it into
# booleans (missing cells compare as False).
# NOTE(review): this relies on every remaining homologene column being numeric - confirm.
homologene_pivot = homologene[homologene['taxon_ncbi'].isin([human_taxid, 
                                                             mouse_taxid, 
                                                             rat_taxid, 
                                                             celegans_taxid, 
                                                             dmelanogaster_taxid, 
                                                             yeast_taxid, 
                                                             drerio_taxid])]\
.drop_duplicates(subset=['homologene_group', 'taxon_ncbi']).pivot(index='homologene_group', columns='taxon_ncbi') > 0
# Keep only the 'gene_ncbi' level: one boolean column per taxon, i.e. "the
# group contains a gene of this taxon"; homologene_group becomes a column.
homologene_pivot = homologene_pivot['gene_ncbi'].reset_index().reset_index(drop=True)

# Attach the per-taxon flags to every human gene via its HomoloGene group.
homologene_pivot = pd.merge(homologene[homologene['taxon_ncbi'] == human_taxid][['gene_ncbi', 'homologene_group']], 
                            homologene_pivot, 
                            how='left')\
[['gene_ncbi', mouse_taxid, rat_taxid, celegans_taxid, dmelanogaster_taxid, yeast_taxid, drerio_taxid]]
# Replace the taxon-ID column labels with readable organism names.
homologene_pivot = homologene_pivot.rename(columns={mouse_taxid:'mouse', 
                                                    rat_taxid:'rat', 
                                                    celegans_taxid:'c_elegans', 
                                                    dmelanogaster_taxid:'d_melanogaster', 
                                                    yeast_taxid:'yeast', 
                                                    drerio_taxid:'zebrafish'})

In [37]:
# Add the six homolog flags to `table`. The inner merge attaches the flags to
# the table's genes (genes without a HomoloGene entry become False through
# fillna); the outer merge joins those columns back and restores the
# gene_ncbi index.
# NOTE(review): a human gene listed in several HomoloGene groups would
# duplicate table rows here - confirm gene_ncbi stays unique.
table = pd.merge(table.reset_index(), 
         pd.merge(table.reset_index(), homologene_pivot, how='left')\
[['gene_ncbi', 'mouse', 'rat', 'c_elegans', 'd_melanogaster', 'yeast', 'zebrafish']].fillna(False), how='left')\
.set_index('gene_ncbi')

In [38]:
# Flag each homologene row whose taxon appears in the primate taxon-ID list
# (first column of a header-less file).
homologene['primate'] = homologene['taxon_ncbi'].isin(pd.read_csv('../data/primate_taxonid_list.txt', header=None)[0].values)

In [39]:
# Per-group column means; the mean of the boolean 'primate' column is the
# fraction of the group's rows that belong to a primate taxon.
# NOTE(review): on pandas >= 2.0 a whole-frame mean() raises if homologene
# holds non-numeric columns - confirm the column dtypes.
homologene_groupby = homologene.groupby('homologene_group').mean()

In [40]:
# Human genes whose HomoloGene group consists exclusively of primate rows
# (group mean of 'primate' equal to 1.0).
primate_spec_genes = homologene[homologene['homologene_group'].isin(\
                                               homologene_groupby[homologene_groupby['primate'] == 1.0].index.values)
          & (homologene['taxon_ncbi'] == 9606)]['gene_ncbi'].values

In [41]:
table['primate_specific'] = table.index.isin(primate_spec_genes)

Publications in model organisms

In [42]:
%%time
# Model-organism taxon ID -> column-name stem used in `table`.
taxon_name_dict = {mouse_taxid:'mouse', rat_taxid:'rat', celegans_taxid:'c_elegans', 
 dmelanogaster_taxid:'d_melanogaster', drerio_taxid:'zebrafish', yeast_taxid:'yeast'}

# Loop-invariant inputs, hoisted out of the loop.
human_group_genes = homologene.loc[
    homologene['taxon_ncbi'] == human_taxid, ['homologene_group', 'gene_ncbi']
]
model_literature_flavor = 'pubtator_title_or_abstract_in_any_gene2pubmed_paper'

# For every model organism, count the publications of each human gene's
# homologs. Loop-local names (model_*) are used on purpose: the previous
# version reused `taxon`, `ref_genes`, `ref_literature`, `gene2lit`,
# `gene_flavor` and `agg`, overwriting the human-analysis variables of the
# same name defined earlier in the notebook.
for model_taxon, org in taxon_name_dict.items():

    # Human gene <-> model-organism gene pairs sharing a HomoloGene group.
    model_group_genes = homologene.loc[
        homologene['taxon_ncbi'] == model_taxon, ['homologene_group', 'gene_ncbi']
    ]
    homologene_merge = pd.merge(human_group_genes, model_group_genes, on='homologene_group')
    homologene_merge.columns = ['homologene_group', 'gene_ncbi_human', 'gene_ncbi_model']

    # Reference genes (empty gene flavor) and publications of the model organism.
    model_ref_genes = datasets.reference_genes(model_taxon, '')
    model_ref_literature = datasets.reference_publications(model_taxon)

    model_gene2lit = datasets.reference_gene2lit(model_taxon, model_literature_flavor)
    model_gene2lit = model_gene2lit[
        model_gene2lit['gene_ncbi'].isin(model_ref_genes) &
        model_gene2lit['pubmed_id'].isin(model_ref_literature)
    ].copy()

    # Attach the human homolog to every model-organism gene-publication link.
    model_gene2lit = pd.merge(
        model_gene2lit, homologene_merge,
        left_on='gene_ncbi', right_on='gene_ncbi_model'
    )

    # Distinct publications per human gene (column selected before counting).
    pubs_column = 'n_' + org + '_pubs'
    table[pubs_column] = (
        model_gene2lit
        .drop_duplicates(subset=['pubmed_id', 'gene_ncbi_human'])
        .groupby('gene_ncbi_human')['pubmed_id']
        .count()
    )

    # Genes WITH a homolog in this organism but no publication get 0.0;
    # genes WITHOUT a homolog are left as NaN by the index alignment.
    table[pubs_column] = table.loc[table[org], pubs_column].fillna(0.0)

CPU times: total: 1min 1s
Wall time: 1min 1s


HPA

In [43]:
# Per-gene annotation table (tab-separated); its columns are printed a few
# cells further down.
hpa_df = pd.read_csv('../data/proteinatlas_19002e78.tsv', sep='\t')

In [44]:
# Re-key the table from gene symbol to NCBI gene ID.
# NOTE(review): presumably the result is indexed by gene_ncbi, which the
# index-aligned assignments to `table` below rely on - confirm in gustav.mapper.
hpa_df = mapper.symbol_2_gene_ncbi(hpa_df.rename(columns={'Gene':'symbol'}).set_index('symbol'), taxon_ncbi=9606)

In [45]:
# Keep only the first row for every duplicated index label.
hpa_df = hpa_df[~hpa_df.index.duplicated(keep='first')]

In [46]:
# Each flag below is computed on hpa_df and assigned to `table` by index
# alignment; genes missing from hpa_df become NaN and are then set to False.
table['nextprot_evidence'] = hpa_df['NeXtProt evidence'] == 'Evidence at protein level'
table['nextprot_evidence'] = table['nextprot_evidence'].fillna(False)

In [47]:
table['hpa_evidence'] = hpa_df['HPA evidence'] == 'Evidence at protein level'
table['hpa_evidence'] = table['hpa_evidence'].fillna(False)

In [48]:
table['uniprot_evidence'] = hpa_df['UniProt evidence'] == 'Evidence at protein level'
table['uniprot_evidence'] = table['uniprot_evidence'].fillna(False)

In [49]:
# True when the 'Protein class' text contains 'membrane proteins'; rows with
# a missing protein class yield NaN and are set to False.
table['membrane_protein'] = hpa_df['Protein class'].str.contains('membrane proteins')
table['membrane_protein'] = table['membrane_protein'].fillna(False)

In [50]:
# True when the 'Antibody' field is not missing.
table['antibody'] = ~hpa_df['Antibody'].isna()
table['antibody'] = table['antibody'].fillna(False)

In [51]:
# List the available hpa_df columns, one per line (inspection only).
for col in hpa_df.columns:
    print(col)

Gene synonym
Ensembl
Gene description
Uniprot
Chromosome
Position
Protein class
Biological process
Molecular function
Disease involvement
Evidence
HPA evidence
UniProt evidence
NeXtProt evidence
RNA tissue specificity
RNA tissue distribution
RNA tissue specificity score
RNA tissue specific nTPM
RNA single cell type specificity
RNA single cell type distribution
RNA single cell type specificity score
RNA single cell type specific nTPM
RNA cancer specificity
RNA cancer distribution
RNA cancer specificity score
RNA cancer specific FPKM
RNA brain regional specificity
RNA brain regional distribution
RNA brain regional specificity score
RNA brain regional specific nTPM
RNA blood cell specificity
RNA blood cell distribution
RNA blood cell specificity score
RNA blood cell specific nTPM
RNA blood lineage specificity
RNA blood lineage distribution
RNA blood lineage specificity score
RNA blood lineage specific nTPM
RNA cell line specificity
RNA cell line distribution
RNA cell line specificity scor

In [52]:
# Reliability labels that count as "approved".
accepted_reliability = ['Supported', 'Approved', 'Enhanced']

# Output flag -> hpa_df reliability column it is derived from.
reliability_sources = {
    'approved_ih': 'Reliability (IH)',
    'approved_if': 'Reliability (IF)',
}

for flag_column, reliability_column in reliability_sources.items():
    # Index-aligned assignment; genes missing from hpa_df become False.
    table[flag_column] = hpa_df[reliability_column].isin(accepted_reliability)
    table[flag_column] = table[flag_column].fillna(False)

IDG understudied proteins

In [53]:
# True when the gene's nomenclature-authority symbol appears in the 'Gene'
# field of the IDG target list.
table['idg_understudied'] = table['symbol_from_nomenclature_authority']\
.isin(pd.read_json('../data/IDG_TargetList_CurrentVersion.json')['Gene'].values)

AddGene

In [54]:
import json

# Parse the Addgene export straight from the file handle; the records to
# tabulate sit under the top-level 'data' key.
with open('../data/addgene-list-data.json') as addgene_file:
    data = json.load(addgene_file)

addgene = pd.DataFrame(data['data'])

In [55]:
# True when the gene's NCBI ID occurs among the Addgene 'entrez-id' values
# (cast to int for the comparison).
table['plasmid'] = table.index.isin(addgene['entrez-id'].values.astype(int))

CHEMBL

In [56]:
# ChEMBL export (tab-separated); its 'UniProt Accessions' column is used below.
chembl_df = pd.read_csv('../data/chembl_220920.tsv', sep='\t')

In [57]:
# UniProt accession -> NCBI gene mapping.
# NOTE(review): the meaning of taxon_ncbi='vip' is defined in gustav.ebi - confirm there.
uniprot = ebi.uniprot(dataset='uniprot_to_ncbi_gene', taxon_ncbi='vip')

In [58]:
# Flat list of every UniProt accession referenced by a ChEMBL row. A cell may
# hold several accessions separated by '|'; non-string cells are skipped.
uniprot_ids = [
    accession
    for accession_field in chembl_df['UniProt Accessions']
    if type(accession_field) == str
    for accession in accession_field.split('|')
]

In [59]:
# Human genes (taxon 9606) whose UniProt accession appears in the ChEMBL export.
druggable_genes = uniprot[uniprot['protein_uniprot'].isin(uniprot_ids) & (uniprot['taxon_ncbi'] == 9606)]['gene_ncbi'].values

In [60]:
table['compound'] = table.index.isin(druggable_genes)

Various annotations

In [61]:
%%time
# Gene-set membership table; the columns 'year', 'SYSTEMATIC_NAME' and
# 'gene_ncbi' are used in the cells below.
msigdb = pd.read_parquet('../data/msigdb.parquet')

# Gene-set metadata; entries without MEMBERS_EZID are dropped.
msigdb_names = pd.read_xml('../data/msigdb_v7.5.xml').dropna(subset=['MEMBERS_EZID'])

# Restrict the metadata to gene sets contributed by these organisations.
msigdb_names_slice = msigdb_names[msigdb_names['CONTRIBUTOR_ORG'].isin(['Gene Ontology Consortium', 
                                      'Reactome', 
                                      'The Jackson Laboratory (JAX)', 
                                      'WikiPathways', 
                                      'NCI, NIH and Nature Publishing Group', 
                                      'Kyoto Encyclopedia of Genes and Genomes', 'BioCarta'])]
curated_sets = msigdb_names_slice['SYSTEMATIC_NAME'].values

CPU times: total: 6.02 s
Wall time: 6.03 s


In [62]:
# Keep only the rows labelled with year 2022.
msigdb = msigdb[msigdb['year'] == 2022]

In [63]:
# CONTRIBUTOR_ORG -> list of SYSTEMATIC_NAME values of its gene sets.
contributor_dict = msigdb_names_slice.groupby('CONTRIBUTOR_ORG')['SYSTEMATIC_NAME'].apply(list).to_dict()

In [64]:
# Output column in `table` -> MSigDB CONTRIBUTOR_ORG whose gene sets are
# counted. This single loop replaces seven copy-pasted cells that differed
# only in these two strings.
contributor_columns = {
    'n_biocarta': 'BioCarta',
    'n_reactome': 'Reactome',
    'n_hpo': 'The Jackson Laboratory (JAX)',
    'n_wikipathways': 'WikiPathways',
    'n_pid': 'NCI, NIH and Nature Publishing Group',
    'n_kegg': 'Kyoto Encyclopedia of Genes and Genomes',
    'n_go': 'Gene Ontology Consortium',
}

for name, contributor in contributor_columns.items():
    # Direct indexing raises a clear KeyError if a contributor is missing from
    # contributor_dict (the previous .get() passed None on to isin()).
    contributor_sets = contributor_dict[contributor]
    in_contributor_sets = msigdb['SYSTEMATIC_NAME'].isin(contributor_sets)

    # Number of membership rows per gene within this contributor's gene sets;
    # genes that appear in none of them get 0.
    table[name] = msigdb[in_contributor_sets].groupby('gene_ncbi')['year'].count()
    table[name] = table[name].fillna(0)

In [71]:
# Boolean convenience flag derived from the type_of_gene annotation.
table['protein_coding'] = table['type_of_gene'] == 'protein-coding'

Gene length

In [72]:
# Per-gene 'SumACGT' value for taxon 9606, used as the gene-length feature.
# NOTE(review): presumably the summed count of A/C/G/T bases - confirm in access_framework.
gene_length = access_framework.genbank_gene(9606)[['gene_ncbi','SumACGT']]

In [73]:
# Aligned on gene_ncbi; genes without an entry stay NaN.
table['gene_length'] = gene_length.set_index('gene_ncbi')['SumACGT']

Loss of function tolerance

In [74]:
# Constraint metrics; only rows flagged 'canonical' and carrying a pLI value
# are kept.
df = publications.karczewski_2020(dataset='full_constraint_metrics', to_ncbi='unambiguous')
df = df[df['canonical']].reset_index()
df = df[['gene_ncbi', 'pli']].dropna()
# A gene counts as loss-of-function intolerant when pLI exceeds 0.9.
df['loss_of_function_intolerant'] = df['pli'] > 0.9
# Sorting True first and keeping the first row per gene marks a gene as
# intolerant if ANY of its remaining rows is intolerant.
df = df.sort_values('loss_of_function_intolerant', 
                    ascending=False).drop_duplicates(subset=['gene_ncbi'], keep='first')
# Genes without a pLI value stay NaN here (no fillna is applied).
table['loss_of_function_intolerant'] = df.set_index('gene_ncbi')['loss_of_function_intolerant']

Patents

In [75]:
# Patented-gene list; the column header sits on the fourth line of the file
# (header=3) and the 'Matching Gene' column is matched against gene symbols below.
patented = pd.read_csv('../data/patented_genes_rosenfeld_2013.csv', header=3)['Matching Gene'].values

In [76]:
table['previously_patented'] = table['symbol_from_nomenclature_authority'].isin(patented)

Hydrophobicity

In [77]:
# Amino-acid derived per-gene features for taxon 9606.
amino_acid_derived = access_framework.aminoacids_swissprot(9606)

In [78]:
# Hydrophobicity feature taken from the 'gravy_ignoring_O_and_U' column;
# aligned on gene_ncbi, genes without an entry stay NaN.
table['normalized_gravy'] = amino_acid_derived.set_index('gene_ncbi')['gravy_ignoring_O_and_U']

Druggability

In [79]:
# Druggability list keyed by Ensembl gene ID.
druggability = pd.read_csv('../data/druggability_finan_et_al_2017.csv')

In [80]:
# True when the gene's Ensembl ID is listed in the druggability file.
table['druggable'] = table['gene_ensembl'].isin(druggability['ensembl_gene_id'])

In [81]:
# Persist the assembled feature table; the gene_ncbi index is written as the
# first CSV column.
table.to_csv('../data/main_table_221115.csv')

Subject counts

In [82]:
# Reload from disk without index_col, so gene_ncbi is now an ordinary column
# and the frame has a default integer index.
# NOTE(review): this cell emits a warning when run; if it is a mixed-dtype
# warning, pass explicit dtypes or low_memory=False - confirm.
table = pd.read_csv('../data/main_table_221115.csv')

  table = pd.read_csv('../data/main_table_221115.csv')


In [83]:
# Per-gene subject counts; every column after the first is a subject term.
gene2pubmed_counts = pd.read_csv('../data/gene2pubmed_subject_counts_221005.csv')
top_terms = gene2pubmed_counts.columns[1:]

In [84]:
# Left join on the columns shared by both frames, keeping every table row.
# NOTE(review): presumably the only shared column is gene_ncbi - confirm.
table = pd.merge(table, gene2pubmed_counts, how='left')

In [85]:
# Genes without subject counts get 0.0 for every term.
table[top_terms] = table[top_terms].fillna(0.0)

In [86]:
table.to_csv('../data/main_table_with_subject_counts_221116.csv', index=False)

In [87]:
# Round-trip through the CSV just written, so `table` matches exactly what
# downstream consumers of the file will read.
table = pd.read_csv('../data/main_table_with_subject_counts_221116.csv')

  table = pd.read_csv('../data/main_table_with_subject_counts_221116.csv')


In [88]:
# Write the column names of the final table, one per line, without index or
# header row.
pd.DataFrame(table.columns).to_csv('../data/main_table_columns.csv', index=False, header=None)