### Import

In [105]:
import pandas as pd
import numpy as np
df = pd.read_csv("../analysis/ncbi_info_20220719_filtered.csv")

  df = pd.read_csv("../analysis/ncbi_info_20220719_filtered.csv")


### Drop all columns besides ENSG ID, gene_symbol, and alias_symbol

In [90]:
mini_ncbi_df = df.drop(['Unnamed: 0', '#tax_id','GeneID', 'dbXrefs', 'description', 'type_of_gene', 'Symbol_from_nomenclature_authority', 'Full_name_from_nomenclature_authority', 'Other_designations', 'MIM', 'HGNC', 'AllianceGenome','MIRbase', 'IMGTgene_db', 'dash', 'unknown'], axis=1)
mini_ncbi_df = mini_ncbi_df.rename(columns = {'Symbol':'gene_symbol','Synonyms':'alias_symbol', 'ENSEMBL':'ENSG ID'})
mini_ncbi_df['ENSG ID'] = mini_ncbi_df['ENSG ID'].astype(str)
mini_ncbi_df['ENSG ID'] = mini_ncbi_df['ENSG ID'].apply(str.upper)
mini_ncbi_df.head(1)

Unnamed: 0,gene_symbol,alias_symbol,ENSG ID
0,A1BG,A1B|ABG|GAB|HYST2477,ENSG00000121410


### The gene_symbol column has duplicates
##### (I ran set intersection b/w gene & alias and rcvd error saying set lengths didnt match)

In [91]:
mini_ncbi_df['symbol_duplicates'] = mini_ncbi_df.duplicated(subset= 'gene_symbol', keep=False)
dup_symbol_mini_ncbi_df = mini_ncbi_df[mini_ncbi_df['symbol_duplicates'] == True]
len(dup_symbol_mini_ncbi_df)

205

##### What is the difference b/w these duplicates? Can we get rid of them? 
##### It looks like the rows with aliases have all of the information as those that do not, so I think it would be safe to keep those with aliases and discard those without.

In [92]:
df[df['Symbol'].str.contains('RNR1')]

Unnamed: 0.1,Unnamed: 0,#tax_id,GeneID,Symbol,Synonyms,dbXrefs,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Other_designations,MIM,HGNC,ENSEMBL,AllianceGenome,MIRbase,IMGTgene_db,dash,unknown
3576,3576,9606,4549,RNR1,MTRNR1,MIM:561000|HGNC:HGNC:7470|AllianceGenome:HGNC:...,s-rRNA,rRNA,MT-RNR1,mitochondrially encoded 12S RNA,-,561000.0,7470.0,,hgnc:7470,,,,
4794,4794,9606,6052,RNR1,-,MIM:180450|HGNC:HGNC:10082,"RNA, ribosomal 45S cluster 1",other,RNR1,"RNA, ribosomal 45S cluster 1","45S rDNA cluster 1|RNA, ribosomal 1|RNA, ribos...",180450.0,10082.0,,,,,,


In [93]:
unique_in_dup = set(dup_symbol_mini_ncbi_df.gene_symbol)
len(unique_in_dup)

51

##### Sort the duplicates so that the ones with empty alias_symbols columns would be first and dropped

In [94]:
subset = ['gene_symbol']

mini_ncbi_df.sort_values(subset + ['alias_symbol'], inplace=True)
mini_ncbi_df.drop_duplicates(subset, keep='last', inplace=True)

len(mini_ncbi_df)

75346

75346 This number is good. Original df had 75500 rows. Duplicated rows total was 205. Unique values are 51. 205-51 is 154 duplicates I need to get rid of. 75500-154 is 75346. Also RNR1 is one I knew had a duplicate and it doesnt anymore

In [95]:
mini_ncbi_df = mini_ncbi_df.drop(['symbol_duplicates'], axis=1)
mini_ncbi_df[mini_ncbi_df['gene_symbol'].str.contains('RNR1')]

Unnamed: 0,gene_symbol,alias_symbol,ENSG ID
3576,RNR1,MTRNR1,NAN


### Make all of gene_symbol a set

In [96]:
all_gene_symbol_set = set(mini_ncbi_df['gene_symbol'])

### Make each row in alias_symbol a set:
    covert to a list 
    make a set

In [97]:
alias_symbol_sets = mini_ncbi_df.alias_symbol

In [98]:
mini_ncbi_df['alias_symbol'] = mini_ncbi_df['alias_symbol'].astype(str)
mini_ncbi_df['alias_symbol'] = [x.split('|') for x in mini_ncbi_df.alias_symbol]
mini_ncbi_df['alias_symbol']=np.where(mini_ncbi_df.alias_symbol=='','',mini_ncbi_df.alias_symbol.map(set))
mini_ncbi_df.head(1)

Unnamed: 0,gene_symbol,alias_symbol,ENSG ID
75493,12S rRNA,{-},NAN


### Add test for false positives in the intersection points
#### (places where x in alias_symbol matches x in mini_hgnc_df.gene_symbol in the same row)

In [99]:
mini_ncbi_df['gene_symbol'] = [x.split(';') for x in mini_ncbi_df.gene_symbol]
mini_ncbi_df['gene_symbol'] =np.where(mini_ncbi_df.gene_symbol =='','',mini_ncbi_df.gene_symbol.map(set))
false_pos_mini_ncbi_df = mini_ncbi_df[mini_ncbi_df.alias_symbol.apply(lambda x: x & all_gene_symbol_set) == mini_ncbi_df.gene_symbol]
false_pos_mini_ncbi_df.head()

Unnamed: 0,gene_symbol,alias_symbol,ENSG ID


### Find intersection points using alias symbol sets and gene_symbol

In [100]:
mini_ncbi_df['intersect_point'] = mini_ncbi_df.alias_symbol.apply(lambda x: x & all_gene_symbol_set)
true_mini_ncbi_df = mini_ncbi_df[mini_ncbi_df.intersect_point != set()]
true_mini_ncbi_df.head()

Unnamed: 0,gene_symbol,alias_symbol,ENSG ID,intersect_point
11270,{A1CF},"{ACF65, ACF64, ASP, APOBEC1CF, ACF}",ENSG00000148584,{ACF}
18449,{A2ML1},"{CPAMD9, p170, OMS}",ENSG00000166535,{OMS}
9,{AANAT},"{DSPS, SNAT}",ENSG00000129673,{DSPS}
8104,{ABCA7},"{ABCX, ABCA-SSN, AD9}",ENSG00000064687,{AD9}
181,{ABCD1},"{ALDP, ALD, AMN, ABC42}",ENSG00000101986,{AMN}


### Save true intersect points

In [104]:
true_mini_ncbi_df.to_csv('../analysis/true_ncbi_alias_gene_intersections.csv', index=False)
len(true_mini_ncbi_df)

1712