### Import 

In [20]:
import pandas as pd
import numpy as np
df = pd.read_csv("../analysis/ncbi_GRCh38.p14_filtered_xrefsplit.csv")

### Drop all columns besides gene_symbol, and alias_symbol

In [21]:
mini_ncbiGRCh38_df = df.drop(['Unnamed: 0', 'gene', 'xref_IMGT_gene_db', 'xref_key_not_found', 'xref_hgncID', 'xref_mimID', 'xref_miRBaseID', 'xref_ncbiID', 'description', 'name'], axis=1)
mini_ncbiGRCh38_df = mini_ncbiGRCh38_df.rename(columns = {'gene_id':'gene_symbol','aliases':'alias_symbol'})
mini_ncbiGRCh38_df['gene_symbol'] = mini_ncbiGRCh38_df['gene_symbol'].str.replace('gene-', '')
mini_ncbiGRCh38_df.head(1)

Unnamed: 0,gene_symbol,alias_symbol
0,DDX11L1,


### Make all of gene_symbol a set

In [22]:
all_gene_symbol_set = set(mini_ncbiGRCh38_df['gene_symbol'])

### Make each row in alias_symbol a set:
    covert to a list 
    make a set

In [23]:
mini_ncbiGRCh38_df['alias_symbol'] = mini_ncbiGRCh38_df['alias_symbol'].astype(str)
mini_ncbiGRCh38_df['alias_symbol'] = [x.split(';') for x in mini_ncbiGRCh38_df.alias_symbol]
mini_ncbiGRCh38_df['alias_symbol']=np.where(mini_ncbiGRCh38_df.alias_symbol=='','',mini_ncbiGRCh38_df.alias_symbol.map(set))
mini_ncbiGRCh38_df.head(1)

Unnamed: 0,gene_symbol,alias_symbol
0,DDX11L1,{nan}


### Add test for false positives in the intersection points
#### (places where x in alias_smbol matches x in mini_hgnc_df.gene_symbol in the same row)

In [24]:
mini_ncbiGRCh38_df['gene_symbol'] = [x.split(';') for x in mini_ncbiGRCh38_df.gene_symbol]
mini_ncbiGRCh38_df['gene_symbol'] =np.where(mini_ncbiGRCh38_df.gene_symbol =='','',mini_ncbiGRCh38_df.gene_symbol.map(set))
false_pos_mini_ncbiGRCh38_df = mini_ncbiGRCh38_df[mini_ncbiGRCh38_df.alias_symbol.apply(lambda x: x & all_gene_symbol_set) == mini_ncbiGRCh38_df.gene_symbol]
false_pos_mini_ncbiGRCh38_df

Unnamed: 0,gene_symbol,alias_symbol
43015,{MIR3690-2},"{hsa-mir-3690-1, hsa-mir-3690-2, mir-3690-1, M..."
43027,{MIR6089-2},"{hsa-mir-6089-2, hsa-mir-6089-1, MIR6089-1, MI..."
44999,{CHRNA7-2},"{NACHRA7, CHRNA7-2}"
47573,{GABBR1-3},"{GABBR1-3, GB1, GABABR1, GPRC3A}"


### Need to remove aliases that match their primary key (gene symbol)

In [25]:
mini_ncbiGRCh38_df['alias_symbol'] = mini_ncbiGRCh38_df.alias_symbol - mini_ncbiGRCh38_df.gene_symbol
mini_ncbiGRCh38_df.head(1)

Unnamed: 0,gene_symbol,alias_symbol
0,{DDX11L1},{nan}


#### Find intersection points using new alias symbol sets

In [26]:
mini_ncbiGRCh38_df['intersect_point'] = mini_ncbiGRCh38_df.alias_symbol.apply(lambda x: x & all_gene_symbol_set)
true_mini_ncbiGRCh38_df = mini_ncbiGRCh38_df[mini_ncbiGRCh38_df.intersect_point != set()]
true_mini_ncbiGRCh38_df.head(1)

Unnamed: 0,gene_symbol,alias_symbol,intersect_point
1,{WASH7P},"{FAM39F, WASH5P}",{WASH5P}


#### Save true intersect points

In [27]:
true_mini_ncbiGRCh38_df.to_csv('../analysis/true_ncbiGRCh38_alias_gene_intersections.csv', index=False)
len(true_mini_ncbiGRCh38_df)

1745