### Import 

In [172]:
import pandas as pd
import numpy as np
df = pd.read_csv("../analysis/hgnc_filtered.csv")

### Drop all columns besides ENSG ID, gene_symbol, and alias_symbol

In [173]:
mini_hgnc_df = df.drop(['Unnamed: 0', 'hgnc_id', 'locus_type', 'name', 'mane_select', 'locus_group', 'entrez_id', 'agr', 'refseq_accession', 'alias_name', 'ENSEMBLtrans', 'NA', 'unknown'], axis=1)
mini_hgnc_df = mini_hgnc_df.rename(columns = {'ensembl_gene_id':'ENSG', 'symbol':'gene_symbol'})
mini_hgnc_df.head(1)

Unnamed: 0,gene_symbol,ENSG,alias_symbol
0,A1BG,ENSG00000121410,


### Make all of gene_symbol a set

In [174]:
all_gene_symbols_set = set(mini_hgnc_df['gene_symbol'])
# all_gene_symbols_set

### Make each row in alias_symbol a set:
    covert to a list 
    make a set

In [175]:
mini_hgnc_df['alias_symbol'] = mini_hgnc_df['alias_symbol'].astype(str)
type(mini_hgnc_df.alias_symbol[2].split(';'))

list

In [176]:
mini_hgnc_df['alias_symbol'] = [x.split(';') for x in mini_hgnc_df.alias_symbol]
mini_hgnc_df.head(1)

Unnamed: 0,gene_symbol,ENSG,alias_symbol
0,A1BG,ENSG00000121410,[nan]


In [177]:

mini_hgnc_df['alias_symbol']=np.where(mini_hgnc_df.alias_symbol =='','',mini_hgnc_df.alias_symbol.map(set))
mini_hgnc_df.head()

Unnamed: 0,gene_symbol,ENSG,alias_symbol
0,A1BG,ENSG00000121410,{nan}
1,A1BG-AS1,ENSG00000268895,{FLJ23569}
2,A1CF,ENSG00000148584,"{ACF, ACF65, ASP, APOBEC1CF, ACF64}"
3,A2M,ENSG00000175899,"{FWP007, CPAMD5, S863-7}"
4,A2M-AS1,ENSG00000245105,{nan}


### Add test for false positives in the intersection points(places where x in alias_smbol matches x in mini_hgnc_df.gene_symbol in the same row)

In [178]:
mini_hgnc_df['gene_symbol'] = [x.split(';') for x in mini_hgnc_df.gene_symbol]
mini_hgnc_df['gene_symbol'] =np.where(mini_hgnc_df.gene_symbol =='','',mini_hgnc_df.gene_symbol.map(set))
false_pos_mini_hgnc_df = mini_hgnc_df[mini_hgnc_df.alias_symbol.apply(lambda x: x & all_gene_symbols_set) == mini_hgnc_df.gene_symbol]
false_pos_mini_hgnc_df

Unnamed: 0,gene_symbol,ENSG,alias_symbol
351,{ACTRT2},ENSG00000169717,"{Arp-T2, ARPM2, FLJ25424, ACTRT2}"


### Need to remove aliases that match their primary key (gene symbol)

In [179]:
mini_hgnc_df['alias_symbol'] = mini_hgnc_df.alias_symbol - mini_hgnc_df.gene_symbol
display(mini_hgnc_df.iloc[351])

gene_symbol                      {ACTRT2}
ENSG                      ENSG00000169717
alias_symbol    {Arp-T2, ARPM2, FLJ25424}
Name: 351, dtype: object

#### Find intersection points using new alias symbol sets

In [None]:
alias_symbol_sets_series = mini_hgnc_df.alias_symbol

In [None]:
mini_hgnc_df['intersect_point'] = mini_hgnc_df.alias_symbol.apply(lambda x: x & all_gene_symbols_set)
true_mini_hgnc_df = mini_hgnc_df[mini_hgnc_df.intersect_point != set()]
true_mini_hgnc_df

Unnamed: 0,gene_symbol,ENSG,alias_symbol,intersect_point
87,{ABCD1},ENSG00000101986,"{ALDP, AMN, adrenoleukodystrophy}",{AMN}
206,{ACKR2},ENSG00000144648,"{CCR9, CCR10, D6}","{CCR9, CCR10}"
215,{ACOD1},ENSG00000102794,{CAD},{CAD}
275,{ACTBP8},ENSG00000220267,{ACTBP2},{ACTBP2}
399,{ADAM28},ENSG00000042980,"{ADAM23, eMDCII, MDC-Lm, MDC-Ls}",{ADAM23}
...,...,...,...,...
41595,{VSIG2},ENSG00000019102,"{CTXL, CTH}",{CTH}
42442,{ZNF83},ENSG00000167766,"{HPF1, FLJ11015}",{HPF1}
42461,{ZNF106},ENSG00000103994,"{ZNF474, SH3BP3}",{ZNF474}
42468,{ZNF121},ENSG00000197961,"{ZNF20, ZHC32}",{ZNF20}


#### Save true intersect points

In [None]:
true_mini_hgnc_df.to_csv('../analysis/true_hgnc_alias_gene_intersections.csv', index=False)