In [1]:
import pandas as pd
import re

We read the CSV file containing the FANTOM annotations and load it into a DataFrame.

In [2]:
# DataFrame with the FANTOM annotations. read_csv is called with its default
# header handling, so the first line of the file ends up as the column names.
df = pd.read_csv("anno-fantom.csv")
df

Unnamed: 0,T000001,p1@LOXL4,84171,17171,Q96JB6,LOXL4,lysyl oxidase like 4,gene with protein product
0,T000006,p1@PYROXD2,84795,23517,Q8N2H3,PYROXD2,pyridine nucleotide-disulphide oxidoreductase ...,gene with protein product
1,T000007,p2@PYROXD2,84795,23517,Q8N2H3,PYROXD2,pyridine nucleotide-disulphide oxidoreductase ...,gene with protein product
2,T000014,p1@HPS1,3257,5163,"Q92902,Q658M9,Q8WXE5",HPS1,"HPS1, biogenesis of lysosomal organelles compl...",gene with protein product
3,T000023,p1@HPSE2,60495,18374,Q8WWQ2,HPSE2,heparanase 2 (inactive),gene with protein product
4,T000024,p3@HPSE2,60495,18374,Q8WWQ2,HPSE2,heparanase 2 (inactive),gene with protein product
...,...,...,...,...,...,...,...,...
87548,T201777,p1@PRKY,5616,9444,,PRKY,"protein kinase, Y-linked, pseudogene",pseudogene
87549,T201778,p4@PRKY,5616,9444,,PRKY,"protein kinase, Y-linked, pseudogene",pseudogene
87550,T201779,p3@PRKY,,9444,O43930,PRKY,"protein kinase, Y-linked, pseudogene",pseudogene
87551,T201797,"p1@TSPY4,p1@TSPY8",728403728395,3747137287,"F8VZD1,A6NGL4,P0CV99,P0CW00,P0CW01",TSPY8,"testis specific protein, Y-linked 8",gene with protein product


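The file appears to have no header row, so pandas has used the first record as the column names. We simply rename the columns; the first record is lost in the process, which is acceptable because we don't need it.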

In [3]:
# The first record of the file was read as the header, so the current column
# names are that record's values. Map them to meaningful names, then discard
# the FANTOM identifier column ('id'), which is not used afterwards.
#
# The frame is rebound rather than mutated with inplace=True, and the drop
# tolerates an already-missing 'id' column, so re-running this cell is a no-op
# instead of raising KeyError.
df = df.rename(
    columns={
        'T000001': 'id',
        'p1@LOXL4': 'gene',
        '84171': 'n1',
        '17171': 'n2',
        'Q96JB6': 'n3',
        'LOXL4': 'n4',
        'lysyl oxidase like 4': 'descrizione1',
        'gene with protein product': 'descrizione2',
    }
).drop(columns=['id'], errors='ignore')

We create a set with the 170 genes most associated with breast cancer (taken from Enrichr).

In [4]:
# Gene symbols associated with breast cancer (list taken from Enrichr),
# grouped by initial letter for easier lookup.
breast_cancer_genes = {
    "ABCB1", "ABCG2", "ADIPOQ", "AHR", "AKT1", "ALDH1A1", "AR", "ATM", "AURKA",
    "BARD1", "BCL2", "BIRC5", "BMI1", "BRCA1", "BRCA2", "BRIP1", "BRMS1",
    "CA9", "CASP8", "CAV1", "CCAR2", "CCL2", "CCND1", "CCNE1", "CD24", "CD274", "CD44",
    "CDH1", "CDK4", "CDK6", "CDKN1A", "CDKN1B", "CDKN2A", "CHEK2", "COMT", "CTNNB1",
    "CXCL12", "CXCL8", "CXCR4", "CYP17A1", "CYP19A1", "CYP1A1", "CYP1B1", "CYP2D6", "CYP3A4",
    "DNMT1",
    "E2F1", "EGF", "EGFR", "ELAVL1", "EPCAM", "ERBB2", "ERBB3", "ERBB4", "ERCC1", "ERCC2",
    "ESR1", "ESR2", "EZH2",
    "FGFR1", "FGFR2", "FGFR4", "FOXA1", "FOXM1", "FOXO3", "FOXP3",
    "GATA3", "GPER1", "GSK3B", "GSTM1", "GSTP1", "GSTT1",
    "HIF1A", "HOTAIR", "HSD17B1",
    "IGF1", "IGF1R", "IGFBP3", "IL10", "IL1B", "IL6", "ITGB1", "ITGB3",
    "JAK2", "JUN",
    "KDR", "KRAS",
    "LEP", "LEPR",
    "MAPK1", "MAPK14", "MAPK3", "MDM2", "MET", "MIR146A", "MIR155", "MIR200C", "MIR21",
    "MIR34A", "MKI67", "MMP1", "MMP14", "MMP2", "MMP9", "MTDH", "MTHFR", "MTOR", "MUC1", "MYC",
    "NAT2", "NBN", "NCOA3", "NFE2L2", "NFKB1", "NME1", "NOTCH1", "NQO1", "NRG1",
    "PALB2", "PARP1", "PGR", "PIK3CA", "PIN1", "PLAU", "PPARG", "PRLR", "PTEN", "PTGS2", "PTK2",
    "RAC1", "RAD51", "RAD51C", "RASSF1", "RB1", "RELA", "RHOA", "RUNX2",
    "SERPINB5", "SERPINE1", "SIRT1", "SKP2", "SNAI1", "SNAI2", "SOD2", "SOX2", "SP1", "SPP1",
    "SRC", "STAT3", "SULT1A1",
    "TERT", "TGFB1", "TNF", "TNFSF10", "TNFSF11", "TOP2A", "TOX3", "TP53", "TP63", "TWIST1", "TYMS",
    "VDR", "VEGFA",
    "WWTR1",
    "XRCC1", "XRCC3",
    "YAP1", "YBX1",
    "ZEB1",
}

In [5]:
# Sanity check: number of distinct gene symbols in the set.
len(breast_cancer_genes)

169

The set actually contains 169 genes linked to breast cancer; Enrichr reported 170 because it only allows the number of selected genes to be a multiple of 10.

Now we extract from the dataframe only the genes of interest.

In [6]:
# Keep only the annotation rows whose gene symbol (column n4) belongs to the
# breast-cancer gene set.
#
# A vectorised boolean mask replaces the row-by-row iterrows() loop: it is far
# faster on a table of this size, keeps the original column dtypes and row
# labels, and still returns a frame with all columns when no gene matches
# (so later accesses such as df2.n4 keep working).
# .copy() makes df2 an independent frame rather than a view on df.
df2 = df[df["n4"].isin(breast_cancer_genes)].copy()
df2

Unnamed: 0,gene,n1,n2,n3,n4,descrizione1,descrizione2
275,p3@CYP17A1,1586,2593,,CYP17A1,cytochrome P450 family 17 subfamily A member 1,gene with protein product
276,p2@CYP17A1,1586,2593,,CYP17A1,cytochrome P450 family 17 subfamily A member 1,gene with protein product
277,p1@CYP17A1,1586,2593,,CYP17A1,cytochrome P450 family 17 subfamily A member 1,gene with protein product
278,p5@CYP17A1,1586,2593,"P05093,Q1HB44",CYP17A1,cytochrome P450 family 17 subfamily A member 1,gene with protein product
279,p4@CYP17A1,1586,2593,"P05093,Q1HB44",CYP17A1,cytochrome P450 family 17 subfamily A member 1,gene with protein product
...,...,...,...,...,...,...,...
87509,p2@CD24,100133941,1645,,CD24,CD24 molecule,gene with protein product
87510,p6@CD24,100133941,1645,,CD24,CD24 molecule,gene with protein product
87511,p3@CD24,100133941,1645,,CD24,CD24 molecule,gene with protein product
87512,p5@CD24,100133941,1645,,CD24,CD24 molecule,gene with protein product


It seems that almost all of the genes, if not all, were found. Let's check:

In [7]:
# Distinct gene symbols present in the filtered annotations.
df2["n4"].unique()

array(['CYP17A1', 'FGFR2', 'MKI67', 'BMI1', 'ZEB1', 'ITGB1', 'CXCL12',
       'SIRT1', 'PLAU', 'GATA3', 'PTEN', 'PGR', 'YAP1', 'MMP1', 'ATM',
       'CD44', 'RELA', 'BRMS1', 'GSTP1', 'CCND1', 'IGF1', 'CDKN1B',
       'KRAS', 'FOXM1', 'VDR', 'SP1', 'HOTAIR', 'ERBB3', 'CDK4', 'MDM2',
       'BRCA2', 'TNFSF11', 'RB1', 'XRCC3', 'AKT1', 'MMP14', 'FOXA1',
       'HIF1A', 'ESR2', 'RAD51', 'CYP19A1', 'CYP1A1', 'IGF1R', 'PALB2',
       'SULT1A1', 'MAPK3', 'TOX3', 'MMP2', 'CDH1', 'NQO1', 'CCL2',
       'ERBB2', 'TOP2A', 'STAT3', 'HSD17B1', 'BRCA1', 'ITGB3', 'NME1',
       'RAD51C', 'MIR21', 'BRIP1', 'TP53', 'BIRC5', 'BCL2', 'SERPINB5',
       'TYMS', 'DNMT1', 'CCNE1', 'TGFB1', 'XRCC1', 'ERCC2', 'ERCC1',
       'ELAVL1', 'PIN1', 'GSTM1', 'MTOR', 'MTHFR', 'MUC1', 'PTGS2',
       'IL10', 'PARP1', 'YBX1', 'JUN', 'LEPR', 'E2F1', 'SRC', 'MMP9',
       'NCOA3', 'SNAI1', 'AURKA', 'MIR155', 'COMT', 'MAPK1', 'GSTT1',
       'CHEK2', 'CYP2D6', 'IL1B', 'CXCR4', 'NFE2L2', 'CASP8', 'ERBB4',
       'BARD1', 'C

In [8]:
# Gene symbols from the breast-cancer set that have at least one annotation row.
found_in_fantom = set(df2.n4.unique())

# Symbols with no annotation at all. They are sorted because set iteration
# order for strings can change between kernel sessions, which would make the
# printed message differ from run to run.
missing_genes = sorted(breast_cancer_genes - found_in_fantom)
print(f"The genes not found in FANTOM5 are: {', '.join(missing_genes)}")

The genes not found in FANTOM5 are: MIR34A, MIR200C


The final dataframe is saved as a csv file.

In [9]:
# Renumber the rows 0..n-1 so that the index, exported under the 'id' label,
# is a contiguous counter, then write the result to disk.
df2 = df2.reset_index(drop=True)
df2.to_csv("anno-fantom-cancer.csv", encoding='utf-8', index_label='id')