# Task 1

In [1]:
import pandas as pd

In [2]:
#Path to the gene information file
gene_info_file_path = "/content/Homo_sapiens.gene_info.gz"

#  read the tab  delimiter  file witin the type of the compression (gzip)
gene_info_df = pd.read_csv(gene_info_file_path, sep='\t', compression='gzip')

In [3]:
# show part of the gene information file
gene_info_df.head(4)

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20230621,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20230621,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20230329,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20230621,-


In [4]:
# get the required columns for the investigation
gene_info_df = gene_info_df[["GeneID","Symbol","Synonyms"]]
gene_info_df.head(4)

Unnamed: 0,GeneID,Symbol,Synonyms
0,1,A1BG,A1B|ABG|GAB|HYST2477
1,2,A2M,A2MD|CPAMD5|FWP007|S863-7
2,3,A2MP1,A2MP
3,9,NAT1,AAC1|MNAT|NAT-1|NATI


In [5]:
# separate items in the Synonyms columns to a list
gene_info_df["Synonyms"] = gene_info_df["Synonyms"].str.split("|")
gene_info_df.head(3)


Unnamed: 0,GeneID,Symbol,Synonyms
0,1,A1BG,"[A1B, ABG, GAB, HYST2477]"
1,2,A2M,"[A2MD, CPAMD5, FWP007, S863-7]"
2,3,A2MP1,[A2MP]


In [6]:
# separate items in the Synonyms columns to new rows with same GeneID
gene_info_df = gene_info_df.explode('Synonyms')

# explore the first GeneID with its Symbol and Synonyms
gene_info_df.head(6)

Unnamed: 0,GeneID,Symbol,Synonyms
0,1,A1BG,A1B
0,1,A1BG,ABG
0,1,A1BG,GAB
0,1,A1BG,HYST2477
1,2,A2M,A2MD
1,2,A2M,CPAMD5


In [7]:
# mapping the Symbol and GeneID columns
map_symbol = dict(zip(gene_info_df.Symbol, gene_info_df.GeneID))

# mapping the Synonyms and GeneID columns
map_synonyms = dict(zip(gene_info_df.Synonyms, gene_info_df.GeneID))

# concatonate betwwn two dictionaries
map_symbol_synonyms = {}
map_symbol_synonyms.update(map_synonyms)
map_symbol_synonyms.update(map_symbol)
len(map_symbol_synonyms)

258155

In [8]:
def get_geneid(gene_name, gene_dict):
    """
    Function to get the geneID from the name of the gene (symbol or synonym)
    using a dictionary.

    Input:
       gene_name (str): The name of the gene to retrieve the geneID.
       gene_dict (dict): The dictionary containing genes as keys and
                          their corresponding geneIDs as values.

    Output:
    geneid (str or None): The geneID corresponding to the gene_name, or Gene not found
    if the gene_name is not found in the dictionary.
    """

    geneid = gene_dict.get(gene_name)
    return geneid


In [9]:
get_geneid("HYST2477",map_symbol_synonyms)

1

# Task 2

In [10]:
gmt_file_path = "/content/h.all.v2023.1.Hs.symbols.gmt"

pathways_list = []

# read the file line by line
with open(gmt_file_path, 'r') as file:
    for line in file:
        pathway_info = line.strip().split('\t')

        # Separate the pathway information for each row
        pathway_name = pathway_info[0]
        pathway_description = pathway_info[1]
        gene_names = pathway_info[2:]  # All subsequent values are gene names

        # Append the each pathway info to the list
        pathways_list.append([pathway_name, pathway_description] + gene_names)

# Create the DataFrame from the list of pathway_data_list
columns = ['Pathway Name', 'Pathway Description'] + [f'Gene_{i}' for i in range(1, len(pathways_list[0]) - 1)]
pathways_df = pd.DataFrame(pathways_list, columns=columns)
pathways_df.head(3)

Unnamed: 0,Pathway Name,Pathway Description,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,...,Gene_191,Gene_192,Gene_193,Gene_194,Gene_195,Gene_196,Gene_197,Gene_198,Gene_199,Gene_200
0,HALLMARK_TNFA_SIGNALING_VIA_NFKB,http://www.gsea-msigdb.org/gsea/msigdb/human/g...,JUNB,CXCL2,ATF3,NFKBIA,TNFAIP3,PTGS2,CXCL1,IER3,...,EIF1,BMP2,DUSP4,PDLIM5,ICOSLG,GFPT2,KLF2,TNC,SERPINB8,MXD1
1,HALLMARK_HYPOXIA,http://www.gsea-msigdb.org/gsea/msigdb/human/g...,PGK1,PDK1,GBE1,PFKL,ALDOA,ENO2,PGM1,NDRG1,...,HDLBP,ILVBL,NCAN,TGM2,ETS1,HOXB9,SELENBP1,FOSL2,SULT2B1,TGFB3
2,HALLMARK_CHOLESTEROL_HOMEOSTASIS,http://www.gsea-msigdb.org/gsea/msigdb/human/g...,FDPS,CYP51A1,IDI1,FDFT1,DHCR7,SQLE,HMGCS1,NSDHL,...,,,,,,,,,,


In [11]:
# Replace the gene names with (Gene ID) Entrez ID from Task 1
# apply the get_geneid function to replace the gene names with their ID

pathways_df.iloc[:, 2:] = pathways_df.iloc[:, 2:].applymap(
                                          lambda x: get_geneid(x, map_symbol_synonyms)).fillna('')
pathways_df.head(3)


Unnamed: 0,Pathway Name,Pathway Description,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,...,Gene_191,Gene_192,Gene_193,Gene_194,Gene_195,Gene_196,Gene_197,Gene_198,Gene_199,Gene_200
0,HALLMARK_TNFA_SIGNALING_VIA_NFKB,http://www.gsea-msigdb.org/gsea/msigdb/human/g...,3726,2920,467,4792,7128,5743,2919,8870,...,10209.0,650.0,1846.0,10611.0,23308.0,9945.0,10365.0,3371.0,5271.0,4084.0
1,HALLMARK_HYPOXIA,http://www.gsea-msigdb.org/gsea/msigdb/human/g...,5230,5163,2632,5211,226,2026,5236,10397,...,3069.0,10994.0,1463.0,7052.0,2113.0,3219.0,8991.0,2355.0,6820.0,7043.0
2,HALLMARK_CHOLESTEROL_HOMEOSTASIS,http://www.gsea-msigdb.org/gsea/msigdb/human/g...,2224,1595,3422,2222,1717,6713,3157,50814,...,,,,,,,,,,


In [12]:
# Save the output to new gmt file

output_file = 'output_file.gmt'
pathways_df.to_csv(output_file, sep='\t', index=False, header=False)