In [18]:
import csv
import pandas as pd
pd.set_option('display.max_columns', 500)

import re
from typing import List, Any, Union

%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [19]:
# Presumably the column titles of the HGNC file — TODO confirm against the
# actual header row.
# NOTE(review): neither title list is referenced by any other visible cell.
# NOTE(review): 'Chromosone' looks like a misspelling of 'Chromosome'; it is
# left untouched here because it is runtime data — verify before relying on it.
hgnc_title_list: List[str] = [
    'HGNC ID',
    'Approved Name',
    'Status',
    'Previous Symbols',
    'Synonyms',
    'Chromosone',
    'Accession Numbers',
    'RefSeq IDs',
    'OMIM ID',
    'Ensembl ID',
    'UCSC ID',
]

# Presumably the column titles of the COSMIC mutant export — TODO confirm.
# NOTE(review): 'Mutation' occurs twice, and the 'Mutation AA' column that a
# later cell groups by is not present. Several neighbouring entries
# ('Gene' / 'CDS length', 'Mutation' / 'AA Mutation Description',
# 'Mutation strand SNP', 'Resistance' / 'Mutation') look like headers that were
# split in the wrong place — verify against the TSV header before use.
cosmic_title_list: List[str] = [
    # gene / transcript
    'Gene name', 'Accession Number', 'Gene', 'CDS length', 'HGNC ID',
    # sample / tumour
    'Sample name', 'ID_sample', 'ID_tumour',
    # anatomical site
    'Primary site', 'Site subtype 1', 'Site subtype 2', 'Site subtype 3',
    # histology
    'Primary histology', 'Histology subtype 1', 'Histology subtype 2',
    'Histology subtype 3',
    # mutation
    'Genome-wide screen', 'Mutation ID', 'Mutation CDS', 'Mutation',
    'AA Mutation Description', 'Mutation zygosity', 'LOH', 'GRCh',
    'Mutation genome position', 'Mutation strand SNP', 'Resistance',
    'Mutation', 'FATHMM prediction', 'FATHMM score',
    'Mutation somatic status',
    # provenance
    'Pubmed_PMID', 'ID_STUDY', 'Sample Type', 'Tumour',
]

# Input files. Both are relative paths, so they resolve against the kernel's
# current working directory.
hgnc_location = 'hgnc.txt'
cosmic_location = 'CosmicMutantExport.tsv'

In [20]:
def get_df(file_location):
    """Read a tab-separated file into a pandas DataFrame.

    Args:
        file_location: Path (or anything else ``pd.read_csv`` accepts) of the
            tab-delimited file to parse.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    read_options = {
        'sep': '\t',
        # Parse the file in one pass instead of in internal chunks; per the
        # pandas docs this avoids mixed-type inference within a column.
        'low_memory': False,
    }
    return pd.read_csv(file_location, **read_options)

def aggregate(df, col_name):
    """Collapse ``df`` to one row per distinct value of ``col_name``.

    Args:
        df: DataFrame to group.
        col_name: Column label (or other ``groupby`` key) to group on.

    Returns:
        A DataFrame indexed by the group key in which every remaining column
        holds, for each group, a Python list of that group's values.
    """
    def _collect(values):
        # Materialise one column of one group as a plain list.
        return list(values)

    grouped = df.groupby(col_name)
    return grouped.aggregate(_collect)


In [21]:
# Parse the file named by `cosmic_location` (via get_df) and write the
# resulting frame to the pickle file 'cosmic_pkl'. Both paths are relative, so
# they resolve against the kernel's current working directory.
# NOTE(review): the file is re-parsed on every run of this cell; the pickle
# written here is not read back anywhere in the visible cells.
cosmic_df = get_df(cosmic_location)
cosmic_df.to_pickle('cosmic_pkl')

In [22]:
# Development toggle: uncomment the two lines below (and remove the alias at
# the bottom of this cell) to work on only the first 50 rows of cosmic_df and
# pickle that sample to 'temp_pkl'.
# temp_df = cosmic_df[:50]
# temp_df.to_pickle('temp_pkl')

# With the toggle off, temp_df is simply a second name for the full frame
# (plain assignment, no copy is made).
temp_df = cosmic_df

In [23]:
# One row per distinct 'Gene name'; every other column holds the list of values
# belonging to that gene (see aggregate). The result is pickled to 'gene_pkl'.
# NOTE(review): this cell reads temp_df, whereas the 'Mutation AA' and
# 'Mutation CDS' cells read cosmic_df directly — so the 50-row sampling toggle
# would affect only this aggregation. Confirm that is intended.
gene_df = aggregate(temp_df, 'Gene name')
gene_df.to_pickle('gene_pkl')

In [24]:
# One row per distinct 'Mutation AA' value of the full cosmic_df; every other
# column holds the list of values sharing that key (see aggregate). The result
# is pickled to 'mutation_aa_pkl'.
mutation_aa = aggregate(cosmic_df, 'Mutation AA')
mutation_aa.to_pickle('mutation_aa_pkl')

In [25]:
# One row per distinct 'Mutation CDS' value of the full cosmic_df; every other
# column holds the list of values sharing that key (see aggregate). The result
# is pickled to 'mutation_cds_pkl'.
mutation_cds = aggregate(cosmic_df, 'Mutation CDS')
mutation_cds.to_pickle('mutation_cds_pkl')

In [27]:
# Load the HGNC file, collapse it to one row per 'HGNC ID' (each remaining
# column becomes a list of that ID's values, see aggregate), then move the
# group key out of the index and back into an ordinary 'HGNC ID' column.
#
# reset_index() is chained rather than applied with inplace=True so hgnc_df is
# bound exactly once. aggregate() groups on a single key, so the index has one
# level and a bare reset_index() is equivalent to level=0, drop=False.
hgnc_df = (
    aggregate(get_df(hgnc_location), 'HGNC ID')
    .reset_index()
)

# Persist the flattened frame; the relative path resolves against the kernel's
# current working directory.
hgnc_df.to_pickle('hgnc_pkl')