In [28]:
import os

import pandas as pd
def read_HPA(tsv_file, base = '~/human_protein_atlas/'):
    '''
    extract gene symbols from a Human Protein Atlas protein class file

    for example:
    input: protein_class_Blood.tsv
    output: ~20 blood group antigen proteins' gene symbols

    tsv_file: name of a tab-separated file with one header row; its first
              column is used as the index
    base: directory that contains tsv_file (a leading '~' is expanded by pandas)

    Return: array with the values of the first column (the gene symbols)
    '''
    # os.path.join works whether or not base ends with a path separator
    path = os.path.join(base, tsv_file)
    df = pd.read_csv(path, header = 0, index_col = 0, sep = '\t')
    return df.index.values

In [30]:
# gene symbols read from the blood group antigen protein class file
antigen = read_HPA('protein_class_Blood.tsv')

In [32]:
# index values (gene symbols, per read_HPA's docstring) of protein_class_CD.tsv
cd = read_HPA('protein_class_CD.tsv')

In [41]:
# display the whole array returned by read_HPA for protein_class_CD.tsv
cd

array(['ABCB1', 'ABCG2', 'ACE', 'ACKR1', 'ADAM10', 'ADAM17', 'ADAM8',
       'ADGRE2', 'ADGRE5', 'ALCAM', 'ALK', 'ANPEP', 'ART1', 'ART4',
       'ATP1B3', 'BCAM', 'BMPR1A', 'BMPR1B', 'BSG', 'BST1', 'BST2',
       'BTLA', 'BTN3A1', 'C5AR1', 'CCR1', 'CCR2', 'CCR3', 'CCR4', 'CCR5',
       'CCR6', 'CCR7', 'CCR8', 'CCR9', 'CD101', 'CD109', 'CD14', 'CD151',
       'CD160', 'CD163', 'CD163L1', 'CD164', 'CD180', 'CD19', 'CD1A',
       'CD1B', 'CD1C', 'CD1D', 'CD1E', 'CD2', 'CD200', 'CD207', 'CD209',
       'CD22', 'CD226', 'CD244', 'CD247', 'CD248', 'CD27', 'CD274',
       'CD276', 'CD28', 'CD300A', 'CD300C', 'CD300E', 'CD300LB',
       'CD300LD', 'CD300LF', 'CD300LG', 'CD302', 'CD320', 'CD33', 'CD34',
       'CD36', 'CD37', 'CD38', 'CD3D', 'CD3E', 'CD3G', 'CD4', 'CD40',
       'CD40LG', 'CD44', 'CD46', 'CD47', 'CD48', 'CD5', 'CD52', 'CD53',
       'CD55', 'CD58', 'CD59', 'CD6', 'CD63', 'CD68', 'CD69', 'CD7',
       'CD70', 'CD72', 'CD74', 'CD79A', 'CD79B', 'CD80', 'CD81', 'CD82',
       'CD83

In [39]:
def housekeeping(file = '/home/hsher/tnbc_scrnaseq/data/housekeepers.txt'):
    '''
    Load the housekeeping gene symbols stored in a headerless text file.

    The gene list comes from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5465819/
    and was downloaded from https://github.com/Michorlab/tnbc_scrnaseq

    input: file, path of the gene list (parsed with pd.read_csv, no header row)
    output: array with the values of the first column, i.e. the housekeeping
            gene symbols (98 genes in total in the original list)
    '''
    gene_table = pd.read_csv(file, header = None)
    # with header = None the columns are labelled 0, 1, ...; label 0 is the first one
    symbols = gene_table[0]
    return symbols.values
    

In [40]:
# display the housekeeping gene symbols read from the default file path
housekeeping()

array(['ACTB', 'B2M', 'HNRPLL', 'HPRT', 'PSMB2', 'PSMB4', 'PPIA', 'PRPS1',
       'PRPS1L1', 'PRPS1L3', 'PRPS2', 'PRPSAP1', 'PRPSAP2', 'RPL10',
       'RPL10A', 'RPL10L', 'RPL11', 'RPL12', 'RPL13', 'RPL14', 'RPL15',
       'RPL17', 'RPL18', 'RPL19', 'RPL21', 'RPL22', 'RPL22L1', 'RPL23',
       'RPL24', 'RPL26', 'RPL27', 'RPL28', 'RPL29', 'RPL3', 'RPL30',
       'RPL32', 'RPL34', 'RPL35', 'RPL36', 'RPL37', 'RPL38', 'RPL39',
       'RPL39L', 'RPL3L', 'RPL4', 'RPL41', 'RPL5', 'RPL6', 'RPL7',
       'RPL7A', 'RPL7L1', 'RPL8', 'RPL9', 'RPLP0', 'RPLP1', 'RPLP2',
       'RPS10', 'RPS11', 'RPS12', 'RPS13', 'RPS14', 'RPS15', 'RPS15A',
       'RPS16', 'RPS17', 'RPS18', 'RPS19', 'RPS20', 'RPS21', 'RPS24',
       'RPS25', 'RPS26', 'RPS27', 'RPS27A', 'RPS27L', 'RPS28', 'RPS29',
       'RPS3', 'RPS3A', 'RPS4X', 'RPS5', 'RPS6', 'RPS6KA1', 'RPS6KA2',
       'RPS6KA3', 'RPS6KA4', 'RPS6KA5', 'RPS6KA6', 'RPS6KB1', 'RPS6KB2',
       'RPS6KC1', 'RPS6KL1', 'RPS7', 'RPS8', 'RPS9', 'RPSA', 'TRPS1',
       'UB

In [59]:
def read_haemapedia(species = 'human'):
    '''
    read haemopedia results, return dataframe with Gene symbol + Lineage as columns

    species = 'human' or 'murine' ('mouse' and 'mice' are accepted as aliases
    of 'murine'; matching ignores case and surrounding whitespace)

    Return: dataframe, columns = ['Gene Symbol', 'Lineage']
    possible values in Lineage: 'Multi Potential Progenitor', 'Erythrocyte Lineage',
       'Megakaryocyte Lineage', 'Basophil Lineage', 'Eosinophil Lineage',
       'Neutrophil Lineage', 'Macrophage Lineage',
       'Dendritic Cell Lineage', 'B Cell Lineage', 'T Cell Lineage',
       'NK Cell Lineage'

    Raises: ValueError if species is not one of the supported names
    '''
    # workbook path and name of its gene symbol column, per species
    sources = {
        'human': ('~/haemopedia/lineage_specific_genes_in_human.xlsx', 'Human Gene Symbol'),
        'murine': ('~/haemopedia/lineage_specific_gene.xlsx', 'GeneSymbol'),
    }
    aliases = {'mouse': 'murine', 'mice': 'murine'}

    key = str(species).strip().lower()
    key = aliases.get(key, key)

    # fail loudly instead of silently returning the murine table for a typo
    if key not in sources:
        raise ValueError(
            "species must be 'human' or 'murine', got {!r}".format(species)
        )

    path, symbol_column = sources[key]
    df = pd.read_excel(path)

    # keep the two columns of interest and give the symbol column a common name
    df = df[[symbol_column, 'Lineage']]
    return df.rename(columns = {symbol_column: 'Gene Symbol'})
    

In [60]:
# species defaults to 'human', so this loads the human table
dk = read_haemapedia()

In [64]:
# preview the first rows of the human table
dk.head()

Unnamed: 0,Gene Symbol,Lineage
0,EGFL7,Multi Potential Progenitor
1,DNMT3B,Multi Potential Progenitor
2,RPL22,Multi Potential Progenitor
3,SOX4,Multi Potential Progenitor
4,DCTD,Multi Potential Progenitor


In [62]:
# load the murine table, using the species name documented by read_haemapedia
d = read_haemapedia(species = 'murine')

In [65]:
# preview the first rows of the non-human (murine) table
d.head()

Unnamed: 0,Gene Symbol,Lineage
0,Il11ra1,Multi Potential Progenitor
1,Adgrl4,Multi Potential Progenitor
2,Egfl7,Multi Potential Progenitor
3,Dnmt3b,Multi Potential Progenitor
4,Rpl22,Multi Potential Progenitor


In [74]:
def human_mouse_homolog(filename = '~/HMD_HumanPhenotype.rpt'):
    '''
    Read a tab-separated report without a header row and keep two of its columns.

    input: filename, path of the report
    output: dataframe, columns = ['Human', 'Mouse'], built from the columns at
            positions 0 and 4 of the report (presumably the human gene symbol
            and its mouse homolog; verify against the report layout)
    '''
    report = pd.read_csv(filename, sep = '\t', header = None)

    # with header = None the columns are labelled by position, so 0 and 4 select by position
    pairs = report.loc[:, [0, 4]]
    return pairs.set_axis(['Human', 'Mouse'], axis = 'columns')

In [75]:
# display the Human/Mouse table read from the default report path
human_mouse_homolog()

Unnamed: 0,Human,Mouse
0,A1BG,A1bg
1,A1CF,A1cf
2,A2M,A2m
3,A3GALT2,A3galt2
4,A4GALT,A4galt
...,...,...
18749,ZYG11A,Zyg11a
18750,ZYG11B,Zyg11b
18751,ZYX,Zyx
18752,ZZEF1,Zzef1
