In [1]:
!pip install cptac
!pip install xlsxwriter



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import cptac
import xlsxwriter

In [65]:
def get_sig_genes(data):
  q25, q75 = np.quantile(data['high'], [0.25, 0.75])
  average_up = q75 + ((q75 - q25) * 1.5 )
  q25, q75 = np.quantile(data['low'], [0.25, 0.75])
  average_down = q75 + ((q75 - q25) * 1.5 )
  data_up = data['high'] > average_up
  data_down = data['low'] > average_down
  data_sig = data_up | data_down
  sig_genes = data[data_sig]
  return data[data_up], data[data_down], average_up, average_down

## Download all Datasets

In [4]:
cptac.download(dataset="endometrial")
en = cptac.Endometrial()
en_cnv = en.get_CNV()

                                    

In [5]:
cptac.download(dataset="gbm")
gbm = cptac.Gbm()
gbm_cnv = gbm.get_CNV()

                                    



In [6]:
cptac.download(dataset="brca")
brca = cptac.Brca()
brca_cnv = brca.get_CNV()

Downloading data files (1/8)...     



                                    

In [7]:
cptac.download(dataset="hnscc")
hnscc = cptac.Hnscc()
hnscc_cnv = hnscc.get_CNV()

Password: ········                  
                                    

In [8]:
cptac.download(dataset="luad")
luad = cptac.Luad()
luad_cnv = luad.get_CNV()

                                    



In [9]:
cptac.download(dataset="ovarian")
ovarian = cptac.Ovarian()
ovarian_cnv = ovarian.get_CNV()

Downloading data files (1/9)...     



                                    

In [10]:
cptac.download(dataset="ccrcc")
ccrcc = cptac.Ccrcc()
ccrcc_cnv = ccrcc.get_CNV()

Downloading data files (1/12)...    



                                    

# Read in Uniprot Data

In [59]:
uniprot_chromosomes = pd.read_csv("uniprot_chromosomes.csv")
uniprot_chromosomes = uniprot_chromosomes.drop(['Unnamed: 0'], axis=1)
uniprot_chromosomes['chromosome'] = uniprot_chromosomes['chromosome'].apply(lambda x: x.replace("['", '').replace("']", ''))
uniprot_chromosomes = uniprot_chromosomes.set_index('gene')

## Get Counts and Average

In [13]:
def separate(hiCut, lowCut, data):
  sepDict = dict()
  for column in data:
    numHi = 0
    numLow = 0
    numNeu = 0
    for i in data[column]:
      if i > hiCut:
        numHi += 1
      elif i < lowCut:
        numLow += 1
      else:
        numNeu += 1 
    sepDict[column] = [numHi, numNeu, numLow]
  return sepDict

In [14]:
def get_counts_and_average(data):
  df = pd.DataFrame(data = separate(.3,-.2,data), index = ["high", "neutral", "low"])
  df = df.append(pd.DataFrame(data.mean(0), columns=['average']).transpose(), sort=True)
  return df

In [15]:
en_counts = get_counts_and_average(en_cnv)
gbm_counts = get_counts_and_average(gbm_cnv)
brca_counts = get_counts_and_average(brca_cnv)
hnscc_counts = get_counts_and_average(hnscc_cnv)
luad_counts = get_counts_and_average(luad_cnv)
ovarian_counts = get_counts_and_average(ovarian_cnv)
ccrcc_counts = get_counts_and_average(ccrcc_cnv)

In [80]:
brca_counts.transpose().to_csv("brca_counts.csv")

## Get Significant List for each cancer type

In [66]:
en_sig_genes_up, en_sig_genes_down, en_up_cutoff, en_down_cutoff = get_sig_genes(en_counts.transpose())

8.5
17.0


In [67]:
gbm_sig_genes_up, gbm_sig_genes_down, gbm_up_cutoff, gbm_down_cutoff = get_sig_genes(gbm_counts.transpose())

10.0
19.5


In [68]:
brca_sig_genes_up, brca_sig_genes_down, brca_up_cutoff, brca_down_cutoff = get_sig_genes(brca_counts.transpose())

41.0
54.5


In [69]:
hnscc_sig_genes_up, hnscc_sig_genes_down, hnscc_up_cutoff, hnscc_down_cutoff = get_sig_genes(hnscc_counts.transpose())

19.5
39.0


In [70]:
luad_sig_genes_up, luad_sig_genes_down, luad_up_cutoff, luad_down_cutoff = get_sig_genes(luad_counts.transpose())

14.5
44.0


In [71]:
ovarian_sig_genes_up, ovarian_sig_genes_down, ovarian_up_cutoff, ovarian_down_cutoff = get_sig_genes(ovarian_counts.transpose())

42.5
75.0


In [72]:
ccrcc_sig_genes_up, ccrcc_sig_genes_down, ccrcc_up_cutoff, ccrcc_down_cutoff = get_sig_genes(ccrcc_counts.transpose())

10.0
22.0


In [76]:
cutoffs = pd.DataFrame({'Above': [en_up_cutoff, gbm_up_cutoff, brca_up_cutoff, hnscc_up_cutoff, luad_up_cutoff, ovarian_up_cutoff, ccrcc_up_cutoff],
            'Below': [en_down_cutoff, gbm_down_cutoff, brca_down_cutoff, hnscc_down_cutoff, luad_down_cutoff, ovarian_down_cutoff, ccrcc_down_cutoff]},
                      index = ['ENDO', 'GBM', 'BRCA', 'HNSCC', 'LUAD', 'OVARIAN', 'CCRCC'])
cutoffs.to_csv("cutoffs_table.csv")

         Above  Below
ENDO       8.5   17.0
GBM       10.0   19.5
BRCA      41.0   54.5
HNSCC     19.5   39.0
LUAD      14.5   44.0
OVARIAN   42.5   75.0
CCRCC     10.0   22.0


## Get Significant Insertions across Cancer Types

In [23]:
def CountFrequency(my_list): 
      
    # Creating an empty dictionary  
    freq = {} 
    for items in my_list: 
        freq[items] = my_list.count(items) 

    sig = []
      
    for key, value in freq.items(): 
      if value > 2:
        sig.append(key)
    return sig

In [24]:
sig_genes_up = CountFrequency(list(gbm_sig_genes_up.index) + list(hnscc_sig_genes_up.index) + list(luad_sig_genes_up.index) + list(ovarian_sig_genes_up.index) + list(ccrcc_sig_genes_up.index) + list(brca_sig_genes_up.index) + list(en_sig_genes_up.index))

ABHD16B: 3
ADRM1: 3
APCDD1L: 3
ARFGAP1: 3
ARFRP1: 4
BHLHE23: 3
BIRC7: 3
CABLES2: 4
CCT6A: 3
CDH26: 3
CHCHD2: 3
CHRNA4: 3
COL20A1: 3
COL9A3: 3
CTSZ: 3
DIDO1: 3
DNAJC5: 3
EDN3: 3
EEF1A2: 3
EGFR: 3
FAM217B: 3
FNDC11: 3
GATA5: 3
GID8: 3
GMEB2: 4
GNAS: 3
HELZ2: 4
HRH3: 3
LAMA5: 3
LANCL2: 3
LIME1: 4
LKAAEAR1: 3
LSM14B: 3
MDM4: 3
MRGBP: 3
MRPS17: 3
MTG2: 3
NELFCD: 3
NKAIN4: 3
NPEPL1: 3
NUPR2: 3
OGFR: 3
OSBPL2: 3
PHKG1: 3
PIK3C2B: 3
PLEKHA6: 3
PPDPF: 3
PPP1R15B: 3
PPP1R3D: 3
PRPF6: 3
PSMA7: 3
PSPH: 3
PTK6: 3
RBBP8NL: 3
RGS19: 3
RPS21: 4
RTEL1: 4
SAMD10: 4
SEC61G: 3
SEPT14: 3
SLC2A4RG: 4
SOX18: 3
SRMS: 4
SS18L1: 3
STMN3: 4
STX16: 3
SUMF2: 3
SYCP2: 3
TCEA2: 3
TCFL5: 3
TNFRSF6B: 4
TPD52L2: 3
TUBB1: 3
UCKL1: 3
VOPP1: 3
VSTM2A: 3
YTHDF1: 3
ZBTB46: 4
ZGPAT: 4
ZNF217: 3
ZNF512B: 4
ZNF713: 3
ZNF831: 3
AARD: 4
ABRA: 3
ADCY8: 4
AGO2: 3
ANGPT1: 3
ANKRD46: 4
ANXA13: 4
ARC: 3
ASAP1: 4
ATAD2: 4
ATP6V1C1: 4
AZIN1: 4
BAALC: 4
BOP1: 3
C8orf33: 4
C8orf37: 4
C8orf76: 4
C8orf82: 3
C8orf89: 3
CCDC166: 3
CCNE2: 3
C

In [25]:
sig_genes_down = CountFrequency(list(gbm_sig_genes_down.index) + list(hnscc_sig_genes_down.index) + list(luad_sig_genes_down.index) + list(ovarian_sig_genes_down.index) + list(ccrcc_sig_genes_down.index) + list(brca_sig_genes_down.index) + list(en_sig_genes_down.index))

BPY2: 4
BPY2B: 4
BPY2C: 4
CDKN2A: 3
CDKN2B: 3
CDY1: 4
CDY1B: 4
DAZ1: 4
DAZ2: 4
DAZ3: 4
DAZ4: 4
DDX3Y: 4
EIF1AY: 4
TBL1Y: 4
TGIF2LY: 4
TSPY1: 4
TSPY10: 4
TSPY3: 4
TSPY4: 4
TSPY8: 4
USP9Y: 4
AMELY: 3
CDY2A: 3
CDY2B: 3
HSFY1: 3
HSFY2: 3
KDM5D: 3
NLGN4Y: 3
PCDH11Y: 3
PRORY: 3
PRY: 3
PRY2: 3
RBMY1A1: 3
RBMY1B: 3
RBMY1D: 3
RBMY1E: 3
RBMY1F: 3
RBMY1J: 3
RPS4Y1: 3
RPS4Y2: 3
SRY: 3
TMSB4Y: 3
TSPY2: 3
UTY: 3
VCY: 3
VCY1B: 3
ZFY: 3


Add Type row to Counts

In [26]:
def get_count_type(row, up, down):
  gene = row.name
  if gene in up.index:
    return "high" # High
  elif gene in down.index:
    return "low" #Low
  else:
    return "normal" #Normal

In [27]:
en_counts_t = en_counts.transpose()
en_counts_t['ENDO'] = en_counts_t.apply(lambda row: get_count_type(row, en_sig_genes_up, en_sig_genes_down), axis=1)

In [28]:
gbm_counts_t = gbm_counts.transpose()
gbm_counts_t['GBM'] = gbm_counts_t.apply(lambda row: get_count_type(row, gbm_sig_genes_up, gbm_sig_genes_down), axis=1)

In [29]:
brca_counts_t = brca_counts.transpose()
brca_counts_t['BRCA'] = brca_counts_t.apply(lambda row: get_count_type(row, brca_sig_genes_up, brca_sig_genes_down), axis=1)


In [30]:
filter1 = brca_counts_t.transpose().columns.get_level_values(level=0).isin(brca_sig_genes_up.index.get_level_values(level=0))
brca_filtered_counts = brca_counts_t.transpose().loc[:, filter1]
filter2 = brca_counts_t.transpose().columns.get_level_values(level=0).isin(brca_sig_genes_down.index.get_level_values(level=0))
brca_filtered_counts_down = brca_counts_t.transpose().loc[:, filter2]
brca_filtered_counts = brca_filtered_counts.transpose().append(brca_filtered_counts_down.transpose())

In [32]:
brca_filtered_counts_index = brca_filtered_counts.index.droplevel(level=1)
brca_filtered_counts.index = brca_filtered_counts_index
brca_filtered_counts_t = brca_filtered_counts.loc[~brca_filtered_counts.index.duplicated(keep='first')]

In [34]:
hnscc_counts_t = hnscc_counts.transpose()
hnscc_counts_t['HNSCC'] = hnscc_counts_t.apply(lambda row: get_count_type(row, hnscc_sig_genes_up, hnscc_sig_genes_down), axis=1)

In [35]:
luad_counts_t = luad_counts.transpose()
luad_counts_t['LUAD'] = luad_counts_t.apply(lambda row: get_count_type(row, luad_sig_genes_up, luad_sig_genes_down), axis=1)

In [36]:
ovarian_counts_t = ovarian_counts.transpose()
ovarian_counts_t['OVARIAN'] = ovarian_counts_t.apply(lambda row: get_count_type(row, ovarian_sig_genes_up, ovarian_sig_genes_down), axis=1)

In [37]:
ccrcc_counts_t = ccrcc_counts.transpose()
ccrcc_counts_t['CCRCC'] = ccrcc_counts_t.apply(lambda row: get_count_type(row, ccrcc_sig_genes_up, ccrcc_sig_genes_down), axis=1)

In [39]:
filter1 = ccrcc_counts_t.transpose().columns.get_level_values(level=0).isin(ccrcc_sig_genes_up.index.get_level_values(level=0))
ccrcc_filtered_counts = ccrcc_counts_t.transpose().loc[:, filter1]
filter2 = ccrcc_counts_t.transpose().columns.get_level_values(level=0).isin(ccrcc_sig_genes_down.index.get_level_values(level=0))

ccrcc_filtered_counts_down = ccrcc_counts_t.transpose().loc[:, filter2]
ccrcc_filtered_counts = ccrcc_filtered_counts.transpose().append(ccrcc_filtered_counts_down.transpose()).transpose()

                      AASS             ABCA13              ABCB1  \
        ENSG00000008311.14 ENSG00000179869.14 ENSG00000085563.14   
high                    15                 14                 16   
neutral                 93                 94                 92   
low                      2                  2                  2   
average          0.0973339           0.102518            0.10849   
CCRCC                 high               high               high   

                     ABCB4              ABCB5              ABCB8  \
        ENSG00000005471.15 ENSG00000004846.16 ENSG00000197150.12   
high                    16                 14                 15   
neutral                 92                 94                 92   
low                      2                  2                  3   
average            0.10849           0.102253          0.0932567   
CCRCC                 high               high               high   

                    ABCF2             ABHD11  

In [40]:
ccrcc_filtered_counts_index = ccrcc_filtered_counts.columns.droplevel(level=1)
ccrcc_filtered_counts.columns = ccrcc_filtered_counts_index
ccrcc_filtered_counts_t = ccrcc_filtered_counts.transpose()
ccrcc_filtered_counts_t = ccrcc_filtered_counts_t.loc[~ccrcc_filtered_counts_t.index.duplicated(keep='first')]
print(ccrcc_filtered_counts_t)

       high neutral low    average CCRCC
AASS     15      93   2  0.0973339  high
ABCA13   14      94   2   0.102518  high
ABCB1    16      92   2    0.10849  high
ABCB4    16      92   2    0.10849  high
ABCB5    14      94   2   0.102253  high
...     ...     ...  ..        ...   ...
ZNF80     0      85  25 -0.0619169   low
ZNF839    0      72  38  -0.133027   low
ZNF852    0      22  88   -0.32306   low
ZNF860    0      22  88  -0.322831   low
ZPLD1     0      82  28 -0.0823269   low

[3054 rows x 5 columns]


Set up Dataset

In [42]:
data = pd.DataFrame(data=luad_counts_t['LUAD'])
data['BRCA'] = brca_filtered_counts_t['BRCA']
data['HNSCC'] = hnscc_counts_t['HNSCC']
data['OVARIAN'] = ovarian_counts_t['OVARIAN']
data['ENDO'] = en_counts_t['ENDO']
data['GBM'] = gbm_counts_t['GBM']
data['CCRCC'] = ccrcc_filtered_counts_t['CCRCC']
data.fillna("normal", inplace=True)

In [44]:
data.transpose().to_csv('all_data.csv')

In [61]:
filter1 = data.transpose().columns.get_level_values("Name").isin(sig_genes_up)
filter2 = data.transpose().columns.get_level_values("Name").isin(sig_genes_down)
data_up = data.transpose().loc[:, filter1]
data_down = data.transpose().loc[:, filter2]
data_down_and_up = data_up.transpose().append(data_down.transpose())
data_down_and_up = data_down_and_up.join(uniprot_chromosomes, how='inner')
print(data_down_and_up)

         LUAD    BRCA   HNSCC OVARIAN    ENDO     GBM   CCRCC     chromosome
AARD     high    high    high    high    high  normal  normal   Chromosome 8
ABHD16B  high  normal  normal  normal    high    high  normal  Chromosome 20
ABRA     high    high    high    high  normal  normal  normal   Chromosome 4
ADCY8    high    high    high    high    high  normal  normal   Chromosome 8
ADRM1    high  normal  normal  normal    high    high  normal  Chromosome 20
...       ...     ...     ...     ...     ...     ...     ...            ...
USP9Y     low  normal     low  normal    high    high     low   Chromosome Y
UTY       low  normal     low  normal    high    high     low   Chromosome Y
VCY       low  normal     low  normal    high    high     low  Chromosome 19
VCY1B     low  normal     low  normal    high    high     low   Chromosome Y
ZFY       low  normal     low  normal    high  normal     low   Chromosome Y

[396 rows x 8 columns]


In [81]:
data_down_and_up.to_csv('up_or_down.csv')