# MUTATION CONSEQUENCES COUNTS: ALL
           sample1 sample2 sample3 ... sample19
HOM_00_SYN
HET_01_SYN
HOM_11_SYN
HOM_00_NONSYN
HET_01_NONSYN
HOM_11_NONSYN
HOM_00_LOF
HET_01_LOF
HOM_11_LOF

# Import Packages

In [1]:
import pandas as pd
import requests
import numpy as np
import sys

# Import Data

In [2]:
#Import SIFT Predictions
# Read the site-consequence table from disk into a DataFrame.
site_cons_csv = '/Users/abc6435/Desktop/WROH/data/SiteCons_ALL.csv'
Seto_SiteCons = pd.read_csv(filepath_or_buffer=site_cons_csv)

In [3]:
#Remove Unwanted column
# 'Unnamed: 0' is presumably a row index written into the CSV by an earlier export -- TODO confirm.
# pop() mutates Seto_SiteCons in place, so it is guarded: re-running this cell no longer
# raises KeyError once the column is already gone (the cell then displays nothing).
dropped_index = Seto_SiteCons.pop('Unnamed: 0') if 'Unnamed: 0' in Seto_SiteCons.columns else None
dropped_index

0              186
1              187
2              191
3              192
4              193
            ...   
549792    50699012
549793    50699013
549794    50699014
549795    50699015
549796    50699016
Name: Unnamed: 0, Length: 549797, dtype: int64

In [4]:
#Unique Mutation Consequence
# Collect the distinct consequence labels present in the table, then display them.
consequence_labels = Seto_SiteCons['MUTATION_CONSEQUENCE'].unique()
consequence_labels

array(['NONSYNONYMOUS', 'SYNONYMOUS', 'STOP-GAIN', 'STOP-LOSS',
       'START-LOST'], dtype=object)

# Subset Mutation Consequence Type

In [5]:
#Split the site table into one dataframe per mutation consequence type
# The consequence column is looked up once and reused to build each boolean row mask.
consequence = Seto_SiteCons['MUTATION_CONSEQUENCE']
NONSYN = Seto_SiteCons[consequence.eq('NONSYNONYMOUS')]
SYN = Seto_SiteCons[consequence.eq('SYNONYMOUS')]
STOPGAIN = Seto_SiteCons[consequence.eq('STOP-GAIN')]
STOPLOSS = Seto_SiteCons[consequence.eq('STOP-LOSS')]
STARTLOST = Seto_SiteCons[consequence.eq('START-LOST')]

In [6]:
# Display the START-LOST subset as a quick visual check of the filter above
STARTLOST

Unnamed: 0,CHROM,POS,REF,ALT,262,2871,TE22T01,TE30T02,SE25T02,183195332,...,183195312,163,1049,1940,4056,284029323,TF19T04,TF03T03,SiteIDS,MUTATION_CONSEQUENCE
1868,chr1,2909587,C,T,0/0,./.,0/0,0/1,0/0,0/0,...,0/0,./.,0/0,0/0,0/0,0/0,./.,0/0,chr1-2909587-C-T,START-LOST
2438,chr1,4229352,A,G,0/0,0/0,0/1,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,chr1-4229352-A-G,START-LOST
3684,chr1,5234804,T,C,0/0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,chr1-5234804-T-C,START-LOST
3752,chr1,5413709,A,G,0/0,0/0,0/0,0/0,0/1,0/0,...,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,chr1-5413709-A-G,START-LOST
3753,chr1,5413710,T,C,0/0,0/0,0/0,0/0,0/0,0/1,...,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,chr1-5413710-T-C,START-LOST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547473,chr28,3727855,G,T,0/0,0/0,0/0,0/0,0/0,0/0,...,0/0,./.,0/0,0/1,0/0,0/0,0/0,0/0,chr28-3727855-G-T,START-LOST
547675,chr28,3878736,T,C,0/0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,chr28-3878736-T-C,START-LOST
548435,chr28,4848752,T,C,1/1,1/1,1/1,1/1,1/1,1/1,...,1/1,1/1,1/1,./.,1/1,1/1,0/1,1/1,chr28-4848752-T-C,START-LOST
548436,chr28,4848753,G,A,1/1,1/1,1/1,1/1,1/1,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,chr28-4848753-G-A,START-LOST


## Create an Empty Dataframe For Counts

In [7]:
#Make a 16x19 zero-filled dataframe to keep track of Genotype Counts
# Rows: 5 consequence types x 3 genotypes, plus one shared MISSING row.
# Columns: the 19 sample columns that follow CHROM, POS, REF, ALT in Seto_SiteCons.
# The frame is created already filled with integer zeros rather than zeroed afterwards through
# `.values[:] = 0`, which needs a writable view and fails when pandas returns read-only arrays
# (Copy-on-Write).
cons_counts = pd.DataFrame(0, index=range(16), columns=list(Seto_SiteCons.columns[4:23]))

#Name each row GENOTYPE_CONSEQUENCE: rows 0-2 NONSYN, 3-5 SYN, 6-8 STOPGAIN,
#9-11 STOPLOSS, 12-14 STARTLOST, and row 15 collects missing (./.) calls of every type
row_types = [
    f'{genotype}_{consequence}'
    for consequence in ('NONSYN', 'SYN', 'STOPGAIN', 'STOPLOSS', 'STARTLOST')
    for genotype in ('HOM_00', 'HET_01', 'HOM_11')
] + ['MISSING']
cons_counts['TYPE'] = row_types

#Put TYPE first and fix the sample column order used in the saved table
# (selecting by name raises KeyError if a listed sample is absent from the data)
new_cols = ['TYPE','262','2871','TE22T01','TE30T02','SE25T02','183195332','183194861',
            '183195321','183195304','183194841','183195326','183195312','163','1049','1940','4056','284029323',
            'TF19T04','TF03T03']
cons_counts = cons_counts[new_cols]

## Loop Counts For All Consequence Types

In [8]:
#Tally all 0/0, 0/1 | 1/0, 1/1 and ./. genotypes per sample for every consequence type
def tally_genotypes(sites):
    """Count genotype calls per sample for one consequence subset.

    Parameters
    ----------
    sites : pandas.DataFrame
        A consequence subset (e.g. NONSYN). Genotype strings are read from
        columns 4:23, i.e. the sample columns.

    Returns
    -------
    dict of pandas.Series
        Keys 'hom_ref' ('0/0'), 'het' ('0/1' or '1/0'), 'hom_alt' ('1/1') and
        'missing' ('./.'); each Series is indexed by sample column name and
        holds the number of rows with that call. Any other value is not counted.
    """
    calls = sites.iloc[:, 4:23]
    return {
        'hom_ref': calls.eq('0/0').sum(),
        # Both allele orders are heterozygous; previously only '0/1' was tallied
        # although the cell was documented as counting "0/1 | 1/0".
        'het': calls.isin(['0/1', '1/0']).sum(),
        'hom_alt': calls.eq('1/1').sum(),
        'missing': calls.eq('./.').sum(),
    }


#First cons_counts row (HOM_00) of each consequence type; HET_01 and HOM_11 follow directly
subset_first_row = [(NONSYN, 0), (SYN, 3), (STOPGAIN, 6), (STOPLOSS, 9), (STARTLOST, 12)]
#Row shared by all consequence types for missing (./.) calls
missing_row = 15

# Counts are assigned (not incremented), so re-running this cell does not double them.
missing_total = 0
for sites, first_row in subset_first_row:
    tallies = tally_genotypes(sites)
    samples = list(tallies['hom_ref'].index)
    cons_counts.loc[first_row, samples] = tallies['hom_ref'].to_numpy()
    cons_counts.loc[first_row + 1, samples] = tallies['het'].to_numpy()
    cons_counts.loc[first_row + 2, samples] = tallies['hom_alt'].to_numpy()
    # Missing calls are pooled across all consequence types
    missing_total = missing_total + tallies['missing']

cons_counts.loc[missing_row, list(missing_total.index)] = missing_total.to_numpy()

cons_counts

Unnamed: 0,TYPE,262,2871,TE22T01,TE30T02,SE25T02,183195332,183194861,183195321,183195304,183194841,183195326,183195312,163,1049,1940,4056,284029323,TF19T04,TF03T03
0,HOM_00_NONSYN,167063,156254,167851,167011,167408,163213,165799,164777,160733,162019,164088,167708,161003,164743,165467,163661,161599,166559,167247
1,HET_01_NONSYN,19065,17852,19302,19249,19304,7861,11661,11778,10902,11473,11691,11849,20969,21338,21111,21268,20846,21201,21613
2,HOM_11_NONSYN,16179,15366,16270,16154,15944,25974,24450,24439,24210,24220,24493,24743,14746,14815,14906,14705,14557,14928,14947
3,HOM_00_SYN,265809,249612,267938,266561,266753,261193,264771,263710,257259,259174,262079,268160,256618,261081,262563,260280,256565,264330,265635
4,HET_01_SYN,35433,33582,35402,35180,35576,13147,19105,18751,17840,18466,19040,19029,38933,39938,39763,39513,38623,39485,40262
5,HOM_11_SYN,28233,26829,27988,28044,28044,46499,44490,44475,43947,43788,44317,44834,25634,26043,26056,25689,25610,26147,25945
6,HOM_00_STOPGAIN,3266,3005,3293,3319,3293,3182,3272,3230,3173,3197,3205,3299,3168,3198,3259,3191,3164,3252,3287
7,HET_01_STOPGAIN,403,369,411,377,390,154,215,227,198,210,242,228,411,436,436,460,436,462,434
8,HOM_11_STOPGAIN,287,294,286,280,289,501,485,491,470,480,467,491,277,282,262,269,276,260,267
9,HOM_00_STOPLOSS,212,199,219,209,217,219,220,218,211,220,214,220,193,200,202,203,200,199,207


# Save

In [9]:
#Save as a CSV
# Written with the default index=True, so the row index becomes the first, unnamed column.
counts_csv = '/Users/abc6435/Desktop/WROH/data/consequence_counts.csv'
cons_counts.to_csv(path_or_buf=counts_csv)