In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

In [None]:
pwd

# Global

In [None]:
# NOTE(review): this path starts with '../' while every other read in this
# notebook uses 'raw-data/' relative to the working directory — confirm which
# location is intended.
raw = pd.read_csv('../raw-data/hetmat_human200_179sp.txt', sep = '\t')

# One label per row of `raw`: its first three fields joined by single spaces.
# read() later installs this list as the column names of each transposed matrix.
# Each row is fetched once and sliced by position with .iloc, because plain
# integer keys on a label-indexed Series (raw.loc[i][0]) are deprecated in
# pandas and are slated to be treated as labels instead of positions.
cols = [
    ' '.join(str(field) for field in raw.loc[i].iloc[:3])
    for i in range(len(raw))
]

In [None]:
# Load the Zoonomia dataset CSV; its first column becomes the row index.
iucn = pd.read_csv("raw-data/zoonomia_dataset_17nov2021_streamlined.csv", index_col = 0)

In [None]:
# Display the table as loaded, with all of its columns.
iucn

In [None]:
# Keep only the 'IUCN' column; the double brackets keep the result a
# one-column DataFrame. addiucn() reads this notebook-level name.
iucn = iucn[['IUCN']]

In [None]:
# Display the table again now that it holds only the 'IUCN' column.
iucn

In [None]:
# Tab-separated table indexed by its first column, reduced straight away to
# the single 'coding_vars' column that normalize() divides by.
o = pd.read_csv(
    'raw-data/Coding_Vars_and_phyloP_240sp.txt',
    sep='\t',
    index_col=0,
)[['coding_vars']]

In [None]:
# Display the one-column 'coding_vars' table.
o

# Functions

In [None]:
def read(dataset, name, columns=None, max_nan=30000):
    '''
    Transpose a raw matrix and tidy its labels.

    Steps:
    (1) transpose, so the original columns become rows
    (2) drop the 'chr', 'start' and 'stop' rows when present, then install
        `columns` as the column names
    (3) remove every row that holds more than `max_nan` NaN values
    (4) trim each row name to the text before its first '.'

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table as loaded from disk. It is not modified.
    name : str
        Label for the dataset. Currently unused; kept so existing calls
        continue to work.
    columns : list of str, optional
        Column names for the transposed table. Defaults to the
        notebook-level `cols` list.
    max_nan : int, optional
        Largest NaN count a row may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        The transposed, filtered and relabelled table.

    Raises
    ------
    ValueError
        If `columns` does not have one entry per column of the transposed
        table.
    '''
    if columns is None:
        columns = cols

    # errors='ignore' skips only labels that are absent; any other failure
    # now surfaces instead of being hidden by a bare `except: pass`.
    dataset = dataset.T.drop(['chr', 'start', 'stop'], axis=0, errors='ignore')
    dataset.columns = columns

    # A positional boolean mask filters exactly the offending rows, even if
    # two rows happen to share a label.
    nan_per_row = dataset.isnull().sum(axis=1)
    dataset = dataset.loc[(nan_per_row <= max_nan).to_numpy()].copy()

    dataset.index = [label.split('.')[0] for label in dataset.index]

    return dataset

In [None]:
def normalize(dataset, totals=None, exclude=('Capra_hircus',)):
    '''
    Normalize a dataset with respect to the total number of coding variants.

    Every row is divided by the integer value found for that row's label in
    the first column of `totals`. The result is returned as a new table; the
    input is left untouched. (The division is done explicitly on the returned
    object rather than on a transposed temporary, so the outcome no longer
    depends on whether a transpose shares memory with its source.)

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with one row per label to normalize.
    totals : pandas.DataFrame, optional
        Table indexed by the same labels whose first column holds the
        divisor. Defaults to the notebook-level `o` table.
    exclude : iterable of str, optional
        Row labels removed before normalizing. The default drops
        'Capra_hircus' — presumably because it has no usable divisor;
        TODO confirm.

    Returns
    -------
    pandas.DataFrame
        `dataset` without the excluded rows, each row divided by its divisor.

    Raises
    ------
    KeyError
        If an excluded label is not in `dataset`, or a remaining label is
        not in `totals`.
    '''
    if totals is None:
        totals = o

    kept = dataset.drop(list(exclude), axis=0)

    # astype(int) truncates like the int() conversion it replaces; passing a
    # plain array divides row-by-row in order, with no index alignment.
    divisors = totals.loc[kept.index].iloc[:, 0].astype(int).to_numpy()

    return kept.div(divisors, axis=0)

In [None]:
def merge(datasets, common):
    '''
    Combine several datasets into one wide dataset.

    Each dataset is first restricted to the rows labelled in `common`
    (in that order); the pieces are then joined side by side.
    '''
    aligned = []
    for frame in datasets:
        aligned.append(frame.loc[common])
    return pd.concat(aligned, axis=1)

In [None]:
def addiucn(dataset):
    '''
    Attach IUCN labels to a dataset.

    Every row label of `dataset` is looked up in the notebook-level `iucn`
    table and the result is stored in a column named 'iucn'. The column is
    written into `dataset` itself, and that same object is returned.
    '''
    dataset['iucn'] = iucn.loc[dataset.index]
    return dataset

In [None]:
def split(df, name):
    '''
    Write the non-DD and the DD species of a dataset to two separate files.

    Rows whose 'iucn' value is not 'DD' go to data/<name>.csv; rows whose
    value is 'DD' go to dd-data/<name>-dd.csv. Nothing is returned.
    '''
    labels = df['iucn']
    outputs = (
        (labels != 'DD', 'data/' + name + '.csv'),
        (labels == 'DD', 'dd-data/' + name + '-dd' + '.csv'),
    )
    for keep, path in outputs:
        df[keep].to_csv(path)

# Analysis

In [None]:
# Load the five raw tab-separated matrices in one pass; the names on the
# left line up one-to-one with the paths below.
het, roh, snpphylop, miscons, miscount = [
    pd.read_csv(path, sep='\t')
    for path in (
        'raw-data/hetmat_human200_207sp.txt',
        'raw-data/rohmat_human200_207sp.txt',
        'raw-data/snpphylopmat_human50KB_240sp.txt',
        'raw-data/missense_conserved_human50kb_240sp.txt',
        'raw-data/missense_counts_human50kb_240sp.txt',
    )
]

In [None]:
# Run every raw matrix through read() and rebind each name to the cleaned
# result; the pairs are processed in the order listed.
het, roh, snpphylop, miscons, miscount = [
    read(frame, tag)
    for frame, tag in (
        (het, 'het'),
        (roh, 'roh'),
        (snpphylop, 'snpphylop'),
        (miscons, 'miscons'),
        (miscount, 'miscount'),
    )
]

In [None]:
# Row labels that survived read() for snpphylop. The cells below use this
# list to select the same rows, in the same order, from every matrix.
common171 = snpphylop.index.tolist()

In [None]:
# Show how many row labels were collected.
len(common171)

In [None]:
# Restrict every matrix to the rows named in common171, in that order.
# .loc raises KeyError if a matrix lacks one of those labels.
het, roh, snpphylop, miscons, miscount = [
    frame.loc[common171]
    for frame in (het, roh, snpphylop, miscons, miscount)
]

In [None]:
# Only the two missense matrices are normalized. normalize() also drops the
# 'Capra_hircus' row, so both end up one row shorter than the other matrices.
miscons = normalize(miscons)
miscount = normalize(miscount)

In [None]:
# Row labels left in miscons after normalize() removed 'Capra_hircus'.
common170 = miscons.index.tolist()

In [None]:
# Show how many row labels remain after normalization.
len(common170)

In [None]:
# `three` joins the un-normalized matrices on the common171 rows; `five` adds
# the two normalized matrices and is therefore limited to the common170 rows.
three = merge([het, roh, snpphylop], common171)
five = merge([het, roh, snpphylop, miscons, miscount], common170)

In [None]:
# Display the five-matrix table.
five

In [None]:
# Suffix every column of `three` with the matrix it came from. merge() lays
# its inputs side by side in the order het, roh, snpphylop, so the labels are
# rebuilt from those frames instead of from a hard-coded block width (57509).
# The notebook-level `cols` list is deliberately left alone here, because
# read() still depends on it.
three_sources = ((het, '-het'), (roh, '-roh'), (snpphylop, '-snpphylop'))

# Fail loudly if `three` is not exactly the side-by-side join of its sources
# (for example when this cell is run a second time and the suffixes are
# already present) rather than mislabelling or double-suffixing columns.
assert three.columns.tolist() == [
    label for frame, _ in three_sources for label in frame.columns
], 'three.columns does not match het + roh + snpphylop; rebuild `three` first'

three.columns = [
    label + suffix
    for frame, suffix in three_sources
    for label in frame.columns
]

In [None]:
# Suffix every column of `five` with the matrix it came from. merge() lays
# its inputs side by side in the order het, roh, snpphylop, miscons, miscount,
# so the labels are rebuilt from those frames instead of from a hard-coded
# block width (57509). The notebook-level `cols` list is deliberately left
# alone here, because read() still depends on it.
five_sources = (
    (het, '-het'),
    (roh, '-roh'),
    (snpphylop, '-snpphylop'),
    (miscons, '-miscons'),
    (miscount, '-miscount'),
)

# Fail loudly if `five` is not exactly the side-by-side join of its sources
# (for example when this cell is run a second time and the suffixes are
# already present) rather than mislabelling or double-suffixing columns.
assert five.columns.tolist() == [
    label for frame, _ in five_sources for label in frame.columns
], 'five.columns does not match its five source matrices; rebuild `five` first'

five.columns = [
    label + suffix
    for frame, suffix in five_sources
    for label in frame.columns
]

In [None]:
# addiucn() writes the 'iucn' column into the frame it receives and returns
# that same frame, so each assignment rebinds the name to the object that was
# just updated. Every lookup raises KeyError if a row label is absent from
# the `iucn` table.
het = addiucn(het)
roh = addiucn(roh)
snpphylop = addiucn(snpphylop)
miscons = addiucn(miscons)
miscount = addiucn(miscount)
three = addiucn(three)
five = addiucn(five)

In [None]:
# Load the summary table, using its first CSV column as the row index.
df = pd.read_csv('data/full-summary.csv', index_col = [0])

In [None]:
# Display the summary table as loaded.
df

In [None]:
# Remove the summary table's own 'iucn' column — presumably so it does not
# duplicate the 'iucn' column already added to the frames it is merged with
# below. Running this cell twice raises KeyError because the column is gone.
df = df.drop(['iucn'], axis=1)

In [None]:
# Display the summary table without its 'iucn' column.
df

In [None]:
# Append the summary columns to each matrix. Frames that went through
# normalize() (and `five`, which contains them) use common170; the rest use
# common171. merge() selects rows with .loc, so `df` must contain every
# listed label or a KeyError is raised.
het1 = merge([het, df], common171)
roh1 = merge([roh, df], common171)
snpphylop1 = merge([snpphylop, df], common171)
miscons1 = merge([miscons, df], common170)
miscount1 = merge([miscount, df], common170)
three1 = merge([three, df], common171)
five1 = merge([five, df], common170)

In [None]:
# Write each labelled matrix to disk: rows whose 'iucn' is not 'DD' go to
# data/<name>.csv and 'DD' rows go to dd-data/<name>-dd.csv.
split(het, 'het-200')
split(roh, 'roh-200')
split(snpphylop, 'snpphylop-200')
split(miscons, 'miscons-200')
split(miscount, 'miscount-200')
split(three, 'three-200')
split(five, 'five-200')

In [None]:
# Same export for the versions that carry the summary columns; the file
# names gain a '-summary' infix.
split(het1, 'het-summary-200')
split(roh1, 'roh-summary-200')
split(snpphylop1, 'snpphylop-summary-200')
split(miscons1, 'miscons-summary-200')
split(miscount1, 'miscount-summary-200')
split(three1, 'three-summary-200')
split(five1, 'five-summary-200')

In [None]:
# List the column names of the first raw file loaded into `raw`.
raw.columns.tolist()

In [None]:
# Count the columns of `raw`.
len(raw.columns)

In [None]:
# NOTE(review): the split() calls in this notebook write names ending in
# '-200' (e.g. 'miscons-summary-200'), so 'data/miscons-summary.csv' is not
# produced here — confirm where this file comes from.
tmp = pd.read_csv('data/miscons-summary.csv')

In [None]:
# Display the file as loaded (no index column was requested).
tmp

In [None]:
# Use the 'Unnamed: 0' column as the row index; the column itself stays in
# the frame. Presumably this is the index that to_csv saved without a header
# name — confirm against the file.
tmp.index = tmp['Unnamed: 0']

In [None]:
# Distinct values present in the 'iucn' column.
set(tmp['iucn'])