In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools as itls
import scipy.stats as stats
import sklearn.utils
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
def collector(name, org, path, levels=(45, 50, 62, 80, 90)):
    """Load the paired "close" and "distal" score tables for one organism.

    For every value ``N`` in ``levels`` two header-less, comma-separated
    files are read, in this order:

        <path><name>/<org>_close<N>_2.csv
        <path><name>/<org>_distal<N>_2.csv

    ``path`` is concatenated verbatim, so it must already end with a path
    separator.

    Parameters
    ----------
    name : str
        Sub-directory of ``path`` that holds the organism's files.
    org : str
        Prefix of the organism's file names.
    path : str
        Base directory, including the trailing separator.
    levels : iterable of int, optional
        Numbers embedded in the file names (presumably the BLOSUM matrix
        numbers -- confirm against the data producer). Defaults to the five
        values the notebook has always loaded.

    Returns
    -------
    tuple of (list of DataFrame, list of DataFrame)
        The close tables and the distal tables, each in ``levels`` order,
        with columns ``Query``, ``SwissProt`` and ``Score``.
    """
    titles = ['Query', 'SwissProt', 'Score']
    prefix = path + name + '/' + org

    collection = []
    collection2 = []

    for level in levels:
        for kind, bucket in (('close', collection), ('distal', collection2)):
            csv_path = '{}_{}{}_2.csv'.format(prefix, kind, level)
            # The separator must be passed by keyword: pandas >= 2.0 accepts
            # only the file path positionally.
            bucket.append(pd.read_csv(csv_path, sep=',', names=titles))

    return collection, collection2

In [3]:
# Root directory of the per-organism score CSVs. collector() concatenates this
# string as-is, so the trailing '/' is required. Kept in one place so the
# notebook can be pointed at another machine's data with a single edit.
DATA_DIR = '/home/cg2721/Data Analysis/'

c_rfelis, d_rfelis = collector('Rfelis', 'rf', DATA_DIR)
c_ecoli, d_ecoli = collector('Ecoli', 'ec', DATA_DIR)
c_mtb, d_mtb = collector('Mtb', 'mt', DATA_DIR)

In [4]:
#Find the differences between the close and distal scores for each organism
def differences(close, distal, seed=None):
    """Pair-wise close-minus-distal score differences, one list per table pair.

    For each ``(close[k], distal[k])`` pair:

    1. Keep only the ``Score`` values that are less than or equal to that
       table's own 75th percentile (i.e. the upper tail is dropped).
    2. Draw, with replacement, as many values from the kept close scores as
       there are kept distal scores.
    3. Subtract the kept distal scores, in their original order, from the
       drawn close scores.

    Parameters
    ----------
    close, distal : sequence of DataFrame
        Tables with a numeric ``Score`` column; paired by position.
    seed : int or None, optional
        Seed for the resampling in step 2. ``None`` (the default) seeds from
        system entropy, so the result differs between calls; pass an integer
        for a reproducible result.

    Returns
    -------
    list of list
        ``diffs[k]`` holds the differences for the k-th table pair.

    Raises
    ------
    ValueError
        If a pair has distal scores to match but no close scores to draw from.
    """
    # A private generator keeps the resampling reproducible when seeded and
    # leaves the global `random` state untouched.
    rng = random.Random(seed)

    def kept_scores(frame):
        # The percentile is taken on the Score column only: quantile() over
        # the whole frame fails on the string columns with pandas >= 2.0.
        scores = frame['Score']
        return scores[scores <= scores.quantile(.75)].tolist()

    diffs = []

    # One iteration per table pair (the original comment calls these BLOSUMX).
    for cx, dx in zip(close, distal):
        close_kept = kept_scores(cx)
        distal_kept = kept_scores(dx)

        if distal_kept and not close_kept:
            raise ValueError('no close scores available to resample from')

        # Resample the close scores (with replacement) to the distal size so
        # the two lists can be subtracted element by element.
        close_sample = [rng.choice(close_kept) for _ in distal_kept]

        diffs.append([c - d for c, d in zip(close_sample, distal_kept)])

    return diffs

In [5]:
# Close-vs-distal score differences for each organism, computed on the full
# (unsplit) tables.
rf_diffs, ec_diffs, mt_diffs = (
    differences(near, far)
    for near, far in (
        (c_rfelis, d_rfelis),
        (c_ecoli, d_ecoli),
        (c_mtb, d_mtb),
    )
)

In [6]:
# splitting training & testing

# Seed for the 70/30 partition. Without it every run of this cell produces a
# different split, so the averages printed further down cannot be reproduced.
SPLIT_SEED = 0


def _split_each(frames, test_size=.3, random_state=None):
    """Split every table in ``frames`` into a train part and a test part.

    Returns ``(train_parts, test_parts)``, two lists in the same order as
    ``frames``. ``test_size`` and ``random_state`` are passed straight to
    ``train_test_split``.
    """
    train_parts = []
    test_parts = []
    for frame in frames:
        train, test = train_test_split(frame, test_size=test_size,
                                       random_state=random_state)
        train_parts.append(train)
        test_parts.append(test)
    return train_parts, test_parts


# A single generator is shared by all calls so each table gets its own draw,
# while re-running the cell still reproduces the same partition.
_split_rng = np.random.RandomState(SPLIT_SEED)

c_rfelis_train, c_rfelis_test = _split_each(c_rfelis, random_state=_split_rng)
d_rfelis_train, d_rfelis_test = _split_each(d_rfelis, random_state=_split_rng)
c_ecoli_train, c_ecoli_test = _split_each(c_ecoli, random_state=_split_rng)
d_ecoli_train, d_ecoli_test = _split_each(d_ecoli, random_state=_split_rng)
c_mtb_train, c_mtb_test = _split_each(c_mtb, random_state=_split_rng)
d_mtb_train, d_mtb_test = _split_each(d_mtb, random_state=_split_rng)

In [7]:
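#Perform Kruskal-Wallis analysis on the score differences (currently disabled)
#NOTE: rf_diffs / ec_diffs are built in In [5] from the full tables, not from the training split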
#st, p = stats.kruskal(rf_diffs[3], ec_diffs[3])
#print (p)

In [8]:
# Mean Score of each of the five training tables, per organism and per
# close/distal group.
(rfc_avgs_train, rfd_avgs_train,
 ecc_avgs_train, ecd_avgs_train,
 mtc_avgs_train, mtd_avgs_train) = (
    [np.mean(tables[k]['Score'].tolist()) for k in range(5)]
    for tables in (
        c_rfelis_train, d_rfelis_train,
        c_ecoli_train, d_ecoli_train,
        c_mtb_train, d_mtb_train,
    )
)

In [9]:
# Close mean minus distal mean for each of the five training tables.
rf_diff_avgs_train, ec_diff_avgs_train, mt_diff_avgs_train = (
    [near[k] - far[k] for k in range(5)]
    for near, far in (
        (rfc_avgs_train, rfd_avgs_train),
        (ecc_avgs_train, ecd_avgs_train),
        (mtc_avgs_train, mtd_avgs_train),
    )
)

In [10]:
# Report the training-split mean differences, one organism per line.
print(
    'R.felis differences: {}'.format(rf_diff_avgs_train),
    'E.coli differences: {}'.format(ec_diff_avgs_train),
    'M.tb differences: {}'.format(mt_diff_avgs_train),
    sep='\n',
)

R.felis differences: [220.18957977163058, 216.58931576811275, 223.98647182423838, 236.06529331473095, 232.92129620044193]
E.coli differences: [289.6026640198411, 285.50371126301786, 291.6987378954384, 305.39607231871975, 303.9524190750079]
M.tb differences: [139.5519753622341, 139.47384859645257, 139.84970992076882, 146.96835089299117, 144.61688210532097]


In [11]:
# Testing data (30%)

In [12]:
# Mean Score of each of the five held-out (test) tables, per organism and per
# close/distal group.
(rfc_avgs_test, rfd_avgs_test,
 ecc_avgs_test, ecd_avgs_test,
 mtc_avgs_test, mtd_avgs_test) = (
    [np.mean(tables[k]['Score'].tolist()) for k in range(5)]
    for tables in (
        c_rfelis_test, d_rfelis_test,
        c_ecoli_test, d_ecoli_test,
        c_mtb_test, d_mtb_test,
    )
)

In [13]:
# Close mean minus distal mean for each of the five test tables.
rf_diff_avgs_test, ec_diff_avgs_test, mt_diff_avgs_test = (
    [near[k] - far[k] for k in range(5)]
    for near, far in (
        (rfc_avgs_test, rfd_avgs_test),
        (ecc_avgs_test, ecd_avgs_test),
        (mtc_avgs_test, mtd_avgs_test),
    )
)

In [15]:
# Report the test-split mean differences, one organism per line.
print(
    'R.felis differences: {}'.format(rf_diff_avgs_test),
    'E.coli differences: {}'.format(ec_diff_avgs_test),
    'M.tb differences: {}'.format(mt_diff_avgs_test),
    sep='\n',
)

R.felis differences: [223.7528695147949, 217.7568750883488, 223.94798363328806, 231.38334067760826, 236.67798350094728]
E.coli differences: [291.8143529783455, 284.52059647568205, 292.9379522152405, 303.94054610964224, 308.96918038380306]
M.tb differences: [138.8576259955364, 138.4337656337375, 141.6933251095912, 146.72007552420624, 147.2762200169402]
