In [None]:
import sqlite3
from pandas import DataFrame
from numpy import log10
from numpy import inf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
import rfpimp
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap
# Open the SQLite database file (path is relative to the current working
# directory) and create the cursor used by the query cell below.
# NOTE: sqlite3.connect() creates a new, empty database when the file does not
# exist, so a wrong working directory shows up later as a "no such table" error.
db = sqlite3.connect('./finalDupCompDatabase')
cursor = db.cursor()

In [None]:
# fetch and clean data: WGD vs SSD model
# Pull the 18 predictor columns plus the duplication category for every gene
# labelled WGD or SSD whose RVIS value is not the text placeholder 'NA'.
# String literals are single-quoted: double quotes denote identifiers in SQL and
# are only accepted as strings by SQLite as a legacy quirk that can be disabled.
cursor.execute("""SELECT
                    genLen,
                    cdsLen,
                    evolRate,
                    u_domains,
                    gc3,
                    domains,
                    specificity,
                    transCount,
                    intCount,
                    intAvg,
                    max_exp,
                    motif_number_1k,
                    intCov,
                    mis_Z_score,
                    loftool_percentile,
                    s_het,
                    Phi,
                    RVIS,
                    dupCat_maj
                FROM
                    gene_features
                WHERE dupCat_maj IN ('WGD', 'SSD')
                 AND RVIS != 'NA'""")

data = cursor.fetchall()

# Display names are listed in the same order as the SELECT list above; the
# last column is the class label. Passing them to the constructor (instead of
# assigning afterwards) keeps an empty result set from raising a
# length-mismatch error. Rows with any missing value are then dropped.
dfBoth = DataFrame(
    data,
    columns=['Genomic length','CDS len','Evolution rate','Unique domains','GC3%',
             'Domains','Specificity','Isoform count','Intron count','Mean intron length',
             'Maximal expression','Regulatory motif count',
             'Intron coverage%','Missense Z score','LoFtool percentile','S$_{het}$','Phi','RVIS',
             'Duplication type'],
).dropna()

# Class balance of the cleaned data set (shown as the cell output).
dfBoth['Duplication type'].value_counts()

In [None]:
# Feature interdependency: build rfpimp's feature dependence matrix from the
# first 18 columns of dfBoth (the class label in the last column is left out),
# then draw, save and display it as a heatmap.
predictorFrame = dfBoth.iloc[:, :18]
mat = rfpimp.feature_dependence_matrix(predictorFrame)
viz2 = rfpimp.plot_dependence_heatmap(
    mat,
    figsize=[15, 10],
    label_fontsize=13,
    value_fontsize=12,
    threshold=-1,
)
viz2.save('rfdep_WGD_SSD_current.svg')
viz2.view()

In [None]:
# test/train splitting
# Encode the class label as an integer: WGD -> 1 (positive class), SSD -> 0.
dfBothBinary = dfBoth.replace({'WGD': 1, 'SSD': 0})

# First 18 columns are the predictors, column 18 is the label.
# NOTE: the split is unseeded, so the partition (and every score below)
# changes from run to run.
train_x, test_x, train_y, test_y = train_test_split(
    dfBothBinary[dfBothBinary.columns[:18]],
    dfBothBinary[dfBothBinary.columns[18]],
    train_size = 0.8)

train_X, train_Y = train_x,train_y

# no optimisation fit and accuracy check
rf = RandomForestClassifier(class_weight='balanced')
rf.fit(train_X,train_Y)
print(train_Y.value_counts())

# Predict the held-out set once and reuse the result for every metric.
baselinePred = rf.predict(test_x)
tn, fp, fn, tp = confusion_matrix(test_y, baselinePred).ravel()
print(tn,fp,fn,tp)
print('Overall accuracy:', accuracy_score(test_y, baselinePred))
print('F1 score: ', f1_score(test_y, baselinePred))
print('')
print('WGD recall:', tp/(tp+fn))
print('WGD precision:', tp/(tp+fp))
print('')
print('SSD recall:', tn/(tn+fp))
print('SSD precision:', tn/(tn+fn))

# hyperparameter selection - initial broad search followed by fine tuning
# broad search
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 50, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 100 random draws from the grid, each scored by 10-fold cross-validated F1
# on the training partition only; all available cores are used.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                               n_iter = 100, cv = 10, verbose=2, n_jobs = -1,
                               scoring='f1')
# Fit the random search model
rf_random.fit(train_X, train_Y)
print(rf_random.best_params_)

In [None]:
# after search
# Take the best configuration found by the randomized search and fit it again
# on the training partition before scoring it on the held-out set.
rfBestBoth = rf_random.best_estimator_
rfBestBoth.fit(train_X,train_Y)

# Predict the held-out set once and reuse the result for every metric.
bestPred = rfBestBoth.predict(test_x)
tn, fp, fn, tp = confusion_matrix(test_y, bestPred).ravel()
print(tn,fp,fn,tp)
print('Overall accuracy:', accuracy_score(test_y, bestPred))
print('F1 score: ',f1_score(test_y, bestPred))
print('')
print('WGD recall:', tp/(tp+fn))
print('WGD precision:', tp/(tp+fp))
print('')
print('SSD recall:', tn/(tn+fp))
print('SSD precision:', tn/(tn+fn))

# Restrict 'coolwarm' to its 0.6-0.85 band so the cell colours stay light
# enough for black text to remain readable.
origCmap = cm.get_cmap('coolwarm', 512)
newCmap = ListedColormap(origCmap(np.linspace(0.6, 0.85, 256)))

# normalize='pred' scales each predicted-label column to sum to 1, so the
# diagonal cells show per-class precision.
matVis = plot_confusion_matrix(rfBestBoth, test_x, test_y,
                               display_labels=['SSD','WGD'],
                               cmap=newCmap, normalize='pred')

# matVis.text_ holds one text artist per matrix cell; restyle all of them.
for subList in matVis.text_:
    for text in subList:
        text.set_color('k')
        text.set_fontsize(14)

matVis.ax_.set_ylabel('True label',size=14)
matVis.ax_.set_xlabel('Predicted label',size=14)
matVis.ax_.xaxis.set_tick_params(labelsize=12)
matVis.ax_.yaxis.set_tick_params(labelsize=12)
plt.savefig('finalSSDvsWGD_confusion_matrix.svg',bbox_inches='tight')
plt.show()

In [None]:
# check feature importances for selected hyperparameters:
# Repeats an unseeded 80/20 split 100 times, refits the tuned forest each time
# and records, per feature, its rfpimp importance on the held-out part and its
# rank (1 = most important). A synthetic entry sums the importances of the
# length-related features so they can be ranked as a group.
# NOTE: the loop overwrites the notebook-level train/test variables.
feats = list(dfBothBinary.columns[:18])

combinedLabel = 'Length and intron count'
# These names must match the DataFrame column labels exactly (case-sensitive);
# a mismatch would silently drop that feature from the combined sum.
combinedMembers = {'Genomic length', 'CDS len', 'Intron count'}
assert combinedMembers <= set(feats), 'combined feature refers to unknown columns'
feats.append(combinedLabel)

rankingDict = {feature: [] for feature in feats}  # feature -> rank per iteration
impDict = {feature: [] for feature in feats}      # feature -> importance per iteration
rankDict = {}                                     # feature -> [mean rank, std of rank]

for i in range(100):
    train_x, test_x, train_y, test_y = train_test_split(
        dfBothBinary[dfBothBinary.columns[:18]],
        dfBothBinary[dfBothBinary.columns[18]],
        train_size = 0.8)
    train_X, train_Y = train_x, train_y

    rfBestBoth.fit(train_X, train_Y)

    # (feature name, importance) pairs measured on the held-out split.
    imps = list(rfpimp.importances(rfBestBoth, test_x, test_y).itertuples(name=None))
    imps.append((combinedLabel, sum(x[1] for x in imps if x[0] in combinedMembers)))

    # Highest importance first; position in this list gives the rank.
    imps = sorted(imps, key=lambda x: x[1], reverse=True)
    rankedNames = [x[0] for x in imps]
    impByName = dict(imps)

    for feature in feats:
        rankingDict[feature].append(rankedNames.index(feature) + 1)
        impDict[feature].append(impByName[feature])

for f in rankingDict:
    rankDict[f] = [np.mean(rankingDict[f]), np.std(rankingDict[f])]
print(rankDict)
print('')
for f in impDict:
    print(f, np.mean(impDict[f]))

In [None]:
# Dot-and-whisker chart of mean importance rank (+/- one standard deviation)
# per feature, best-ranked feature at the top. rankDict[f] is read as
# [mean rank, std of rank].
fig, ax = plt.subplots(figsize=(7,10))
plotList = sorted(rankDict, key=lambda name: rankDict[name][0])

# Feature names are written as text next to each row, so hide the y axis and
# every spine except the bottom one.
ax.yaxis.set_visible(False)
for side in ('left', 'right', 'top'):
    ax.spines[side].set_visible(False)
ax.set_xlabel('Mean importance rank\n(100 iterations)', fontsize=16)
ax.tick_params(labelsize=16)

for i, f in enumerate(plotList):
    vPos = 30 - i  # successive features are drawn one unit lower
    point, sd = rankDict[f][0], rankDict[f][1]
    lo, hi = point - sd, point + sd

    # marker at the mean rank, with the feature name to its left
    ax.plot(point, vPos, 'ko')
    ax.text(-15, vPos - 0.1, f, fontsize=16)
    # horizontal whisker spanning mean +/- sd
    ax.plot([lo, hi], [vPos, vPos], 'k-')

    # vertical end caps of the whisker
    ax.plot([lo, lo], [vPos - 0.2, vPos + 0.2], 'k-')
    ax.plot([hi, hi], [vPos - 0.2, vPos + 0.2], 'k-')
plt.savefig('WGDvsSSD_rf_imp_plot_current.svg',bbox_inches='tight')
plt.show()