In [1]:
import warnings; warnings.simplefilter("ignore")
#importing important libraries
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as  sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [2]:
# datafile upload and data preparation
df = pd.read_csv("diffreport3.csv", sep= ",")

d1 = df.drop("name", axis = 1)
d2 = d1.drop("isotopes", axis = 1)
d3 = d2.drop("adduct", axis = 1)
d4 = d3.drop("tstat", axis = 1)
d5 = d4.drop("pvalue", axis = 1)
d6 = d5.drop("fold", axis = 1)
d7 = d6.drop(d6.columns[0], axis = 1)
d8 = d7.drop("npeaks", axis = 1)
d9 = d8.drop("Eta6", axis = 1)
d10 = d9.drop("Eta8", axis = 1)
columns = ['Eta6_0', 'Eta6_2', 'Eta6_3', 'Eta8.1', 'Eta82', 'Eta83', 'Seq_ID']
df1 = pd.DataFrame(d10, columns = columns)

In [17]:
#making a smaller dataset for test purposes
df1 = df1[0:200]

In [18]:
# creation of train and testing sets

def get_train_test(df, y_col, x_cols, ratio):

    mask = np.random.rand(len(df)) > ratio
    df_train = df[mask]
    df_test = df[~mask]
       
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    X_train = df_train[x_cols].values
    X_test = df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test
 
y_col = 'Seq_ID'
x_cols = list(df1.columns.values)
x_cols.remove(y_col)
 
train_test_ratio = 0.7
df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test(df1, y_col, x_cols, train_test_ratio)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [20]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "Neural Net": MLPClassifier(alpha = 1),
    "Naive Bayes": GaussianNB(),
    #"AdaBoost": AdaBoostClassifier(),
    #"QDA": QuadraticDiscriminantAnalysis(),
    #"Gaussian Process": GaussianProcessClassifier()
}

In [21]:
import time
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 5, verbose = True):
    
    dict_models = {}
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        t_start = time.clock()
        classifier.fit(X_train, Y_train)
        t_end = time.clock()
        
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        
        dict_models[classifier_name] = {'model': classifier, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models
 
 
 
def display_dict_models(dict_models, sort_by='test_score'):
    cls = [key for key in dict_models.keys()]
    test_s = [dict_models[key]['test_score'] for key in cls]
    training_s = [dict_models[key]['train_score'] for key in cls]
    training_t = [dict_models[key]['train_time'] for key in cls]
    
    df_ = pd.DataFrame(data=np.zeros(shape=(len(cls),4)), columns = ['classifier', 'train_score', 'test_score', 'train_time'])
    for ii in range(0,len(cls)):
        df_.loc[ii, 'classifier'] = cls[ii]
        df_.loc[ii, 'train_score'] = training_s[ii]
        df_.loc[ii, 'test_score'] = test_s[ii]
        df_.loc[ii, 'train_time'] = training_t[ii]
    
    display(df_.sort_values(by=sort_by, ascending=False))

In [22]:
dict_models = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 8)
display_dict_models(dict_models)

trained Logistic Regression in 1.02 s
trained Nearest Neighbors in 0.04 s
trained Linear SVM in 0.00 s
trained Gradient Boosting Classifier in 0.37 s
trained Decision Tree in 0.00 s
trained Random Forest in 1.33 s
trained Neural Net in 0.11 s
trained Naive Bayes in 0.00 s


Unnamed: 0,classifier,train_score,test_score,train_time
6,Neural Net,0.433333,0.542857,0.107264
4,Decision Tree,1.0,0.514286,0.000872
1,Nearest Neighbors,0.716667,0.507143,0.043409
3,Gradient Boosting Classifier,1.0,0.5,0.367188
5,Random Forest,1.0,0.5,1.327682
0,Logistic Regression,0.633333,0.492857,1.016784
2,Linear SVM,1.0,0.471429,0.000993
7,Naive Bayes,0.6,0.428571,0.00084
