In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import csv

In [57]:
wine_df=pd.read_csv(r"C:\Users\anlan\workspace\gmu-hackathon\1528898320_9518383_train-fin.csv")
wine_test=pd.read_csv(r"C:\Users\anlan\workspace\gmu-hackathon\1528897205_8734903_test-fin.csv")
def get_train_test(df, y_col, x_cols, ratio):
    mask=np.random.rand(len(df)) < ratio
    df_train=df[mask]
    df_test=df[~mask]
    
    Y_train=df_train[y_col].values
    Y_test=df_test[y_col].values
    X_train=df_train[x_cols].values
    X_test=df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test

    

In [33]:
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers=5):
    dict_models={}
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        classifier.fit(X_train, Y_train)
        train_score=classifier.score(X_train, Y_train)
        test_score=classifier.score(X_test, Y_test)
        
        dict_models[classifier_name]={'model':classifier, 'train_score':train_score, 'test_score':test_score}
    return dict_models

def display_dict(dict_models, sort_by='test_score'):
    cls=[key for key in dict_models.keys()]
    test_s=[dict_models[key]['test_score'] for key in cls]
    training_s=[dict_models[key]['train_score'] for key in cls]
    
    df_=pd.DataFrame(data=np.zeros(shape=(len(cls), 3)), columns=['classifier', 'train_score', 'test_score'])
    for ii in range(0, len(cls)):
        df_.loc[ii, 'classifier']=cls[ii]
        df_.loc[ii, 'train_score']=training_s[ii]
        df_.loc[ii, 'test_score']=test_s[ii]
    
    display(df_.sort_values(by=sort_by, ascending=False))

In [34]:
wine_y_col='Quality'
wine_x_cols=list(wine_df.columns.values)
wine_x_cols.remove(wine_y_col)
user_def_ratio=.7
df_train, df_test, X_train, Y_train, X_test, Y_test=get_train_test(wine_df, wine_y_col, wine_x_cols, user_def_ratio)

In [55]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    #"Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    #"Neural Net": MLPClassifier(alpha = 1),
    #"Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Gaussian Process": GaussianProcessClassifier()
}

In [56]:
dict_models = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 8)
display_dict(dict_models)



Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.0,0.786957
3,Decision Tree,1.0,0.76087
2,Gradient Boosting Classifier,1.0,0.73913
7,Gaussian Process,0.98594,0.717391
1,Linear SVM,0.87522,0.704348
0,Logistic Regression,0.724077,0.7
6,QDA,0.72935,0.682609
5,AdaBoost,0.841828,0.678261


In [None]:
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_results = []
test_results = []
for estimator in n_estimators:
   rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1)
   rf.fit(X_train, Y_train)
   train_pred = rf.predict(X_train)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, train_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   train_results.append(roc_auc)
   y_pred = rf.predict(X_test)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, y_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(n_estimators, train_results, ‘b’, label=”Train AUC”)
line2, = plt.plot(n_estimators, test_results, ‘r’, label=”Test AUC”)
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel(‘AUC score’)
plt.xlabel(‘n_estimators’)
plt.show()

In [54]:
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results = []
test_results = []
for min_samples_split in min_samples_splits:
   rf = RandomForestClassifier(min_samples_split=min_samples_split)
   rf.fit(X_train, Y_train)
   train_pred = rf.predict(X_train)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, train_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   train_results.append(roc_auc)
   y_pred = rf.predict(X_test)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, y_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(min_samples_splits, train_results, 'b', label="Train AUC")
line2, = plt.plot(min_samples_splits, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('min samples split')
plt.show()



NameError: name 'roc_curve' is not defined

In [59]:
rf=RandomForestClassifier(n_estimators=1000)
rf.fit(X_train, Y_train)
results=rf.predict(wine_test)

In [64]:
np.savetxt(r"C:\Users\anlan\workspace\gmu-hackathon\attempt1.txt", results, fmt="%d")