In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import csv

In [122]:
def get_train_test(df, y_col, x_cols, ratio, random_state=None):
    """Randomly split ``df`` into a training and a test partition.

    Every row is assigned independently: a uniform draw in [0, 1) is made per
    row and the row goes to the training partition when the draw is below
    ``ratio``. The realised train fraction therefore only approximates
    ``ratio``, and either partition can be empty for small frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    y_col : column label
        Target column.
    x_cols : list of column labels
        Feature columns.
    ratio : float
        Threshold for the per-row draw (approximate train fraction).
    random_state : int or None, optional
        Seed for the row assignment. With the default ``None`` the draw comes
        from NumPy's global random state, exactly as before; passing a seed
        makes the split reproducible.

    Returns
    -------
    tuple
        ``(df_train, df_test, X_train, Y_train, X_test, Y_test)`` where the
        ``X_*`` / ``Y_*`` items are the ``.values`` arrays of the feature and
        target columns of the corresponding partition.
    """
    if random_state is None:
        draws = np.random.rand(len(df))
    else:
        # A private generator keeps a seeded split from disturbing (or being
        # disturbed by) other users of the global NumPy random state.
        draws = np.random.RandomState(random_state).rand(len(df))
    mask = draws < ratio

    df_train = df[mask]
    df_test = df[~mask]

    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    X_train = df_train[x_cols].values
    X_test = df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test

    

In [123]:
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers=5, classifiers=None):
    """Fit a batch of classifiers and collect their train/test scores.

    Parameters
    ----------
    X_train, Y_train : training features and targets, passed to ``fit``.
    X_test, Y_test : held-out features and targets, passed to ``score``.
    no_classifiers : int, optional
        Only the first ``no_classifiers`` entries (in dict order) are used.
    classifiers : dict or None, optional
        Mapping of display name -> estimator exposing ``fit`` and ``score``.
        Defaults to the module-level ``dict_classifiers``.

    Returns
    -------
    dict
        ``{name: {'model': fitted_estimator, 'train_score': ..., 'test_score': ...}}``
        where the scores are whatever ``estimator.score`` returns.

    Notes
    -----
    Each estimator is deep-copied before fitting. Previously the shared
    instances were fitted in place, so the ``'model'`` returned for one
    dataset was silently refitted by the next call on another dataset.
    """
    import copy  # local import: keeps this cell runnable without touching the import cell

    if classifiers is None:
        classifiers = dict_classifiers

    dict_models = {}
    for classifier_name, prototype in list(classifiers.items())[:no_classifiers]:
        # Fit a private copy so results from different calls stay independent.
        classifier = copy.deepcopy(prototype)
        classifier.fit(X_train, Y_train)
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)

        dict_models[classifier_name] = {
            'model': classifier,
            'train_score': train_score,
            'test_score': test_score,
        }
    return dict_models

def display_dict(dict_models, sort_by='test_score'):
    """Show the scores collected by ``batch_classify`` as a sorted table.

    Parameters
    ----------
    dict_models : dict
        ``{name: {'train_score': ..., 'test_score': ..., ...}}``.
    sort_by : str, optional
        Column to sort on, in descending order: ``'classifier'``,
        ``'train_score'`` or ``'test_score'``.

    Returns
    -------
    None
        The sorted frame is handed to ``display`` (expected to be the
        notebook-provided ``display`` function).
    """
    names = list(dict_models)
    # Build the table in one go. The previous version pre-filled a float
    # frame with zeros and then wrote strings into the 'classifier' column
    # cell by cell, which relies on pandas silently changing the column dtype.
    df_ = pd.DataFrame(
        {
            'classifier': names,
            'train_score': [dict_models[name]['train_score'] for name in names],
            'test_score': [dict_models[name]['test_score'] for name in names],
        },
        columns=['classifier', 'train_score', 'test_score'],
    ).astype({'train_score': float, 'test_score': float})

    display(df_.sort_values(by=sort_by, ascending=False))

In [180]:
# Directory that holds every dataset CSV. Kept in one place so the notebook
# can be pointed at another checkout by editing a single line.
DATA_DIR = r"C:\Users\anlan\workspace\gmu-hackathon"

# Display label -> CSV file name inside DATA_DIR.
# NOTE(review): the "data_w-o_*" names presumably mean "data without <feature>";
# confirm against whatever produced these files.
_DATASET_FILES = {
    "Overall set": "1528898320_9518383_train-fin.csv",
    "free sulfur removed": "data_w-o_freeSeulf.csv",
    "total sulfur removed": "data_w-o_totSulf.csv",
    "pH": "data_w-o_pH.csv",
    "pH and outliers": "data_w-o_pH_outliers.csv",
    "citric": "data_w-o_citric.csv",
    "citric and outliers": "data_w-o_citric_outliers.csv",
    "overall and outliers": "data_w-o_outliers.csv",
}

# Display label -> full path. The backslash separator is written explicitly
# so the resulting strings are the same on every platform.
dict_files = {
    label: DATA_DIR + "\\" + filename
    for label, filename in _DATASET_FILES.items()
}

In [200]:
def big_func(filepath, user_def_ratio, name):
    """Evaluate the classifier batch on one dataset and show the score table.

    Reads the CSV at ``filepath``, uses the ``'Quality'`` column as the target
    and every other column as a feature, splits the rows with
    ``get_train_test`` using ``user_def_ratio``, fits the first 8 classifiers
    via ``batch_classify``, then prints ``name`` followed by the table
    rendered by ``display_dict``.
    """
    target = 'Quality'
    frame = pd.read_csv(filepath)

    # Feature list = all columns minus the target.
    features = list(frame.columns.values)
    features.remove(target)

    # Only the array forms of the split are needed here; the two partition
    # frames returned first are discarded.
    _, _, X_train, Y_train, X_test, Y_test = get_train_test(
        frame, target, features, user_def_ratio
    )
    scores = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers=8)

    print(name)
    display_dict(scores)
    # Optional follow-up, currently disabled:
    #samples_split_testing(X_train, Y_train, X_test, Y_test)


In [201]:
# Candidate models keyed by the label shown in the score tables.
# batch_classify takes the first `no_classifiers` entries in this order.
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    #"Nearest Neighbors": KNeighborsClassifier(),
    # SVC() with no arguments uses scikit-learn's default RBF kernel, so the
    # entry is labelled for the model that is actually fitted (it used to be
    # labelled "Linear SVM").
    "RBF SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=100),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    #"Neural Net": MLPClassifier(alpha = 1),
    #"Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Gaussian Process": GaussianProcessClassifier()
}

In [203]:

# Split ratio handed to get_train_test for every dataset.
user_def_ratio = 0.7

# Run the full evaluation once per dataset, labelling each table with its key.
for dataset_name, dataset_path in dict_files.items():
    big_func(dataset_path, user_def_ratio, dataset_name)




Overall set


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.941681,0.763889
4,Random Forest,0.833619,0.74537
5,AdaBoost,0.830189,0.740741
1,Linear SVM,0.878216,0.736111
7,Gaussian Process,0.987993,0.722222
0,Logistic Regression,0.715266,0.708333
3,Decision Tree,1.0,0.685185
6,QDA,0.756432,0.685185




free sulfur removed


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.953819,0.783898
4,Random Forest,0.831261,0.766949
0,Logistic Regression,0.699822,0.754237
3,Decision Tree,1.0,0.754237
5,AdaBoost,0.834813,0.75
6,QDA,0.731794,0.728814
1,Linear SVM,0.994671,0.661017
7,Gaussian Process,1.0,0.648305




total sulfur removed


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.942238,0.791837
4,Random Forest,0.830325,0.746939
3,Decision Tree,1.0,0.726531
5,AdaBoost,0.82491,0.726531
0,Logistic Regression,0.698556,0.702041
6,QDA,0.741877,0.685714
1,Linear SVM,0.980144,0.62449
7,Gaussian Process,1.0,0.62449




pH


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.954296,0.769841
4,Random Forest,0.824497,0.746032
5,AdaBoost,0.846435,0.742063
3,Decision Tree,1.0,0.694444
0,Logistic Regression,0.734918,0.68254
6,QDA,0.745887,0.670635
7,Gaussian Process,0.934186,0.650794
1,Linear SVM,0.800731,0.634921




pH and outliers


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.930605,0.788793
4,Random Forest,0.845196,0.758621
0,Logistic Regression,0.713523,0.737069
3,Decision Tree,1.0,0.737069
5,AdaBoost,0.830961,0.737069
6,QDA,0.720641,0.706897
1,Linear SVM,0.822064,0.646552
7,Gaussian Process,0.925267,0.62931




citric


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.93186,0.777344
3,Decision Tree,1.0,0.761719
4,Random Forest,0.832413,0.730469
5,AdaBoost,0.821363,0.722656
6,QDA,0.729282,0.710938
0,Logistic Regression,0.709024,0.707031
1,Linear SVM,0.802947,0.632812
7,Gaussian Process,0.92081,0.632812




citric and outliers


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,0.932143,0.811966
5,AdaBoost,0.816071,0.777778
4,Random Forest,0.828571,0.769231
3,Decision Tree,1.0,0.739316
0,Logistic Regression,0.698214,0.717949
6,QDA,0.707143,0.679487
1,Linear SVM,0.830357,0.67094
7,Gaussian Process,0.921429,0.65812




overall and outliers


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,0.83829,0.796875
2,Gradient Boosting Classifier,0.929368,0.773438
5,AdaBoost,0.845725,0.75
3,Decision Tree,1.0,0.738281
6,QDA,0.730483,0.726562
0,Logistic Regression,0.715613,0.722656
1,Linear SVM,0.888476,0.71875
7,Gaussian Process,0.983271,0.703125


In [194]:
from sklearn.metrics import roc_curve, auc

def samples_split_testing(X_train, Y_train, X_test, Y_test):
    """Plot train and test AUC of a random forest for a range of ``min_samples_split``.

    For each of ten evenly spaced values from 0.1 to 1.0 a fresh
    ``RandomForestClassifier`` is fitted on the training data, and the area
    under the ROC curve is computed on both the training and the test data.
    The two resulting curves are drawn against the parameter value.

    Note that the ROC curve is built from the labels returned by ``predict``,
    not from predicted probabilities.
    """

    def auc_of_predictions(model, X, y_true):
        # Area under the ROC curve of the model's predicted labels on X.
        fpr, tpr, _ = roc_curve(y_true, model.predict(X))
        return auc(fpr, tpr)

    split_values = np.linspace(0.1, 1.0, 10, endpoint=True)
    train_auc = []
    test_auc = []
    for split_value in split_values:
        forest = RandomForestClassifier(min_samples_split=split_value)
        forest.fit(X_train, Y_train)
        train_auc.append(auc_of_predictions(forest, X_train, Y_train))
        test_auc.append(auc_of_predictions(forest, X_test, Y_test))

    from matplotlib.legend_handler import HandlerLine2D

    train_line, = plt.plot(split_values, train_auc, 'b', label="Train AUC")
    plt.plot(split_values, test_auc, 'r', label="Test AUC")
    # Only the train line gets the custom two-marker legend handler.
    plt.legend(handler_map={train_line: HandlerLine2D(numpoints=2)})
    plt.ylabel('AUC score')
    plt.xlabel('min samples split')
    plt.show()

In [169]:
# Load the unlabeled test file and drop the two columns excluded from the
# feature set used for the submission model.
wine_test = pd.read_csv(
    r"C:\Users\anlan\workspace\gmu-hackathon\1528897205_8734903_test-fin.csv"
).drop(columns=['citric acid', 'free sulfur dioxide'])

# NOTE(review): X_train_citric / Y_train_citric are not defined in any cell
# visible in this notebook, so they presumably come from an earlier kernel
# session. Confirm they were built without the two columns dropped above.
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train_citric, Y_train_citric)

# Predictions for the submission file written in the next cell.
results = rf.predict(wine_test)

In [170]:
# Write the predictions to the submission file, formatted as integers.
np.savetxt(
    r"C:\Users\anlan\workspace\gmu-hackathon\attempt2.txt",
    results,
    fmt="%d",
)