In [226]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import csv

In [227]:
def get_train_test(df, y_col, x_cols, ratio):
    mask=np.random.rand(len(df)) < ratio
    df_train=df[mask]
    df_test=df[~mask]
    
    Y_train=df_train[y_col].values
    Y_test=df_test[y_col].values
    X_train=df_train[x_cols].values
    X_test=df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test

    

In [244]:
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers=5):
    dict_models={}
    train_score=0
    test_score=0
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        for i in range (0, 10):
            classifier.fit(X_train, Y_train)
            train_score+=classifier.score(X_train, Y_train)
            test_score+=classifier.score(X_test, Y_test)
        train_score/=10
        test_score/=10
        dict_models[classifier_name]={'model':classifier, 'train_score':train_score, 'test_score':test_score}
    return dict_models

def display_dict(dict_models, sort_by='test_score'):
    cls=[key for key in dict_models.keys()]
    test_s=[dict_models[key]['test_score'] for key in cls]
    training_s=[dict_models[key]['train_score'] for key in cls]
    
    df_=pd.DataFrame(data=np.zeros(shape=(len(cls), 3)), columns=['classifier', 'train_score', 'test_score'])
    for ii in range(0, len(cls)):
        df_.loc[ii, 'classifier']=cls[ii]
        df_.loc[ii, 'train_score']=training_s[ii]
        df_.loc[ii, 'test_score']=test_s[ii]
    
    display(df_.sort_values(by=sort_by, ascending=False))

In [245]:
dict_files = { "Overall set": r"C:\Users\anlan\workspace\gmu-hackathon\1528898320_9518383_train-fin.csv",
    "free sulfur removed": r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_freeSeulf.csv",
    "total sulfur removed": r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_totSulf.csv",
    "pH": r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_pH.csv",
    "pH and outliers":r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_pH_outliers.csv",
    "citric": r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_citric.csv",
    "citric and outliers": r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_citric_outliers.csv",
    "overall and outliers": r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_outliers.csv"}

In [246]:
def big_func(filepath, user_def_ratio, name):
    wine_df=pd.read_csv(filepath)
    wine_y_col='Quality'
    wine_x_cols=list(wine_df.columns.values)
    wine_x_cols.remove(wine_y_col)
    
    df_train, df_test, X_train, Y_train, X_test, Y_test=get_train_test(wine_df, wine_y_col, wine_x_cols, user_def_ratio)
    dict_models = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 8)
    print(name)
    display_dict(dict_models)
    #samples_leaf_testing(X_train, Y_train, X_test, Y_test)


In [247]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    #"Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=100),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    #"Neural Net": MLPClassifier(alpha = 1),
    #"Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Gaussian Process": GaussianProcessClassifier()
}

In [248]:
user_def_ratio=0.7
for key in dict_files:
    big_func(dict_files[key], user_def_ratio, key)



Overall set


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.110457,0.890184
2,Gradient Boosting Classifier,1.045744,0.860482
5,AdaBoost,0.944671,0.830246
6,QDA,0.828267,0.797937
3,Decision Tree,1.104574,0.796575
1,Linear SVM,0.965324,0.767105
0,Logistic Regression,0.704028,0.741228
7,Gaussian Process,1.061811,0.728916




free sulfur removed


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.110693,0.873554
2,Gradient Boosting Classifier,1.069253,0.798397
5,AdaBoost,0.952706,0.791997
3,Decision Tree,1.106925,0.781949
6,QDA,0.831925,0.775402
7,Gaussian Process,1.083193,0.714671
1,Linear SVM,1.07331,0.701266
0,Logistic Regression,0.733096,0.64135




total sulfur removed


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.110405,0.868994
2,Gradient Boosting Classifier,1.040478,0.843052
3,Decision Tree,1.104048,0.812005
5,AdaBoost,0.92674,0.809904
6,QDA,0.81793,0.780521
0,Logistic Regression,0.720137,0.690141
7,Gaussian Process,1.081793,0.683686
1,Linear SVM,1.053242,0.679343




pH


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.11012,0.857469
2,Gradient Boosting Classifier,1.012021,0.833227
5,AdaBoost,0.929665,0.82211
3,Decision Tree,1.101202,0.792868
6,QDA,0.811447,0.786757
1,Linear SVM,0.880138,0.782273
7,Gaussian Process,1.006879,0.765039
0,Logistic Regression,0.71848,0.731818




pH and outliers


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.110264,0.827653
2,Gradient Boosting Classifier,1.026381,0.810315
7,Gaussian Process,1.02347,0.799792
5,AdaBoost,0.965921,0.789972
3,Decision Tree,1.102638,0.781032
6,QDA,0.846592,0.745664
1,Linear SVM,0.875699,0.729279
0,Logistic Regression,0.732517,0.671171




citric


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.109948,0.918624
2,Gradient Boosting Classifier,0.994808,0.871333
3,Decision Tree,1.099481,0.830689
5,AdaBoost,0.922841,0.825196
1,Linear SVM,0.854007,0.815556
6,QDA,0.799601,0.798075
7,Gaussian Process,0.998079,0.78203
0,Logistic Regression,0.700348,0.733333




citric and outliers


Unnamed: 0,classifier,train_score,test_score
2,Gradient Boosting Classifier,1.008678,0.898988
4,Random Forest,1.110087,0.888912
5,AdaBoost,0.928513,0.851537
6,QDA,0.780002,0.808889
3,Decision Tree,1.100868,0.780171
0,Logistic Regression,0.681564,0.754864
7,Gaussian Process,1.005375,0.746259
1,Linear SVM,0.868901,0.729183




overall and outliers


Unnamed: 0,classifier,train_score,test_score
4,Random Forest,1.110285,0.887485
2,Gradient Boosting Classifier,1.028484,0.861
5,AdaBoost,0.950379,0.834582
3,Decision Tree,1.102848,0.829017
7,Gaussian Process,1.066363,0.785429
1,Linear SVM,0.952708,0.7725
6,QDA,0.826085,0.770958
0,Logistic Regression,0.736462,0.683333


In [215]:
from sklearn.metrics import roc_curve, auc

def samples_leaf_testing(X_train, Y_train, X_test, Y_test):
    min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
    train_results = []
    test_results = []
    for min_samples_leaf in min_samples_leafs:
       rf = RandomForestClassifier(min_samples_leaf=min_samples_leaf)
       rf.fit(X_train, Y_train)
       train_pred = rf.predict(X_train)
       false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, train_pred)
       roc_auc = auc(false_positive_rate, true_positive_rate)
       train_results.append(roc_auc)
       y_pred = rf.predict(X_test)
       false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, y_pred)
       roc_auc = auc(false_positive_rate, true_positive_rate)
       test_results.append(roc_auc)
    from matplotlib.legend_handler import HandlerLine2D
    line1, = plt.plot(min_samples_leafs, train_results, 'b', label="Train AUC")
    line2, = plt.plot(min_samples_leafs, test_results, 'r', label="Test AUC")
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel("AUC score")
    plt.xlabel("min samples leaf")
    plt.show()

In [256]:
rf=RandomForestClassifier(n_estimators=1000)
wine_train=pd.read_csv(r"C:\Users\anlan\workspace\gmu-hackathon\data_w-o_citric.csv")
wine_test=pd.read_csv(r"C:\Users\anlan\workspace\gmu-hackathon\1528897205_8734903_test-fin.csv")
wine_test=wine_test.drop(columns=['citric acid'])
wine_y_col='Quality'
wine_x_cols=list(wine_train.columns.values)
print(wine_train)
wine_x_cols.remove(wine_y_col)
df_train, df_test, X_train, Y_train, X_test, Y_test=get_train_test(wine_train, wine_y_col, wine_x_cols, 1.0)
rf.fit(X_train, Y_train)
results=rf.predict(wine_test)

     fixed acidity  volatile acidity  residual sugar  chlorides  \
0              7.4             0.700             1.9      0.076   
1              7.8             0.880             2.6      0.098   
2              7.8             0.760             2.3      0.092   
3             11.2             0.280             1.9      0.075   
4              7.4             0.700             1.9      0.076   
5              7.4             0.660             1.8      0.075   
6              7.9             0.600             1.6      0.069   
7              7.3             0.650             1.2      0.065   
8              7.8             0.580             2.0      0.073   
9              7.5             0.500             6.1      0.071   
10             6.7             0.580             1.8      0.097   
11             7.5             0.500             6.1      0.071   
12             5.6             0.615             1.6      0.089   
13             7.8             0.610             1.6      0.11

In [257]:
np.savetxt(r"C:\Users\anlan\workspace\gmu-hackathon\attempt3.txt", results, fmt="%d")