In [64]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
from IPython.core.display import display, HTML

In [37]:
def preprocess(file_name):
    results = []
    with open(file_name) as csvfile:
        wine_reader = csv.reader(csvfile, dialect="excel", delimiter=';')
        for row in wine_reader:
            results.append(row)
    name_features = results[0]
    #print(name_features)
    parsed_results = []
    for row in results[1:]:
        new_row = []
        for num in row:
            new_row.append(float(num))
        parsed_results.append(new_row)
    parsed_results = np.array(parsed_results)
    n_instances = parsed_results.shape[0]
    n_features = parsed_results.shape[1] - 1
    labels = parsed_results[:, n_features]
    instances = parsed_results[:, : n_features]
    return instances, labels
white_instances, white_labels = preprocess('winequality-white.csv')
red_instances, red_labels = preprocess('winequality-red.csv')
white_two_classes = np.array([0 if num <6 else 1 for num in white_labels])
red_two_classes = np.array([0 if num <6 else 1 for num in red_labels])

In [8]:
def three_classes(wine_labels):
    transformed = []
    for score in wine_labels:
        if score <= 4:
            transformed.append("bad")
        elif score <= 6:
            transformed.append("medium")
        else:
            transformed.append("good")
    return np.array(transformed)
def four_classes(wine_labels):
    transformed = []
    for score in wine_labels:
        if score <= 4:
            transformed.append("bad")
        elif score == 5:
            transformed.append("medium low")
        elif score == 6:
            transformed.append("medium high")
        else:
            transformed.append("good")
    return np.array(transformed)

In [9]:
white_three_classes = three_classes(white_labels)
red_three_classes = three_classes(red_labels)
white_four_classes = four_classes(white_labels)
red_four_classes = four_classes(red_labels)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression #logistic regression classifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix # for reporting
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # to normalize data (NN is very sensitive to this!)
from sklearn.model_selection import cross_val_score, GridSearchCV #BONUS

In [29]:
random_forest = RandomForestClassifier()
decision_tree = DecisionTreeClassifier()

In [46]:
def tune(params, clf, instances, labels):
    x_train, x_test, y_train, y_test = train_test_split(instances, labels)
    gs = GridSearchCV(clf, param_grid=params, cv=5)
    gs.fit(x_train, y_train)
    return gs.best_params_

In [60]:
params = {'max_features': [None, 'auto', 'sqrt', 'log2'], 'criterion': ['gini', 'entropy']}
print("DT classifier: \n")
print("best for white (2 classes)", tune(params, DecisionTreeClassifier(), white_instances, white_two_classes))
print("best for white (3 classes)", tune(params, DecisionTreeClassifier(), white_instances, white_three_classes))
print("best for white (4 classes)", tune(params, DecisionTreeClassifier(), white_instances, white_four_classes))
print("\n")
print("best for red (2 classes)", tune(params, DecisionTreeClassifier(), red_instances, red_two_classes))
print("best for red (3 classes)", tune(params, DecisionTreeClassifier(), red_instances, red_three_classes))
print("best for red (4 classes)", tune(params, DecisionTreeClassifier(), red_instances, red_four_classes))

DT classifier: 

best for white (2 classes) {'criterion': 'entropy', 'max_features': 'log2'}
best for white (3 classes) {'criterion': 'entropy', 'max_features': None}
best for white (4 classes) {'criterion': 'entropy', 'max_features': 'sqrt'}


best for red (2 classes) {'criterion': 'entropy', 'max_features': None}
best for red (3 classes) {'criterion': 'gini', 'max_features': 'sqrt'}
best for red (4 classes) {'criterion': 'gini', 'max_features': None}


In [49]:
rfc_params = {'max_features': [None, 'auto', 'sqrt', 'log2'], 'criterion': ['entropy', 'gini']}
print("tune(rfc_params, RandomForestClassifier(), white_instances, white_two_classes)

In [61]:
print("RF classifier: \n")
print("best for white (2 classes)", tune(params, RandomForestClassifier(), white_instances, white_two_classes))
print("best for white (3 classes)", tune(params, RandomForestClassifier(), white_instances, white_three_classes))
print("best for white (4 classes)", tune(params, RandomForestClassifier(), white_instances, white_four_classes))
print("\n")
print("best for red (2 classes)", tune(params, RandomForestClassifier(), red_instances, red_two_classes))
print("best for red (3 classes)", tune(params, RandomForestClassifier(), red_instances, red_three_classes))
print("best for red (4 classes)", tune(params, RandomForestClassifier(), red_instances, red_four_classes))

RF classifier: 

best for white (2 classes) {'criterion': 'entropy', 'max_features': 'sqrt'}
best for white (3 classes) {'criterion': 'gini', 'max_features': 'log2'}
best for white (4 classes) {'criterion': 'entropy', 'max_features': 'log2'}


best for red (2 classes) {'criterion': 'entropy', 'max_features': 'log2'}
best for red (3 classes) {'criterion': 'gini', 'max_features': 'sqrt'}
best for red (4 classes) {'criterion': 'gini', 'max_features': 'log2'}


In [86]:
def produce_report(clf_name, clf, instances, labels):
    x_train, x_test, y_train, y_test = train_test_split(instances, labels)
    clf = clf
    preds = clf.fit(x_train, y_train).predict(x_test)
    display(HTML('<h4>' + clf_name + "'s accuracy: </h4>"))
    print(accuracy_score(y_test, preds))
    display(HTML('<strong>Report</strong>:\n'))
    print(classification_report(y_test, preds), "\n")
    return preds


## Decision Tree Results

In [90]:
display(HTML('<h3>For White Wine: </h3>'))
dt_preds_white_two = produce_report('DT for white wine (2 classes)', DecisionTreeClassifier(max_features='log2', 
                                      criterion='entropy'), white_instances, white_two_classes)
dt_preds_white_three = produce_report('DT for white wine (3 classes)', DecisionTreeClassifier(max_features=None, 
                                      criterion='entropy'), white_instances, white_three_classes)
dt_preds_white_four = produce_report('DT for white wine (4 classes)', DecisionTreeClassifier(max_features='sqrt', 
                                      criterion='entropy'), white_instances, white_four_classes)
display(HTML('<h3>For Red Wine: </h3>'))
dt_preds_red_two = produce_report('DT for red wine (2 classes)', DecisionTreeClassifier(max_features=None, 
                                      criterion='entropy'), red_instances, red_two_classes)
dt_preds_red_three = produce_report('DT for red wine (3 classes)', DecisionTreeClassifier(max_features='sqrt', 
                                      criterion='gini'), red_instances, red_three_classes)
dt_preds_red_four = produce_report('DT for red wine (4 classes)', DecisionTreeClassifier(max_features=None, 
                                      criterion='gini'), red_instances, red_four_classes)


0.763265306122


             precision    recall  f1-score   support

          0       0.65      0.69      0.67       426
          1       0.83      0.80      0.82       799

avg / total       0.77      0.76      0.76      1225
 



0.772244897959


             precision    recall  f1-score   support

        bad       0.24      0.22      0.23        50
       good       0.62      0.60      0.61       278
     medium       0.84      0.86      0.85       897

avg / total       0.77      0.77      0.77      1225
 



0.639183673469


             precision    recall  f1-score   support

        bad       0.32      0.22      0.26        41
       good       0.62      0.67      0.64       275
medium high       0.66      0.65      0.66       541
 medium low       0.65      0.64      0.65       368

avg / total       0.64      0.64      0.64      1225
 



0.7675


             precision    recall  f1-score   support

          0       0.77      0.71      0.74       185
          1       0.77      0.82      0.79       215

avg / total       0.77      0.77      0.77       400
 



0.805


             precision    recall  f1-score   support

        bad       0.12      0.09      0.10        23
       good       0.61      0.59      0.60        58
     medium       0.87      0.90      0.88       319

avg / total       0.79      0.81      0.80       400
 



0.5925


             precision    recall  f1-score   support

        bad       0.14      0.19      0.16        16
       good       0.55      0.53      0.54        57
medium high       0.54      0.59      0.57       145
 medium low       0.72      0.65      0.68       182

avg / total       0.60      0.59      0.60       400
 



## Random Forest Results

In [91]:
display(HTML('<h3>For White Wine: </h3>'))
rf_preds_white_two = produce_report('RF for white wine (2 classes)', RandomForestClassifier(max_features='sqrt', 
                                      criterion='entropy'), white_instances, white_two_classes)
rf_preds_white_three = produce_report('RF for white wine (3 classes)', RandomForestClassifier(max_features='log2', 
                                      criterion='gini'), white_instances, white_three_classes)
rf_preds_white_four = produce_report('RF for white wine (4 classes)', DecisionTreeClassifier(max_features='log2', 
                                      criterion='entropy'), white_instances, white_four_classes)
display(HTML('<h3>For Red Wine: </h3>'))
rf_preds_red_two = produce_report('RF for red wine (2 classes)', RandomForestClassifier(max_features='log2', 
                                      criterion='entropy'), red_instances, red_two_classes)
rf_preds_red_three = produce_report('RF for red wine (3 classes)', RandomForestClassifier(max_features='sqrt', 
                                      criterion='gini'), red_instances, red_three_classes)
rf_preds_red_four = produce_report('RF for red wine (4 classes)', RandomForestClassifier(max_features='log2', 
                                      criterion='gini'), red_instances, red_four_classes)


0.806530612245


             precision    recall  f1-score   support

          0       0.71      0.70      0.71       410
          1       0.85      0.86      0.86       815

avg / total       0.81      0.81      0.81      1225
 



0.834285714286


             precision    recall  f1-score   support

        bad       0.50      0.12      0.20        41
       good       0.68      0.64      0.66       255
     medium       0.87      0.92      0.90       929

avg / total       0.82      0.83      0.82      1225
 



0.61306122449


             precision    recall  f1-score   support

        bad       0.25      0.30      0.27        40
       good       0.59      0.63      0.61       260
medium high       0.65      0.63      0.64       555
 medium low       0.62      0.61      0.62       370

avg / total       0.62      0.61      0.61      1225
 



0.8375


             precision    recall  f1-score   support

          0       0.81      0.86      0.84       192
          1       0.86      0.82      0.84       208

avg / total       0.84      0.84      0.84       400
 



0.8725


             precision    recall  f1-score   support

        bad       0.00      0.00      0.00        10
       good       0.70      0.55      0.62        56
     medium       0.90      0.95      0.93       334

avg / total       0.85      0.87      0.86       400
 



0.6625


             precision    recall  f1-score   support

        bad       0.20      0.08      0.11        13
       good       0.71      0.52      0.60        56
medium high       0.61      0.68      0.64       158
 medium low       0.72      0.73      0.73       173

avg / total       0.66      0.66      0.66       400
 

