In [77]:
import numpy as np
import matplotlib.pyplot as plt
import csv

In [56]:
def preprocess(file_name):
    results = []
    with open(file_name) as csvfile:
        wine_reader = csv.reader(csvfile, dialect="excel", delimiter=';')
        for row in wine_reader:
            results.append(row)
    name_features = results[0]
    #print(name_features)
    parsed_results = []
    for row in results[1:]:
        new_row = []
        for num in row:
            new_row.append(float(num))
        parsed_results.append(new_row)
    parsed_results = np.array(parsed_results)
    n_instances = parsed_results.shape[0]
    n_features = parsed_results.shape[1] - 1
    labels = parsed_results[:, n_features]
    instances = parsed_results[:, : n_features]
    return instances, labels
white_instances, white_labels = preprocess('winequality-white.csv')
red_instances, red_labels = preprocess('winequality-red.csv')
transformed_white_labels = np.array([0 if num <6 else 1 for num in white_labels])
transformed_red_labels = np.array([0 if num <6 else 1 for num in red_labels])

In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression #logistic regression classifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix # for reporting

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # to normalize data (NN is very sensitive to this!)
from sklearn.model_selection import cross_val_score, GridSearchCV #BONUS

In [58]:
X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(white_instances, transformed_white_labels)

In [57]:
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(red_instances, transformed_red_labels)

In [66]:
random_forest = RandomForestClassifier()
decision_tree = DecisionTreeClassifier()

In [84]:
random_forest_red = random_forest.fit(X_train_red, y_train_red)
decision_tree_red = decision_tree.fit(X_train_red, y_train_red)
def report_accuracy(moment, clf_name, clf, x_train, y_train):
    cv_error = cross_val_score(clf, x_train, y_train, scoring="accuracy", cv=10)
    print(moment + " customisation, average accuracy of " + clf_name + " : ", cv_error.mean(), " standard deviation: ", np.std(cv_error))
report_accuracy("Before", "RFC for red wine", random_forest_red, X_train_red, y_train_red)  
report_accuracy("Before", "DT for red wine", decision_tree_red, X_train_red, y_train_red) 

Before customisation, average accuracy of RFC for red wine :  0.792285459638  standard deviation:  0.0309506454445
Before customisation, average accuracy of DT for red wine :  0.749762599255  standard deviation:  0.0283700190396


In [74]:
random_forest_white = random_forest.fit(X_train_white, y_train_white)
decision_tree_white = decision_tree.fit(X_train_white, y_train_white)
report_accuracy("Before", "RFC for white wine", random_forest_white, X_train_white, y_train_white)
report_accuracy("Before", "DT for white wine", decision_tree_white, X_train_white, y_train_white)


Before customisation, average accuracy of RFC for white wine :  0.816215961094  standard deviation:  0.0149786910191
Before customisation, average accuracy of DT for white wine :  0.778923197534  standard deviation:  0.0158219755864


In [100]:
tree_params = {'max_features': [None, 'auto', 'sqrt', 'log2']}
gs = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_params, cv=10)
gs.fit(X_train_red, y_train_red)
gs.best_params_

{'max_features': 'sqrt'}

In [101]:
rfc_params = {'max_features': [None, 'auto', 'sqrt', 'log2'], 'criterion': ['entropy', 'gini']}
gs2 = GridSearchCV(RandomForestClassifier(), param_grid=rfc_params, cv=10)
gs2.fit(X_train_red, y_train_red)
gs2.best_params_

{'criterion': 'entropy', 'max_features': 'log2'}

In [104]:
rvc_customised = RandomForestClassifier(max_features='log2', criterion='entropy')
def produce_report(clf_name, clf, x_train, y_train, x_test, y_test):
    preds = clf.fit(x_train, y_train).predict(x_test)
    print(clf_name + "'s accuracy: ", accuracy_score(y_test, preds))
    print("Report for " + clf_name + " :\n", classification_report(y_test, preds))
    return preds
rfc_preds_white = produce_report('RFC for white wine', rvc_customised, X_train_white, y_train_white, X_test_white, y_test_white)
rfc_preds_red = produce_report('RFC for red wine', rvc_customised, X_train_red, y_train_red, X_test_red, y_test_red)

RFC for white wine's accuracy:  0.819591836735
Report for RFC for white wine :
              precision    recall  f1-score   support

          0       0.75      0.72      0.74       426
          1       0.85      0.87      0.86       799

avg / total       0.82      0.82      0.82      1225

RFC for red wine's accuracy:  0.795
Report for RFC for red wine :
              precision    recall  f1-score   support

          0       0.73      0.85      0.79       179
          1       0.86      0.75      0.80       221

avg / total       0.80      0.80      0.80       400

