In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from collections import Counter

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Read-in data

In [4]:
data_path = "data/"
raw_df = pd.read_csv(data_path + "PreprocessedData.csv")
raw_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,targets
0,-0.523281,1.074692,-1.376063,-0.645049,-0.172968,-0.450943,-0.30841,0.766555,1.408289,-0.621063,-0.990272,1
1,-0.249565,2.166691,-1.376063,0.938581,1.364655,1.148185,0.966059,0.142986,-0.942364,0.410431,-0.580112,1
2,-0.249565,1.438691,-1.154012,0.259883,0.945303,0.005951,0.463995,0.2677,-0.487399,0.152557,-0.580112,1
3,2.077023,-1.473307,1.732651,-0.645049,-0.24286,0.234398,0.695717,0.891269,-1.245674,-0.449148,-0.580112,1
4,-0.523281,1.074692,-1.376063,-0.645049,-0.172968,-0.450943,-0.30841,0.766555,1.408289,-0.621063,-0.990272,1


In [5]:
df = raw_df.copy()

# Split Data

In [6]:
# Isolate features
features = df.iloc[:,:-1]
features

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.523281,1.074692,-1.376063,-0.645049,-0.172968,-0.450943,-0.308410,0.766555,1.408289,-0.621063,-0.990272
1,-0.249565,2.166691,-1.376063,0.938581,1.364655,1.148185,0.966059,0.142986,-0.942364,0.410431,-0.580112
2,-0.249565,1.438691,-1.154012,0.259883,0.945303,0.005951,0.463995,0.267700,-0.487399,0.152557,-0.580112
3,2.077023,-1.473307,1.732651,-0.645049,-0.242860,0.234398,0.695717,0.891269,-1.245674,-0.449148,-0.580112
4,-0.523281,1.074692,-1.376063,-0.645049,-0.172968,-0.450943,-0.308410,0.766555,1.408289,-0.621063,-0.990272
...,...,...,...,...,...,...,...,...,...,...,...
1185,-1.344430,0.468025,-0.931961,-0.418816,0.805519,1.947749,0.077792,-1.041795,0.953324,-0.449148,0.137668
1186,-1.549717,0.164692,-0.820936,0.033650,-1.151455,2.747313,0.348134,-0.904610,1.484117,1.098093,0.855447
1187,-1.276001,-0.077974,-0.654398,0.259883,-0.172968,1.605079,-0.076689,-0.517997,0.725841,1.012136,0.650368
1188,-1.549717,0.741025,-0.709910,-0.418816,-0.242860,1.947749,0.077792,-0.686361,1.863254,0.668304,-0.169952


In [7]:
# Isolate Targets
targets = df["targets"]
targets

0       1
1       1
2       1
3       1
4       1
       ..
1185    1
1186    1
1187    1
1188    1
1189    1
Name: targets, Length: 1190, dtype: int64

In [10]:
# Split Data
features_train, features_test, targets_train, targets_test = train_test_split(features, targets, test_size=0.2, random_state=42)

In [13]:
# Look at how many samples are in train and test datasets. 
train_counter = Counter(targets_train)
test_counter = Counter(targets_test)

print("Train samples:",features_train.shape[0])
print("Below Average Wines:", train_counter[0])
print("Average Wines:", train_counter[1])
print("Above Average Wines:", train_counter[2])
print("\n-------------------------------\n")
print("Test samples:",features_test.shape[0])
print("Below Average Wines:", test_counter[0])
print("Average Wines:", test_counter[1])
print("Above Average Wines:", test_counter[2])

Train samples: 952
Below Average Wines: 31
Average Wines: 798
Above Average Wines: 123

-------------------------------

Test samples: 238
Below Average Wines: 5
Average Wines: 210
Above Average Wines: 23


# Machine Learning

In [None]:
hyper_parameters = {"n_estimators": [2,50,100,150],
                    "base_estimator":[DecisionTreeClassifier(max_depth=2),DecisionTreeClassifier(max_depth=5)]
                   }
scorer = make_scorer(f1_score, average="micro") # micro calculates metrics globally (not an average like macro)
grid = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid=hyper_parameters, scoring=scorer)
fit_grid = grid.fit(features_train,targets_train)
clf = fit_grid.best_estimator_

print("Best Hyper-parameters: \n", grid.best_params_)
print("\n--------------------------------------------------------------------------------------------------\n")
print("Model after hyper-parameter tuning: \n",grid.best_estimator_)

In [None]:
# Cross-Validate the best classifier
scores_train = cross_val_score(clf, features, targets, cv=10, scoring=scorer)
print("\nf-scores: ",scores_train)
print("\nAverage f-score: ",scores_train.mean())

We will still calculate accuracy, but it will not be the most useful metric because our data is unbalanced. The majority of our samples fall in the "Average"(5-6) class. So, we will rely on the f-score

In [None]:
# How well our model fits the training data
train_predict = clf.predict(features_train)
train_acc = accuracy_score(targets_train,train_predict)
train_f_score = f1_score(targets_train, train_predict, average="micro")
print("Accuracy:",np.round(train_acc,3))
print("f-score:",np.round(train_f_score,3))

### Check Performance of Classifier

In [None]:
predict = clf.predict(features_test)
acc = accuracy_score(targets_test,predict)
f_score = f1_score(targets_test, predict, average="micro")

In [None]:
print("Accuracy:",np.round(acc,3))
print("f-score:",np.round(f_score,3))

Note: When I refer to "accuracy" in the below text, I am talking about both the accuracy and the f-score. A more appropriate term might be "performance", but that is a bit ambiguous when siting the numbers.

I'm having trouble determining if I should use the hyperparameters that give the most accurate results (0.92) but with a large difference between the train and test accuracies (|0.999 - 0.92| = 0.079) OR if I should use the model with lower accuracy (0.845) but with the lower difference (|0.848 - 0.857| = 0.009).

I can't figure out why the difference between the training accuracy and testing accuracy increases, while the testing accuracy is also increasing. It appears to be overfitting to the training data with an accuracy of 0.999, but the testing accuracy didn't decrease, it increased. This makes me wonder if it has something to do with how unbalanced the dataset is. Perhaps the abundance of "Average" wine is making the model look better than it actually is.

I've decided to use the more accurate model until I can think of the reason as to why the difference grew larger, but the testing accuracy still increased.

## Check Which Features Are Most Important

In [None]:
importance_features = clf.feature_importances_
print(importance_features)

In [None]:
column_names = data_no_outliers.columns.values[:-1]
summary_df = pd.DataFrame(column_names,columns=["Features"])
summary_df["Importance"] = importance_features
summary_df = summary_df.sort_values(by=["Importance"], ascending=False).reset_index(drop=True)
summary_df

In [None]:
fig = plt.figure(figsize=(20,10))
sns.barplot(x="Features",y="Importance", color="dodgerblue" ,data=summary_df)

Final thoughts:

I'm skeptical about the fact that the accuracy and f-score returned the same exact number because there is much more "average" quality wine, so I would have expected the accuracy to be higher than the f-score. If I had the opportunity, I'd like to update this model with more data on the "Below Average" and "Above Average" wines. I am also skeptical about the problem regarding the test and training accuracies that I outlined above in the "Check Performance of Classifier" section. According to my model, the most important features are sulphates, volatile acidity, and alcohol content. This information could be used by wineries to make higher quality wine