# Vivino predictor of wine quality based on classification algorithms

In [19]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet

In [2]:
df = pd.read_csv('winequality-red.csv')

# Cleaning and EDA

In [3]:
# Scale the data
scaler = StandardScaler()

In [4]:
X = df
y = X.pop('quality')

In [5]:
Xs = scaler.fit_transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(Xs,y,test_size=0.2)

# Model Development - Classification models
In theory, you could cluster the data to better predict it due to the fact that there are multiple factors that make up the quality of the wine. Tbis may be easily described through a forest or a decision tree

In [7]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train,y_train)
tree_clf.score(X_train,y_train)
#Major overfit - worth using?

1.0

In [8]:
r4_clf = RandomForestClassifier()
r4_clf.fit(X_train,y_train)
r4_clf.score(X_train,y_train)
#again a major overfit - worth using?

1.0

In [31]:
bagg_clf = BaggingClassifier()
bag = bagg_clf.fit(X_train,y_train)
bagg_clf.score(X_train,y_train)
#Sweet - this is a good model.

0.982017200938233

In [10]:
b00st_clf = AdaBoostClassifier()
b00st_clf.fit(X_train,y_train)
b00st_clf.score(X_train,y_train)
# Meh - not ideal

0.5527756059421423

In [11]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train,y_train)
knn_clf.score(X_train,y_train)
# A bit better than the last one but still not so great

0.7028928850664582

In [12]:
log_reg = LogisticRegression(max_iter=1000000)
log_reg.fit(X_train,y_train)
log_reg.score(X_train,y_train)
#Woof

0.5957779515246286

In [13]:
vote_clf = VotingClassifier(estimators=[('lr', log_reg), ('knn', knn_clf),('bagg', bagg_clf)],voting='soft')
vote_clf.fit(X_train,y_train)
vote_clf.score(X_train,y_train)
#Good mix of the models, successful vote.

0.8717748240813136

In [14]:
vote_clf = VotingClassifier(estimators=[('r4', r4_clf), ('tree', tree_clf), ('knn', knn_clf),])
vote_clf.fit(X_train,y_train)
vote_clf.score(X_train,y_train)
# Still overfitting...

1.0

In [24]:
feature_importances = np.mean([
    tree.feature_importances_ for tree in bagg_clf.estimators_
], axis=0)

In [28]:
feature_importances

#Appears as though the important features in this model are the volatile acididty in a bottle of wine along with the alcohol percentage and 
#sulphate content. These three items are the greatest determinents in whehther or not a bottle of wine is seen as a good bottle. (2,10,11)

array([0.08504467, 0.10342894, 0.0677458 , 0.07124033, 0.07510161,
       0.05368326, 0.09642202, 0.07879119, 0.07446533, 0.11048265,
       0.1835942 ])