# Program that predicts a wine quality

In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
wine = pd.read_csv("winedataset.csv")
wine.head()

In [None]:
wine.shape

In [None]:
wine.dtypes

In [None]:
wine.describe()

In [None]:
wine.info()

In [None]:
wine.columns

In [None]:
# There was a weird name for Proline with a lot of spaces trailing it, removing it.
wine.rename(columns={'Proline    ': 'Proline'}, inplace=True)
print(wine.columns)

In [None]:
cols = ['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
       'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue', 'diluted wines',
       'Proline']
nrow=4
ncol=4
fig, axs = plt.subplots(nrow,ncol,figsize=(20,20))
for i in range(nrow):
    for j in range(ncol):
        ax = axs[i][j]
        r = i*ncol+j
        if r < (nrow*ncol):
            #sns.countplot(wine[cols[r]], hue=wine['Class'],ax=ax)
            sns.boxplot(wine[cols[r]],ax=ax, orient='v')
plt.tight_layout()

In [None]:
nrow=4
ncol=4
fig, axs = plt.subplots(nrow,ncol,figsize=(20,20))
for i in range(nrow):
    for j in range(ncol):
        ax = axs[i][j]
        r = i*ncol+j
        if r < (nrow*ncol):
            sns.countplot(wine[cols[r]], hue=wine['Class'],ax=ax)
            #sns.boxplot(wine[cols[r]],ax=ax, orient='v')
plt.tight_layout()

In [None]:
# Correlation among columns
corr = wine.corr()
corr['Class'].sort_values(ascending=False)

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(corr,annot=True, linewidth=0)

In [None]:
# Checking null values
wine.isnull().sum()

In [None]:
# Reaming all the columns to make it consistent and without blank spaces.
wine.rename(columns = {'Class': 'class', 'Alcohol': 'alcohol', 'Malic acid': 'malic_acid', 'Ash': 'ash', 
        'Alcalinity of ash': 'alcalinity_of_ash','Magnesium': 'magnesium' , 'Total phenols': 'total_phenols',
        'Flavanoids': 'flavanoids' ,'Nonflavanoid phenols': 'nonflavanoid_phenols', 'Proanthocyanins': 'proanthocyanins', 
        'Color intensity': 'color_intensity','Hue': 'hue' , 'diluted wines': 'diluted_wines', 'Proline': 'proline'}
        , inplace=True)
wine.head()

In [None]:
wine['class'].unique()

In [None]:
# see how many counts are there for each wine class
wine['class'].value_counts()

In [None]:
# create a list of our conditions
conditions = [
    (wine['class'] == 1), 
    (wine['class'] == 2), 
    (wine['class'] == 3)
    ]

# create a list of the values we want to assign for each condition
values = ['tier_1', 'tier_2', 'tier_3']

# create a new column and use np.select to assign values to it using our lists as arguments
wine['quality'] = np.select(conditions, values)
wine.head()

In [None]:
plt.scatter(wine['class'], wine['alcohol'])

In [69]:
x = wine.drop(['class', 'quality'], axis="columns")
y = wine['class']
x.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,diluted_wines,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [94]:
print(y.head())
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int64


In [107]:
def classify(model, x, y):
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.3, random_state=101)
    # Model training
    model.fit(x_train, y_train)
    print("Accuracy score is:",model.score(x_train, y_train))
    #Cross validation
    score = cross_val_score(model, x,y, cv=5)
    print("Cross validation score is:",np.mean(score))
    return np.mean(score)

In [109]:
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3 = ExtraTreesClassifier()
model4 = SVC(kernel='rbf')
model5 = SVC(kernel='poly')
model6 = SVC(kernel='linear')
model7 = SVC(kernel='sigmoid')

model = [model1, model2, model3, model4, model5, model6, model7]
model_names = ['DecisionTree Model', 'KNN Model', 'ExtraTrees Model', 'SVC rbf Model',
              'SVC poly Model', 'SVC linear Model', 'SVC sigmoid Model']
c_scores = []
for i in model:
    c_scores.append(classify(i, x, y))

Accuracy score is: 1.0
Cross validation score is: 0.8985714285714286
Accuracy score is: 0.8145161290322581
Cross validation score is: 0.6912698412698413
Accuracy score is: 1.0
Cross validation score is: 0.9888888888888889
Accuracy score is: 0.7580645161290323
Cross validation score is: 0.6634920634920635
Accuracy score is: 0.6612903225806451
Cross validation score is: 0.6522222222222223
Accuracy score is: 0.9919354838709677
Cross validation score is: 0.961111111111111
Accuracy score is: 0.1935483870967742
Cross validation score is: 0.1638095238095238


In [110]:
c_scores

[0.8985714285714286,
 0.6912698412698413,
 0.9888888888888889,
 0.6634920634920635,
 0.6522222222222223,
 0.961111111111111,
 0.1638095238095238]

In [112]:
print("Best model for wine is:",model_names[c_scores.index(max(c_scores))])

Best model for wine is: ExtraTrees Model


In [116]:
def pred(model, x, y):
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.3, random_state=101)
    # Model training
    pred.predict(x_test)
    print("Accuracy score is:",model.score(x_test, y_test))
    return model.score(x_test, y_test)

In [117]:
pred_scores = []
for i in model:
    pred_scores.append(classify(i, x, y))

Accuracy score is: 1.0
Cross validation score is: 0.8709523809523809
Accuracy score is: 0.8145161290322581
Cross validation score is: 0.6912698412698413
Accuracy score is: 1.0
Cross validation score is: 0.9888888888888889
Accuracy score is: 0.7580645161290323
Cross validation score is: 0.6634920634920635
Accuracy score is: 0.6612903225806451
Cross validation score is: 0.6522222222222223
Accuracy score is: 0.9919354838709677
Cross validation score is: 0.961111111111111
Accuracy score is: 0.1935483870967742
Cross validation score is: 0.1638095238095238


In [118]:
print("Best model for wine is:",model_names[pred_scores.index(max(pred_scores))])

Best model for wine is: ExtraTrees Model
