## Import Basic Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Activate seaborn's default theme for the figures drawn in this notebook.
sb.set()

## Import Dataset 

In [None]:
# Read the red-wine quality table from the working directory and preview
# its first five rows.
trainData = pd.read_csv(filepath_or_buffer='winequality-red.csv')
trainData.head(n=5)

In [None]:
# Summary statistics for the columns of the loaded dataset.
trainData.describe()

In [None]:
# Column labels of the dataset as a plain Python list.
trainData.columns.tolist()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
trainData.shape #data size

In [None]:
# Number of missing entries in each column.
trainData.isna().sum()

## Data for Max Quality

In [None]:
# Count the wines whose quality score is 8 (the top score examined in this
# section). A vectorised comparison replaces the manual index loop, so the
# result no longer depends on the frame having exactly 1599 rows or on its
# index labels running 0..1598.
count = int((trainData['quality'] == 8).sum())
print(count)

In [None]:
# Rows of the dataset whose quality score equals 8.
is_top_quality = trainData['quality'] == 8
maxQuality = trainData.loc[is_top_quality]
maxQuality

In [None]:
# Summary statistics of every column, restricted to the quality-8 wines.
maxQuality.describe() #statistics for factors for max quality

In [None]:
# One histogram per column, restricted to the quality-8 wines, to see which
# values of each factor occur among the best wines.
# The number of axes is derived from the frame instead of being hard-coded to
# 12, so the grid always matches the columns being plotted.
plot_columns = list(trainData.columns)
f, axes = plt.subplots(len(plot_columns), figsize=(10, 50), squeeze=False)
axes = axes.ravel()  # squeeze=False returns an (n, 1) grid; flatten it to 1-D

for ax, var in zip(axes, plot_columns):
    sb.histplot(data = maxQuality[var], ax = ax)

## Data Visualisation and Correlation

In [None]:
# Horizontal box plots of all columns on one 16x8 figure.
f = plt.figure(figsize=(16, 8))
sb.boxplot(data = trainData, orient = "h")

In [None]:
# For every column draw a box plot, a histogram and a violin plot side by side.
# The number of rows follows the number of columns in the frame instead of a
# hard-coded 12; squeeze=False keeps `axes` two-dimensional for any row count.
plot_columns = list(trainData.columns)
f, axes = plt.subplots(len(plot_columns), 3, figsize=(30, 50), squeeze=False)

for row, var in enumerate(plot_columns):
    sb.boxplot(data = trainData[var], orient = "h", ax = axes[row, 0])
    sb.histplot(data = trainData[var], ax = axes[row, 1])
    sb.violinplot(data = trainData[var], orient = "h", ax = axes[row, 2])
    

In [None]:
# Annotated heatmap of the pairwise correlation matrix; the colour scale is
# pinned to [-1, 1] and each cell shows the coefficient to two decimals.
f = plt.figure(figsize=(12, 12))
sb.heatmap(trainData.corr(), vmin = -1, vmax = 1, annot = True, fmt = ".2f")

In [None]:
# Correlation of each listed column with the quality score
# (the 'quality' column of the correlation matrix).
trainData.loc[:, [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]].corr().loc[:, 'quality']

In [None]:
x = ['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

# Print the factors whose correlation with quality is stronger than +-0.20.
# The coefficient is computed once per column, and the target itself is
# skipped: 'quality' correlates perfectly with itself and is not a factor.
for i in x:
    if i == 'quality':
        continue
    if abs(trainData[i].corr(trainData['quality'])) > 0.20:
        print(i)

In [None]:
# Keep the four chosen factors plus the target column.
# .copy() gives finalData its own data: later cells overwrite its 'quality'
# column, and assigning into a bare slice of trainData triggers pandas'
# SettingWithCopyWarning.
finalData = trainData[['volatile acidity', 'citric acid', 'sulphates', 'alcohol', 'quality']].copy()
finalData

In [None]:
# Pairwise scatter plots (with per-column distributions) of the selected columns.
sb.pairplot(data = finalData) #visualisation of final data 

### 4.3 Logistic Regression

In [None]:
# Import essential models and functions from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Extract Response and Predictors
y = pd.DataFrame(finalData["quality"])
X = pd.DataFrame(finalData[["volatile acidity", "citric acid", "sulphates","alcohol"]])

# Split the Dataset into Train and Test
# 25% of data is used for testing dataset.
# random_state pins the shuffle so a re-run of the notebook reproduces the
# same split (and therefore the same reported accuracies).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Check the sample sizes
print("Predictor Train Set :", X_train.shape, "Predictor Test Set :", X_test.shape)
print("Response Train Set :", y_train.shape, "Response Test Set :", y_test.shape)
print()

In [None]:
# Standardise the predictors. The scaler learns its mean/variance from the
# training split only and is then applied unchanged to the test split;
# re-fitting it on the test data would scale the two splits differently and
# leak test-set statistics into the evaluation.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Logistic Regression using Train Data
logreg = LogisticRegression()         # create the Logistic regression object
logreg.fit(X_train, y_train.values.ravel())        # train on a flattened 1-D label array

#### Accuracy 

In [None]:
# Predict both splits with the fitted logistic regression, then report the
# fraction of correctly classified samples for each.
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Training set: ", accuracy_train)
print("Test set: ", accuracy_test)

### 4.4 K-Nearest Neighbour

In [None]:
# Import essential models and functions from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Extract Response and Predictors
y = pd.DataFrame(finalData["quality"])
X = pd.DataFrame(finalData[["volatile acidity", "citric acid", "sulphates","alcohol"]])

# Split the Dataset into Train and Test (25% held out for testing).
# random_state pins the shuffle so a re-run reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Check the sample sizes
print("Predictor Train Set :", X_train.shape, "Predictor Test Set :", X_test.shape)
print("Response Train Set :", y_train.shape, "Response Test Set :", y_test.shape)
print()

In [None]:
# Fit a k-nearest-neighbours classifier with default settings; the
# one-column label frame is flattened to a 1-D array first.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.to_numpy().ravel())

#### Accuracy 

In [None]:
# Predict both splits with the fitted k-NN model, then report the fraction
# of correctly classified samples for each.
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Training set: ", accuracy_train)
print("Test set: ", accuracy_test)

In [None]:
# Data type currently stored in the target column.
finalData['quality'].dtype

In [None]:
# Cast the target to pandas' nullable integer dtype.
# assign() builds a new frame instead of writing into finalData in place, so
# no SettingWithCopyWarning is raised when finalData is a slice of trainData.
finalData = finalData.assign(quality=finalData['quality'].astype('Int64'))
print(finalData['quality'])

In [None]:
# Turn the numeric score into two classes: (0, 6.5] -> 'poor', (6.5, 8] -> 'good'.
# The bins are cut from the untouched numeric scores in trainData (aligned on
# the shared index), so re-running this cell gives the same result instead of
# failing on an already-categorical column. assign() also avoids writing into
# a slice in place.
finalData = finalData.assign(
    quality=pd.cut(trainData['quality'], bins=[0, 6.5, 8], labels=['poor', 'good'])
)

In [None]:
# Number of rows per distinct value of 'quality', ordered by the value rather than by frequency.
finalData['quality'].value_counts().sort_index()

In [None]:
# Predictors: every column except the target.
x = finalData.drop(columns='quality')
# Response: the target column stored with object dtype.
y = finalData['quality'].astype('object')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Hold out 25% of the rows for testing. random_state pins the shuffle so a
# re-run of the notebook reproduces the same split and the same tree.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)
# Shallow tree (two levels of splits) that is small enough to visualise.
dectree = DecisionTreeClassifier(max_depth = 2)
dectree.fit(x_train,y_train)

In [None]:
from sklearn.tree import plot_tree
# Draw the fitted tree. feature_names is passed as a plain list: some
# scikit-learn releases validate this argument and reject a pandas Index.
fig, ax = plt.subplots(figsize=(12, 12))
out = plot_tree(dectree,
          feature_names = list(x_train.columns),
          class_names = [str(label) for label in dectree.classes_],
          filled=True)

In [None]:
# Fit a depth-3 decision tree on the training split.
dectree = DecisionTreeClassifier(max_depth=3)
dectree.fit(x_train, y_train)

# Class predictions for both splits.
y_train_pred = dectree.predict(x_train)
y_test_pred = dectree.predict(x_test)

# Classification accuracy on the training data.
train_accuracy = dectree.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", train_accuracy)
print()

# Classification accuracy on the held-out data.
test_accuracy = dectree.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", test_accuracy)
print()

# Confusion matrices: train on the left axis, test on the right.
train_confusion = confusion_matrix(y_train, y_train_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(train_confusion, annot=True, fmt=".0f",
           annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(test_confusion, annot=True, fmt=".0f",
           annot_kws={"size": 18}, ax=axes[1])

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# split (and every score derived from it) is reproducible across re-runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

In [None]:
# Random forest of 200 trees. random_state fixes the bootstrap samples and
# feature draws so the fitted forest is the same on every run.
rfc = RandomForestClassifier(n_estimators = 200, random_state = 42)
rfc.fit(x_train,y_train)

In [None]:
# Evaluate the random forest (`rfc`) fitted on this split.
# This cell previously re-trained and scored a depth-3 decision tree, which
# left the forest unevaluated.

# Predict Response corresponding to Predictors
y_train_pred = rfc.predict(x_train)
y_test_pred = rfc.predict(x_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", rfc.score(x_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", rfc.score(x_test, y_test))
print()

# Plot the Confusion Matrix for Train (left) and Test (right)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(confusion_matrix(y_train, y_train_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[0])
sb.heatmap(confusion_matrix(y_test, y_test_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[1])

In [None]:
# Import essential models and functions from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Extract Response and Predictors
y = pd.DataFrame(finalData["quality"])
X = pd.DataFrame(finalData[["volatile acidity", "citric acid", "sulphates","alcohol"]])

# Split the Dataset into Train and Test (25% held out for testing).
# random_state pins the shuffle so a re-run reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Check the sample sizes
print("Predictor Train Set :", X_train.shape, "Predictor Test Set :", X_test.shape)
print("Response Train Set :", y_train.shape, "Response Test Set :", y_test.shape)
print()

# Standardise the predictors. The scaler is fitted on the training split only
# and applied unchanged to the test split; re-fitting it on the test data
# would scale the two splits differently and leak test-set statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Logistic Regression using Train Data
logreg = LogisticRegression()         # create the Logistic regression object
logreg.fit(X_train, y_train.values.ravel())        # train on a flattened 1-D label array

In [None]:
# Predict both splits with the fitted logistic regression, then report the
# fraction of correctly classified samples for each.
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Training set: ", accuracy_train)
print("Test set: ", accuracy_test)

In [None]:
# Import essential models and functions from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Extract Response and Predictors
y = pd.DataFrame(finalData["quality"])
X = pd.DataFrame(finalData[["volatile acidity", "citric acid", "sulphates","alcohol"]])

# Split the Dataset into Train and Test (25% held out for testing).
# random_state pins the shuffle so a re-run reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Check the sample sizes
print("Predictor Train Set :", X_train.shape, "Predictor Test Set :", X_test.shape)
print("Response Train Set :", y_train.shape, "Response Test Set :", y_test.shape)
print()

# k-nearest-neighbours with default settings, trained on a flattened 1-D label array
knn = KNeighborsClassifier()
knn.fit(X_train,y_train.values.ravel())



In [None]:
# Predict both splits with the fitted k-NN model, then report the fraction
# of correctly classified samples for each.
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Training set: ", accuracy_train)
print("Test set: ", accuracy_test)

# Confusion matrices: train on the left axis, test on the right.
train_confusion = confusion_matrix(y_train, y_train_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(train_confusion, annot=True, fmt=".0f",
           annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(test_confusion, annot=True, fmt=".0f",
           annot_kws={"size": 18}, ax=axes[1])

#### New Knowledge: Support Vector Machine and Naive Bayes Classifiers

In [None]:
#new knowledge using SVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Support-vector classifier created with its default hyperparameters.
svc = SVC()

In [None]:
# Report which columns (if any) contain missing values. As a bare expression
# in the middle of the cell the result was computed but never shown.
print("Columns with missing values:", list(finalData.columns[finalData.isna().any()]))

# y_train is a one-column DataFrame; flatten it to the 1-D label array the
# estimator expects, as the other model fits in this notebook do.
svc.fit(X_train, y_train.values.ravel())

In [None]:
# Predict both splits with the fitted support-vector classifier, then report
# the fraction of correctly classified samples for each.
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)

accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print("Training set: ", accuracy_train)
print("Test set: ", accuracy_test)

# Confusion matrices: train on the left axis, test on the right.
train_confusion = confusion_matrix(y_train, y_train_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(train_confusion, annot=True, fmt=".0f",
           annot_kws={"size": 18}, ax=axes[0])
sb.heatmap(test_confusion, annot=True, fmt=".0f",
           annot_kws={"size": 18}, ax=axes[1])

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes classifier created with its default hyperparameters.
naive = GaussianNB()

In [None]:
# Train on the TRAINING split. Fitting on the test split would make the test
# accuracy meaningless, because the model would be scored on data it has seen.
# The one-column label frame is flattened to a 1-D array for the estimator.
naive.fit(X_train, y_train.values.ravel())

In [None]:
# Accuracy of the naive Bayes model on the training split.
y_train_pred = naive.predict(X_train)
accuracy_train = accuracy_score(y_train,y_train_pred)
print("Training set: ", accuracy_train)

# Accuracy on the test split. The predictions must come from `naive`;
# using `svc` here reported the support-vector classifier's score instead.
y_test_pred = naive.predict(X_test)
accuracy_test = accuracy_score(y_test,y_test_pred)
print("Test set: ", accuracy_test)

# Plot the Confusion Matrix for Train (left) and Test (right)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(confusion_matrix(y_train, y_train_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[0])
sb.heatmap(confusion_matrix(y_test, y_test_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[1])