In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd   

from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

from sklearn import svm
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from collections import Counter
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler


# 1. Data Preprocessing

## Importing dataset & Checking for missing data

In [None]:
# Column names applied to the CSV: eleven feature columns followed by the
# `quality` target.
headerList = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

# The file is ';'-separated. header=0 together with names= replaces the
# file's own header row with headerList.
wineData = pd.read_csv('winequality-red.csv', header = 0, names = headerList, sep=";")
print(wineData.head(10))
# Show the size instead of dumping the whole frame.
print("Shape (rows, columns):", wineData.shape)

# Summary functions. Only the last expression of a cell is displayed
# automatically, so the intermediate summaries are printed explicitly.
print(wineData.dtypes)
print(wineData.describe())
wineData.info()

# Plotting histogram of each variable
wineData.hist(alpha=0.5, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Force every column to a numeric dtype; entries that cannot be parsed become NaN.
wineData = wineData.apply(pd.to_numeric, errors='coerce')

print("\nChecking for null values: \n")
print(wineData.isna().sum())
# NOTE(review): missing values are replaced with 0, which is not a plausible
# measurement for most of these columns - confirm this is intended.
wineData = wineData.fillna(0)
print("\nChecking for null values after using fillna(): \n")
print(wineData.isna().sum())


### Our data is imbalanced, as seen from the histogram. We will adopt multiple strategies to address the issue.

# 2. Exploratory Data Analysis

## 1. Principal Component Analysis (PCA)

In [None]:
# PCA: project the standardized feature columns onto two principal components
# and colour each sample by its quality label.
df_pca = wineData.copy()
y_pca = df_pca['quality']
features_pca = df_pca.loc[:, 'fixed acidity':'alcohol']

# Standardize before PCA so every feature has zero mean and unit variance.
features_scaled = StandardScaler().fit_transform(features_pca)

# Fit the two-component PCA and keep the projected coordinates.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(features_scaled)

fig_pca, ax_pca = plt.subplots(figsize=(8, 6))
ax_pca.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pca, cmap='rainbow')
ax_pca.set_xlabel('First principal component')
ax_pca.set_ylabel('Second Principal Component')
ax_pca.set_title("Using PCA to Visualize Classes")
plt.show()

# Report the component loadings and how much variance each component explains.
exp_var_rat = pca.explained_variance_ratio_
print("components: ", pca.components_, "\n")
print("explained variance: ", pca.explained_variance_, "\n")
print("explained variance ratio: ", exp_var_rat)

## 2. Correlation Matrix

In [None]:


# Pairwise correlations between all columns of wineData, drawn as an
# annotated heatmap with two-decimal cell labels.
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
corr_matrix = wineData.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax_corr)

## 3. Univariate Selection

In [None]:
# Separate the predictor columns from the target (no train/test split happens here).
y = wineData['quality']
X = wineData.loc[:, 'fixed acidity':'alcohol']

# Score every predictor against the target with the chi-squared statistic.
bestFeatures = SelectKBest(score_func=chi2, k=11)
bestFeaturesFit = bestFeatures.fit(X, y)

# One single-column frame for the predictor names, one for their scores.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(bestFeaturesFit.scores_)

# Place names and scores side by side, then list them from highest to lowest score.
predScores = pd.concat([dfcolumns, dfscores], axis=1)
predScores.columns = ['Predictor', 'Score']
ranked_scores = predScores.nlargest(11, 'Score')
print(ranked_scores)

## Dropping features from univariate selection

### We are dropping the bottom four features because they have very low predictor scores, which also saves computation.

In [None]:
# Drop the bottom four features (smallest score)
lowScoreFeatures = ['density', 'pH', 'chlorides', 'residual sugar']

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
wineData = wineData.drop(columns=lowScoreFeatures, errors='ignore')

# Rebuild the predictor/target views from the reduced frame.
X = wineData.loc[:, 'fixed acidity':'alcohol']
y = wineData['quality']

# Addressing Imbalance in Class

## First Strategy: Oversampling minority class

In [None]:
# Randomly duplicate samples of the minority class only.
# random_state makes the resampled set reproducible across kernel restarts.
# NOTE(review): the resampled data is split into train/test afterwards, so
# duplicated rows can end up in both sets - confirm this is acceptable.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=417)
X_over, y_over = oversample.fit_resample(X, y)
print("Before RandomOverSampler : ", Counter(y))
print("After RandomOverSampler : ", Counter(y_over))

## Second Strategy: Undersampling majority class

In [None]:
# Randomly discard samples of the majority class only.
# random_state makes the retained subset reproducible across kernel restarts.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=417)
X_under, y_under = undersample.fit_resample(X, y)
print("Before RandomUnderSampler : ", Counter(y))
print("After RandomUnderSampler : ", Counter(y_under))

## Third Strategy: SMOTE

In [None]:
# Generate synthetic samples with SMOTE.
# random_state makes the generated samples reproducible across kernel restarts.
# NOTE(review): the resampled data is split into train/test afterwards, so
# synthetic points derived from test rows can leak into training - confirm.
smoteOversample = SMOTE(random_state=417)
X_smote, y_smote = smoteOversample.fit_resample(X, y)

#Plotting histogram of each variable
X_smote.hist(alpha=0.5, figsize=(15, 10))

plt.tight_layout()
plt.show()

# Class distribution after resampling
y_smote.hist(alpha=0.5, figsize=(15, 10))
plt.show()

# Counter is already imported in the notebook's import cell.
print("Before SMOTE : ", Counter(y))
print("After SMOTE : ", Counter(y_smote))

## Fourth Strategy: Data Imputation
### Filling in the missing classes - 0, 1, 2, 9, & 10 - with synthetic (fabricated) data

In [None]:
# Build one synthetic row per quality class that is absent from the data.
# Every synthetic row carries the column-wise mean of X as its feature values.
missingClasses = [0, 1, 2, 9, 10]
avgX = X.mean(axis=0)

# Take the column names from avgX itself rather than indexing it by position
# (positional [] access on a label-indexed Series is deprecated in pandas 2.x),
# so values and labels cannot drift apart if X's columns change.
dfImpute = pd.DataFrame(
    [avgX.to_list()] * len(missingClasses),
    columns=list(avgX.index),
)
dfImpute['quality'] = missingClasses
dfImpute

# 3. Comparing Machine Learning Models / Obtaining Baseline Accuracy

## Modeling - Final data preparations

In [None]:
def _split_and_scale(features, target, scaler, extra_train=None):
    """Split into train/test (80/20, random_state=10) and standardize both parts.

    Parameters
    ----------
    features, target : the predictor frame and the label series to split.
    scaler : scaler that is (re)fitted on the training part only and then
        applied to the test part.
    extra_train : optional (features, target) pair appended to the training
        part after the split, so the extra rows never reach the test set.

    Returns
    -------
    (X_train, X_test, y_train, y_test), with both X parts scaled.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(features, target, test_size=.2, random_state=10)
    if extra_train is not None:
        extra_X, extra_y = extra_train
        # pd.concat replaces DataFrame.append / Series.append, which were
        # removed in pandas 2.0.
        X_tr = pd.concat([X_tr, extra_X])
        y_tr = pd.concat([y_tr, extra_y])
    X_tr = scaler.fit_transform(X_tr)
    X_te = scaler.transform(X_te)
    return X_tr, X_te, y_tr, y_te


# One scaler object is reused; it is refitted on each training set in turn.
scaledData = StandardScaler()

# 1. Regular Data
X_train, X_test, y_train, y_test = _split_and_scale(X, y, scaledData)

# 2. Oversampled Data
X_over_train, X_over_test, y_over_train, y_over_test = _split_and_scale(X_over, y_over, scaledData)

# 3. Undersampled Data
X_under_train, X_under_test, y_under_train, y_under_test = _split_and_scale(X_under, y_under, scaledData)

# 4. SMOTE Data
X_smote_train, X_smote_test, y_smote_train, y_smote_test = _split_and_scale(X_smote, y_smote, scaledData)

# 5. Imputed Data
# The synthetic rows from dfImpute are added to the training part only.
# The test part is now scaled with the same scaler as its training part
# (previously it was left unscaled).
X_impute = wineData.loc[:, 'fixed acidity':'alcohol']
y_impute = wineData['quality']
X_add = dfImpute.loc[:, 'fixed acidity':'alcohol']
y_add = dfImpute['quality']
X_impute_train, X_impute_test, y_impute_train, y_impute_test = _split_and_scale(
    X_impute, y_impute, scaledData, extra_train=(X_add, y_add)
)

## 1. Random Forest Classifier Modeling

In [None]:
def _plot_and_report(y_true, y_hat):
    """Plot the confusion matrix and print the classification report.

    Rows of the matrix are true classes, columns are predicted classes.
    Returns the raw confusion matrix and the report text.
    """
    classLabels = sorted(set(y_true) | set(y_hat))
    matrix = confusion_matrix(y_true, y_hat, labels=classLabels)
    df_matrix = pd.DataFrame(matrix, index=classLabels, columns=classLabels)
    sns.set(font_scale=1.2) # for label size
    # fmt='d' prints integer counts; the default '.2g' shows 100+ as '1e+02'.
    ax = sns.heatmap(df_matrix, annot=True, fmt='d', annot_kws={"size": 16}) # font size
    ax.set(xlabel='Predicted quality', ylabel='True quality')
    plt.show()

    report = classification_report(y_true, y_hat)
    print(report)
    return matrix, report


# Baseline: random forest on the regular (scaled) data
wineRF = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=417)
wineRF.fit(X_train, y_train)

y_pred = wineRF.predict(X_test)
cm, clas = _plot_and_report(y_test, y_pred)


# Using Imputed Data
# The same estimator object is refitted, so wineRF holds the imputed-data
# model after this cell.
# NOTE(review): X_impute_test must be on the same scale as X_impute_train -
# confirm the preparation cell scales it.
wineRF.fit(X_impute_train, y_impute_train)

y_impute_pred = wineRF.predict(X_impute_test)

# Report the imputed model against its own test labels and predictions
# (the baseline y_test / y_pred were reported here before).
cm, clas = _plot_and_report(y_impute_test, y_impute_pred)

## 2. Support Vector Machine Modeling

In [None]:
# Baseline SVM with an RBF kernel
wineSVM = SVC(kernel = 'rbf')
wineSVM.fit(X_train, y_train)
y_pred = wineSVM.predict(X_test)

#Plot confusion matrix (rows: true class, columns: predicted class)
classLabels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=classLabels)
df_cm = pd.DataFrame(cm, index=classLabels, columns=classLabels)
sns.set(font_scale=1.2) # for label size
# fmt='d' prints integer counts; the default '.2g' shows 100+ as '1e+02'.
ax = sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}) # font size
ax.set(xlabel='Predicted quality', ylabel='True quality')
plt.show()

#Print classification report for rbf kernel
clas1 = classification_report(y_test, y_pred)
print("rbf kernel classification report: \n",clas1)

#Print classification report for sigmoid kernel
# wineSVM and y_pred are rebound here, so after this cell they refer to the
# sigmoid-kernel model and its predictions.
wineSVM = SVC(kernel = 'sigmoid')
wineSVM.fit(X_train, y_train)
y_pred = wineSVM.predict(X_test)
clas2 = classification_report(y_test, y_pred)
print("sigmoid kernel classification report: \n", clas2)

## 3. Artificial Neural Network

In [None]:
# Baseline neural network. random_state fixes the weight initialisation and
# SGD shuffling so the reported scores are reproducible.
wineMLP = MLPClassifier(activation='logistic', solver='sgd', random_state=417)
wineMLP.fit(X_train, y_train)
y_pred = wineMLP.predict(X_test)

#Plot confusion matrix (rows: true class, columns: predicted class)
classLabels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=classLabels)
df_cm = pd.DataFrame(cm, index=classLabels, columns=classLabels)
sns.set(font_scale=1.2) # for label size
# fmt='d' prints integer counts; the default '.2g' shows 100+ as '1e+02'.
ax = sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}) # font size
ax.set(xlabel='Predicted quality', ylabel='True quality')
plt.show()

#Print classification report
clas = classification_report(y_test, y_pred)
print(clas)

## 4. Logistic Regression

In [None]:
# Baseline logistic regression; class_weight='balanced' reweights classes
# inversely to their frequency in y_train.
wineLR = LogisticRegression(class_weight='balanced', random_state=417)
wineLR.fit(X_train, y_train)

y_pred = wineLR.predict(X_test)

#Plot confusion matrix (rows: true class, columns: predicted class)
classLabels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=classLabels)
df_cm = pd.DataFrame(cm, index=classLabels, columns=classLabels)
sns.set(font_scale=1.2) # for label size
# fmt='d' prints integer counts; the default '.2g' shows 100+ as '1e+02'.
ax = sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}) # font size
ax.set(xlabel='Predicted quality', ylabel='True quality')
plt.show()

#Print classification report
clas = classification_report(y_test, y_pred)
print(clas)

# 4. Hyperparameter Tuning

## 1. Random Forest Classifier

In [None]:
# Base estimator whose hyperparameters are tuned below
wineRFC = RandomForestClassifier(random_state=417)

# Candidate values: number of trees and maximum tree depth
param_grid_RFC = dict(
    n_estimators=[100, 200, 400, 500],
    max_depth=[5, 6, 7],
)

# Exhaustive search over the grid, scored with 10-fold cross-validation
CV_rfc = GridSearchCV(wineRFC, param_grid_RFC, cv=10)
CV_rfc.fit(X_train, y_train)

# Best combination found by the search
print(CV_rfc.best_params_)

## 2. Support Vector Machine

In [None]:
# Base estimator whose hyperparameters are tuned below
wineSVM = SVC()

# Candidate values: kernel type and regularisation parameter C
param_grid_svm = dict(
    kernel=['rbf', 'sigmoid', 'poly'],
    C=[1, 10, 100, 1000],
)

# Exhaustive search over the grid, scored with 10-fold cross-validation
CV_svm = GridSearchCV(wineSVM, param_grid_svm, cv=10)
CV_svm.fit(X_train, y_train)

# Best combination found by the search
print(CV_svm.best_params_)
           

## 3. Artificial Neural Network

In [None]:
# Base estimator whose hyperparameters are tuned below. random_state makes
# every candidate's training run reproducible.
wineANN = MLPClassifier(random_state=417)
param_grid_ann = {
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [100, 200, 300, 400, 500],
}

# Search the ANN grid. The SVM grid was passed here before, and its
# 'kernel' / 'C' keys are not MLPClassifier parameters.
# The result is stored in CV_ann so the SVM search result in CV_svm is kept.
# The grid has 720 combinations x 10 folds; n_jobs=-1 runs the fits on all
# available cores without changing the outcome.
CV_ann = GridSearchCV(estimator=wineANN, param_grid=param_grid_ann, cv= 10, n_jobs=-1)
CV_ann.fit(X_train, y_train)

#print the result of best hyperparameters
print(CV_ann.best_params_)

# 5. Testing Final Models

## 1. Random Forest Classifier

## 2. Support Vector Machine

## 3. Artificial Neural Network

# 6. Testing best model on the test set
## 1. Random Forest Classifier

## 2. Support Vector Machine

## 3. Artificial Neural Network