## Problem Statement:
The objective is to predict the quality of a wine: whether it is of good quality or of ordinary quality.

The data set contains several parameters that are considered important when determining the quality of a wine.

The dataset consists of several predictor variables and one target variable, Quality. The target variable takes values from 0 to 10; each value is the rating given to that wine on a 10-point scale.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [None]:
# Mount Google Drive at /content/drive so the dataset path used by the
# following read_csv cell resolves. This only works inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the wine-quality CSV on the mounted Google Drive; change this
# one constant when running the notebook against a different copy of the data.
WINE_CSV_PATH = '/content/drive/MyDrive/0.MKCE/5.Random Forest/3 Take-Home Assignment/wine.csv'

# Load the raw dataset into a DataFrame used by every later cell.
wine = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Preview the first 10 rows to sanity-check that the file was parsed as expected.
wine.head(10)

In [None]:
# Show the dtype pandas inferred for each column.
wine.dtypes

In [None]:
# (rows, columns) of the loaded dataset.
wine.shape

We can see here that the data set contains 1599 observations and 12 attributes

In [None]:

# Plot the distribution of every column (the eleven predictors and the target)
# on a 4x3 grid: a density-normalised histogram with a KDE curve on top.
sns.set()
fig, axes = plt.subplots(4, 3, figsize=(15, 15))

cols = ['fixed acidity', 'volatile acidity', 'citric acid',
        'residual sugar', 'chlorides', 'free sulfur dioxide',
        'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

# sns.distplot is deprecated and scheduled for removal, so the same
# histogram + KDE overlay is drawn with histplot and kdeplot instead.
# Each column gets its own explicit Axes rather than relying on the pyplot
# "current axes" state and a manual subplot counter.
for ax, col in zip(axes.flat, cols):
    sns.histplot(wine[col], stat="density", color="grey", alpha=0.4,
                 edgecolor="k", linewidth=1, ax=ax)
    sns.kdeplot(wine[col], color="red", ax=ax)
plt.show()

Here we can see the distribution of all the attributes, including the target variable

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
# (values shown with two decimals).
corr_matrix = wine.corr()

plt.figure(figsize=(8, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    linewidths=0.05,
    cmap="magma",
)
plt.show()

In [None]:
# One bar chart per predictor, with the quality rating on the x-axis and the
# predictor on the y-axis, laid out on a 4x3 grid.
sns.set_style("whitegrid")
fig = plt.figure(figsize=[15, 15])

cols = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'residual sugar', 'chlorides', 'free sulfur dioxide',
    'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
]

# Subplot positions are 1-based, hence start=1.
for position, col in enumerate(cols, start=1):
    plt.subplot(4, 3, position)
    sns.barplot(data=wine, x='quality', y=col)
plt.show()

In [None]:
# Scatter-plot matrix of every pair of columns, coloured by the quality rating.
sns.pairplot(wine, hue = 'quality')
plt.show()

In [None]:
# Number of wines per quality value, using the 'quality' column as it stands
# at this point in the notebook.
sns.countplot(data = wine, x = 'quality')
plt.show()

In [None]:
# Collapse the rating into a binary target: 1 when the rating is above 6.5,
# otherwise 0.
# NOTE: this overwrites 'quality' in place, so the cell is not idempotent.
# Running it a second time compares the 0/1 labels against 6.5 and turns
# every label into 0. Reload the CSV before re-running.
is_good_quality = wine['quality'] > 6.5
wine['quality'] = is_good_quality.astype(int)

In [None]:
# Count of wines per value of 'quality', to show the class balance of the target.
sns.countplot(data = wine, x = 'quality')
plt.show()

In [None]:
# Separate the predictors from the target column.
# The axis must not be passed positionally: `drop('quality', 1)` was deprecated
# in pandas 1.3 and raises a TypeError in pandas 2.x. `columns=` is explicit
# and works on every version.
x = wine.drop(columns='quality')
y = wine['quality']

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, random_state = 100)

In [None]:
# First model: 200 entropy-criterion trees, depth capped at 12, with leaf and
# split size floors.
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a classifier it meant sqrt(n_features), so "sqrt" gives the same forest
# and is accepted by every version.
clf_rf = RandomForestClassifier(n_estimators = 200, criterion = 'entropy',
                                min_samples_split = 10, min_samples_leaf = 9, max_features = "sqrt",
                                random_state = 500, max_depth = 12)

In [None]:
# Train the first forest on the training split; clf_fit is whatever fit()
# returns and is the object used for all predictions from this model.
clf_fit = clf_rf.fit(x_train, y_train) 
# Predicted labels for the held-out rows.
y_pred = clf_fit.predict(x_test) 

In [None]:
# Fraction of held-out rows whose predicted label matches the true label (first model).
accuracy_score(y_test, y_pred)

In [None]:
# Per-class probability estimates from the first model for the held-out rows.
y_proba = clf_fit.predict_proba(x_test)

In [None]:
# Confusion matrix of true vs. predicted labels for the first model.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Keep only the second column of the probability matrix.
# NOTE(review): presumably this is the probability of class 1 (good quality) --
# confirm against clf_fit.classes_.
y_predicted = y_proba[:,1]
print(y_predicted)

In [None]:
# ROC curve for the first model, built from the scores in y_predicted, and
# its area under the curve.
# `auc` comes from the imports cell at the top of the notebook rather than a
# mid-notebook import, so all dependencies are visible in one place.
fpr, tpr, thresholds = roc_curve(y_test, y_predicted)
roc_auc = auc(fpr, tpr)
print("Area under the curve for first model",roc_auc)

In [None]:
# ROC curve of the first model against the diagonal of a random classifier.
roc_fig, roc_ax = plt.subplots()

roc_ax.plot(
    fpr,
    tpr,
    color='orange',
    lw=2,
    label='ROC curve (area under curve =%0.2f)' % roc_auc,
)
# Chance-level reference line.
roc_ax.plot([0, 1], [0, 1], color='darkgrey', lw=2, linestyle='--')

roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_xlabel('False Positive Rate (1-Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
roc_ax.set_title('ROC Curve for first model')
roc_ax.legend(loc="upper left")
plt.show()

In [None]:
# Second model: a smaller, shallower forest (100 gini-criterion trees, depth
# capped at 3) for comparison with the first one.
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a classifier it meant sqrt(n_features), so "sqrt" gives the same forest
# and is accepted by every version.
clf_rf2 = RandomForestClassifier(n_estimators = 100, criterion = 'gini',
                                min_samples_split = 2, min_samples_leaf = 5, max_features = "sqrt",
                                random_state = 100, max_depth = 3)

In [None]:
# Train the second forest on the same training split; clf_fit_2 is whatever
# fit() returns for this model.
clf_fit_2 = clf_rf2.fit(x_train, y_train) 
# Predicted labels from the second model for the held-out rows.
y_pred_2 = clf_fit_2.predict(x_test) 

In [None]:
# Fraction of held-out rows the second model labels correctly.
# accuracy_score is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
accuracy_score(y_test, y_pred_2)

In [None]:
# Per-class probability estimates from the SECOND model. These must come from
# clf_fit_2: taking them from clf_fit would simply reproduce the first model's
# scores, and with them the first model's ROC/AUC.
y_proba_2 = clf_fit_2.predict_proba(x_test)

In [None]:
# Second column of the probability matrix held in y_proba_2.
# NOTE(review): presumably the probability of class 1 (good quality) --
# confirm against the classifier's classes_ attribute.
y_predicted_2 = y_proba_2[:,1]

# ROC curve and area under the curve for these scores. This overwrites the
# fpr / tpr / thresholds / roc_auc values computed for the first model.
# `auc` comes from the imports cell at the top of the notebook.
fpr, tpr, thresholds = roc_curve(y_test, y_predicted_2)
roc_auc = auc(fpr, tpr)
print("Area under the curve for the second model",roc_auc)