In [3]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn import svm

from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

## 1) Data Wrangling

In [4]:
# Read the wine-quality CSV into a DataFrame and preview its first rows.
wine_csv_path = "../input/wine-quality-dataset/WineQT.csv"
df = pd.read_csv(wine_csv_path)
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [5]:
# Remove the 'Id' column, then shift every quality score down by 3
# (e.g. a score of 5 becomes 2).
df = df.drop(columns=['Id'])
df['quality'] = df['quality'].sub(3)

In [6]:
# Preview the frame after the 'Id' drop and quality shift.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2


In [7]:
# Print column names, non-null counts and dtypes of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 107.3 KB


In [None]:
# Number of rows.
len(df)

In [None]:
# Number of columns.
len(df.columns)

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# How many distinct quality scores occur.
df.quality.nunique()

In [None]:
# The distinct quality scores, in order of first appearance.
pd.unique(df['quality'])

In [None]:
# Share of rows falling into each quality score.
df['quality'].value_counts().div(len(df))

In [None]:
# Summary statistics with the first row of describe() skipped, transposed so
# that each feature is one row.
# Optional styling: append .style.background_gradient(cmap='Blues')
df.describe().iloc[1:].transpose()

## 2) Data Visualization

In [None]:
# Independent copy of the data, ordered by ascending quality, for the plots below.
df_category = df.copy().sort_values(by='quality', ascending=True)

In [None]:
# Map the shifted numeric quality scores to readable labels.
# The result is assigned back instead of calling replace(..., inplace=True) on a
# column selection: that chained form operates on a temporary object under pandas
# Copy-on-Write and would leave the column unchanged.
quality_labels = {0: "Terrible", 1: "Very Poor", 2: "Poor", 3: "Good", 4: "Very Good", 5: "Excellent"}
df_category["Quality Category"] = df_category["quality"].replace(quality_labels)

In [None]:
# Visualize the frequency distribution of quality:
# pie chart of category shares (left) and annotated count plot (right).

f,ax=plt.subplots(1,2,figsize=(22,8))

ax[0] = df_category["Quality Category"].value_counts().plot.pie(autopct='%1.1f%%', ax=ax[0], fontsize=13, colors = ['#34495E','#566573', '#5D6D7E', '#85929E', '#AEB6BF','#EBEDEF'])
ax[0].set_title("Distribution of Wine Quality in Dataset", fontsize=20)
ax[0].legend(bbox_to_anchor=(1, 1), fontsize=12)

# Target the right-hand axes explicitly rather than relying on whichever axes
# pyplot currently considers "current".
ax[1] = sns.countplot(x="Quality Category", data=df_category, palette=['#EBEDEF','#85929E', '#34495E', '#566573', '#5D6D7E','#AEB6BF'], ax=ax[1])
ax[1].set_title("Frequency distribution of quality", fontsize=20)

# Label each bar with its percentage of all rows.
n_rows = len(df_category['Quality Category'])
for p in ax[1].patches:
    ax[1].annotate('{:.1f}%'.format(100*p.get_height()/n_rows), (p.get_x()+0.1, p.get_height()+5))

ax[1].tick_params(axis='x', labelrotation=70, labelsize=12)
# Called after drawing, so this context applies to figures created later.
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":20,"axes.labelsize":14})

plt.show()

In [None]:
# Empty container for the numeric-score legend table built in the next cells.
quality_table = dict()

In [None]:
# The six shifted quality scores, 0 through 5.
quality_table['Values'] = list(range(6))

In [None]:
# One-row legend table: each numeric score shown under its category label.
category_names = ['Terrible', 'Very Poor', 'Poor', 'Good', 'Very Good', 'Excellent']
df_comparison = pd.DataFrame(quality_table).transpose()
df_comparison.columns = category_names

df_comparison.style.background_gradient(cmap='Blues')

In [None]:
# Pairwise scatter/histogram matrix over every column.
sns.pairplot(data=df)
plt.show()

In [None]:
# Histogram of each of the first 11 columns, laid out on a 2 x 6 grid.
fig = plt.figure(figsize=(16, 8))
gs1 = gridspec.GridSpec(2, 6)

axs = []
for slot, column in enumerate(df.columns[:11]):
    panel = fig.add_subplot(gs1[slot])
    axs.append(panel)
    panel.hist(df[column])
    panel.set_title(column)

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlation matrix.
corr = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr, cmap='Blues', annot=True)
plt.show()

In [None]:
# Absolute correlation of every feature with quality, ascending.
# After sorting, the last entry is quality's correlation with itself, so it is dropped.
plt.figure(figsize=(15,8))
df_corr_bar = df.corr()['quality'].abs().sort_values()[:-1]
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=df_corr_bar.index, y=df_corr_bar.values, palette="Blues_d").set_title('Feature Correlation Distribution According to Quality', fontsize=20)
plt.xticks(rotation=70, fontsize=14)
plt.show()

In [None]:
# Box plot of volatile acidity for each quality category.
sns.set(rc={'figure.figsize': (12, 9)})

grey_palette = ['#34495E','#566573', '#5D6D7E', '#85929E', '#AEB6BF','#EBEDEF']
ax = sns.boxplot(data=df_category, x="Quality Category", y="volatile acidity", palette=grey_palette)
ax.set_title('Boxplot for Volatile Acidity vs Quality', fontsize=20)
ax.set_xlabel('Quality', fontsize=14)
ax.set_ylabel('Volatile Acidity', fontsize=14)

In [None]:
# Box plot of alcohol content for each quality category.
sns.set(rc={'figure.figsize': (12, 9)})

grey_palette = ['#34495E','#566573', '#5D6D7E', '#85929E', '#AEB6BF','#EBEDEF']
ax = sns.boxplot(data=df_category, x="Quality Category", y="alcohol", palette=grey_palette)
ax.set_title('Boxplot for Alcohol vs Quality', fontsize=20)
ax.set_xlabel('Quality', fontsize=14)
ax.set_ylabel('Alcohol', fontsize=14)

In [None]:
# Binned regression plot of citric acid against fixed acidity.
# The former bare `figsize=(22,20)` assignment was removed: it only bound an
# unused variable and never affected the figure, whose size comes from the
# matplotlib/seaborn settings in effect.
ax=sns.regplot(x="fixed acidity", y="citric acid", data=df,x_bins=25)

ax.set_xlim(4,16)
ax.set_ylim(0)
ax.set_title('Correlation Between Fixed Acidity and Citric Acid', fontsize=20)
ax.set_xlabel('Fixed Acidity',fontsize=14)
ax.set_ylabel('Citric Acid',fontsize=14)

In [None]:
# Binned regression plot of pH against fixed acidity.
# The former bare `figsize=(22,20)` assignment was removed: it only bound an
# unused variable and never affected the figure.
ax=sns.regplot(x="fixed acidity", y="pH", data=df,x_bins=25)

ax.set_title('Correlation Between Fixed Acidity and pH', fontsize=20)
ax.set_xlabel('Fixed Acidity',fontsize=14)
ax.set_ylabel('pH',fontsize=14)

In [None]:
# Mean of every feature within each quality score, drawn as grouped bars.
average = df.groupby("quality").mean()
average.plot.bar(figsize=(20, 8))
plt.show()

In [None]:
# Kernel density estimate of the 'density' column.
f, ax = plt.subplots(figsize=(10,8))
x = df['density']
x = pd.Series(x, name="density")
# `fill=True` replaces the deprecated `shade=True`; the target axes are passed
# explicitly so the plot lands on the figure created above.
ax = sns.kdeplot(x=x, fill=True, color='blue', ax=ax)
ax.set_title("Distribution of density variable", fontsize=20)
plt.show()

In [None]:
# Line plots of sulphates and citric acid across the quality categories.
plt.figure(figsize=(15, 7))
for feature, line_color in (("sulphates", "g"), ("citric acid", "b")):
    sns.lineplot(data=df_category, x="Quality Category", y=feature, color=line_color, label=feature)

plt.xlim(0, 5)
plt.ylim(0)

plt.ylabel("Quantity", fontsize=14)
plt.xlabel("Quality", fontsize=14)
plt.title("Feature Impact on quality", fontsize=20)
plt.legend()
plt.show()

## 3) Feature Importance (Random Forest)

In [None]:
# Target is the shifted quality score; every other column is a feature.
y = df['quality']
X = df.drop(columns=['quality'])

# test_size=0.2 keeps 20% of the rows for testing and 80% for training;
# random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

In [None]:
# Fit a 500-tree random forest on the training split.
forest = RandomForestClassifier(n_estimators=500,random_state=0).fit(X_train, y_train)

# Predict the Test set results
y_pred = forest.predict(X_test)

# Check accuracy score. The tree count is read from the fitted model so the
# message cannot drift from the configuration (it previously claimed 1000 trees
# while 500 were trained).
print(f'Model accuracy score with {forest.n_estimators} decision-trees : {accuracy_score(y_test, y_pred):0.4f}')

### 3.1) Visualize Prediction Score

In [None]:
# Pair each test row's predicted label with its true label in a new frame
# (indexed like y_test).
y_predict = list(forest.predict(X_test))
predicted_df = pd.DataFrame({'predicted_values': y_predict, 'original_values': y_test})

In [None]:
# Preview predictions next to the true labels.
predicted_df.head(n=5)

In [None]:
# How many test rows belong to each true quality score.
predicted_df.original_values.value_counts()


In [None]:
# Flag each test row as "correct" or "incorrect". A vectorised comparison
# replaces the previous row-by-row iterrows() loop and yields the same labels.
is_match = predicted_df['original_values'].eq(predicted_df['predicted_values'])
predicted_df['error'] = np.where(is_match, "correct", "incorrect")

In [None]:
# The raw predictions are no longer needed once the error flag exists.
predicted_df = predicted_df.drop(columns=['predicted_values'])

In [None]:
# Stacked bars: correct vs. incorrect predictions for each true quality score.
color = ['#566573', '#AF6276']
counts_by_outcome = predicted_df.groupby(['original_values', 'error'])['original_values'].count()
test = counts_by_outcome.unstack('error').fillna(0)
test.plot(kind='bar', stacked=True, color=color)
plt.title('Prediction Error Rate According to Category', fontsize=15)
plt.xticks(rotation=0, fontsize=12)
plt.ylim(0)

### 3.2) Determine Feature Importance

In [None]:
# Forest feature importances labelled by column name, largest first.
raw_importances = pd.Series(forest.feature_importances_, index=X_train.columns)
feature_scores = raw_importances.sort_values(ascending=False)

feature_scores

In [None]:
# Horizontal bar chart of feature importances expressed as percentages.
fig, ax = plt.subplots(figsize =(16, 9))

# The grid switch is passed positionally: the former `b=` keyword was renamed to
# `visible=` and later removed from matplotlib, while the positional form works
# on every version.
ax.grid(True, color ='grey',
        linestyle ='-.', linewidth = 0.5,
        alpha = 0.5)

# One colour per bar, dark to light.
colors = ['#00223b', '#012a44', '#023c5d', '#034d75', '#18658b', '#357d9f', '#5194b2', '#6fa8c0', '#8dbbce', '#abcfdc', '#bddae5']

feature_scores_perc = [per*100 for per in feature_scores]

# Horizontal Bar Plot
ax.barh(feature_scores.index, feature_scores_perc, color=colors)

ax.set_title('Feature Importance', fontsize=20)
ax.set_xlabel('Percentage', fontsize=18)
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)

plt.show()

In [None]:
# Spread of each feature's importance across the individual trees.
# The per-tree importances come back in X_train column order, whereas
# feature_scores is sorted by importance; the deviations are therefore labelled
# and re-ordered to match, otherwise every error bar is drawn on the wrong bar.
# (Named importance_std so it no longer shadows `from numpy import std`.)
importance_std = pd.Series(
    np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0),
    index=X_train.columns,
).reindex(feature_scores.index)

fig, ax = plt.subplots()
feature_scores.plot.bar(yerr=importance_std.to_numpy(), ax=ax)
ax.set_title("Feature importances using MDI", fontsize=20)
ax.set_ylabel("Mean decrease in impurity", fontsize=18)
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)
fig.tight_layout()

In [None]:
# Alcohol against quality (left) and the alcohol histogram (right).
# Both axes are created in one call: the former bare plt.subplots(figsize=...)
# produced an extra full-figure axes underneath the two panels. The legend call
# was dropped because no artist carries a label.
fig, (ax_bar, ax_hist) = plt.subplots(1, 2, figsize=(15, 5))

# One bar per row; bars sharing a quality value are drawn on top of each other.
ax_bar.bar(df['quality'], df['alcohol'])
ax_bar.set_title('Alcohol vs. Quality')
ax_bar.set_xlabel('quality')
ax_bar.set_ylabel('alcohol')

ax_hist.hist(df['alcohol'])
ax_hist.set_title("Alcohol Histogram")
plt.show()

## 4) Yes or No Categorization

In [None]:
# Rescale every column (quality included) to the [0, 1] range and preview the result.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
normal_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print(normal_df.head())

In [None]:
# Binary label on a copy of the data: "yes" when the shifted quality is 3 or above, else "no".
df_yn = df.copy()
df_yn["good wine"] = np.where(df_yn['quality'] >= 3, "yes", "no")


In [None]:
# Features and target for the yes/no task.
# "good wine" is removed from X together with "quality": it is the target
# itself (and is derived from quality), so leaving it among the features would
# leak the answer to any model trained on X.
X = df_yn.drop(["quality", "good wine"], axis = 1)
y = df_yn["good wine"]

In [None]:
# Class balance of the yes/no label.
# The series is passed as x= by keyword: newer seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=y)
plt.show()

# 5) MODELLING

In [None]:
# Containers filled in by the modelling cells below
# (model name -> [mean accuracy, standard deviation]).
model_comparison = dict()
rf_feature_imp = dict()

## 5.1) NORMALIZED DATA

In [None]:
# Min-max scale the feature columns only, re-attach the untouched target, then split.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set minima/maxima influence the scaling -- confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
df_drop_quality = df.drop(columns=['quality'])
normal_df = pd.DataFrame(scaler.fit_transform(df_drop_quality), columns=df_drop_quality.columns)
normal_df['quality'] = df['quality']

y1 = normal_df['quality']
X1 = normal_df.drop(columns=['quality'])
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=0)

In [None]:
# Shuffled 10-fold splitter with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# Default random forest (random_state=0, n_estimators left at its default) on the scaled features.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train1, y_train1)
y_pred = forest.predict(X_test1)
holdout_accuracy = accuracy_score(y_test1, y_pred) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy over the full scaled data set.
scores = cross_val_score(forest, X1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

## 5.2) RANDOM FOREST - TUNING MODEL

In [None]:
# Rebuild features/target from the unscaled frame for the tuning experiments.
y = df['quality']
X = df.drop(columns=['quality'])

# test_size=0.2 keeps 20% of the rows for testing and 80% for training;
# random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Shuffled 10-fold splitter with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# Coarse grid search over a few RandomForestClassifier hyper-parameters.

parameters = {'min_samples_split': [2,5], 'max_features':[1,5], 'max_depth':[14,24]}

# Seeded like every other model in this notebook; an unseeded forest can make
# the selected "best" parameters change from run to run.
rf = RandomForestClassifier(random_state=0)

print('Paramaters:', rf.get_params())

clf = GridSearchCV(rf, parameters, cv=10).fit(X_train, y_train)

print(f'Best Hyperparameters: {clf.best_params_}')

### Controlling RandomForestClassifier() Tuning 

In [None]:
# Baseline: default random forest (random_state=0, n_estimators left at its default).
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy over the full data set.
scores = cross_val_score(forest, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Random forest with random_state=0, n_estimators=500, max_depth=14.
forest = RandomForestClassifier(n_estimators=500, max_depth=14, random_state=0)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy over the full data set.
scores = cross_val_score(forest, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# random_state=0, n_estimators=500, max_depth=14, min_samples_split=2, max_features=5
# (the header comment previously listed min_samples_split=3 and max_features=1,
# which did not match the arguments actually passed)
forest = RandomForestClassifier(n_estimators=500,max_depth=14, random_state=0, min_samples_split=2, max_features=5).fit(X_train, y_train)
y_pred = forest.predict(X_test)
print('Model accuracy score : {0:0.2f}%'. format(accuracy_score(y_test, y_pred)*100))

scores = cross_val_score(forest, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

# Exhaustive search around the configuration above.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1, so
# including it only produced failed fits.
param_dist = {"max_depth": range(1,15),
              "max_features": range(1,10),
              "criterion": ["gini", "entropy"],
              "min_samples_split": range(2,5)}

# n_jobs=-1 spreads the many candidate fits over all cores, as the
# cross_val_score calls in this notebook already do.
forest_cv = GridSearchCV(forest, param_dist, cv=5, n_jobs=-1)

forest_cv.fit(X_train,y_train)

print(forest_cv.best_params_)

# Result noted by the author from an earlier run:
# {'criterion': 'gini', 'max_depth': 14, 'max_features': 1, 'min_samples_split': 2}

In [None]:
# Store the latest cross-validation summary (mean %, std %) for the random forest.
rf_summary = [scores.mean()*100.0, scores.std()*100.0]
model_comparison['Random Forest'] = rf_summary
rf_feature_imp['Random Forest'] = list(rf_summary)

### Random Forest Parameter Observations

**min_impurity_decrease** - no impact even though it is selected as 0.0000001

**criterion="entropy"** - decreases accuracy to 0.6681

**min_samples_leaf** - if 1 same, if higher decreases

**max_features** - does not affect result

**max_leaf_nodes** - 0.6943 when it is higher than 300

**max_samples** - no significant effect

**balanced_subsample** - decreases

**oob_score** - no effect

## 5.3) DECISION TREE - TUNING MODEL

In [None]:
# Coarse grid search over a few DecisionTreeClassifier hyper-parameters.
# (The comment previously named RandomForestClassifier; the variable keeps the
# name `rf` only so existing references continue to resolve.)

parameters = {'min_samples_split': [2,5], 'max_features':[1,5], 'max_depth':[14,24]}

# Seeded: with max_features below the feature count the tree samples candidate
# features at random, so an unseeded search is not repeatable.
rf = DecisionTreeClassifier(random_state=0)

print('Paramaters:', rf.get_params())

clf = GridSearchCV(rf, parameters, cv=10).fit(X_train, y_train)

print(f'Best Hyperparameters: {clf.best_params_}')

In [None]:
# Baseline decision tree with default settings and a fixed seed.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy over the full data set.
scores = cross_val_score(decision_tree, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Decision tree with max_depth=14, max_features=5, min_samples_split=2.
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=14, max_features=5, min_samples_split=2)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy over the full data set.
scores = cross_val_score(decision_tree, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Store the latest cross-validation summary (mean %, std %) for the decision tree.
dt_mean_pct = scores.mean()*100.0
dt_std_pct = scores.std()*100.0
model_comparison['Decision Tree'] = [dt_mean_pct, dt_std_pct]

### Decision Tree Parameter Observations

**criterion:** as entropy decreases accuracy

**splitter:** as random changes accuracy between 0.58-0.62

**max_depth:** does not increase the accuracy but decrease according to value

**max_leaf_nodes:** does not increase but decrease

**class_weight:** decreases accuracy

**min_impurity_decrease:** decreases accuracy

**ccp_alpha:** does not increase but decrease

## 5.4) SVC - MODEL TUNING

In [None]:
# Support-vector classifier with its default kernel.
svc = svm.SVC(random_state=0)
svc.fit(X_train, y_train)
holdout_accuracy = svc.score(X_test, y_test) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy over the full data set.
scores = cross_val_score(svc, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Linear kernel (noted by the author as the best option).
svc = svm.SVC(kernel='linear', random_state=42)
svc.fit(X_train, y_train)
holdout_accuracy = svc.score(X_test, y_test) * 100
print('Model accuracy score : {0:0.2f}%'.format(holdout_accuracy))

# Cross-validated accuracy with a freshly built shuffled 10-fold splitter.
cv = KFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_val_score(svc, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Store the latest cross-validation summary (mean %, std %) for the SVC.
svc_mean_pct = scores.mean()*100.0
svc_std_pct = scores.std()*100.0
model_comparison['SVC'] = [svc_mean_pct, svc_std_pct]

## 5.5) MODEL COMPARISON

In [None]:
# One row per model, ordered from lowest to highest mean accuracy.
df_comparison = pd.DataFrame.from_dict(
    model_comparison,
    orient='index',
    columns=['Mean Accuracy', 'Standard Deviation'],
)
df_comparison = df_comparison.sort_values(by='Mean Accuracy', ascending=True)
df_comparison.style.background_gradient(cmap='Blues')

# 6) INCREASING ACCURACY - DROP LEAST IMPORTANT FEATURES

In [None]:
# Shuffled 10-fold splitter with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=0)

## 6.1) DROPPING LEAST IMPORTANT FEATURES

In [None]:
# Evaluate the forest without the 'residual sugar' feature.
dropped_features = ['residual sugar']
X_train1 = X_train.drop(dropped_features, axis=1)
X_test1 = X_test.drop(dropped_features, axis=1)
tree=RandomForestClassifier(n_estimators=500,max_depth=14, random_state=0, min_samples_split=2, max_features=1).fit(X_train1,y_train)
# Cross-validate on the reduced feature set. Scoring on the full X (as before)
# refits the model with every feature, so the reported accuracy never
# reflected the dropped column.
scores = cross_val_score(tree, X.drop(dropped_features, axis=1), y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Evaluate the forest without 'residual sugar' and 'free sulfur dioxide'.
# Both columns are dropped from the original split, so this cell no longer
# depends on X_train1 left behind by an earlier cell and can be re-run safely.
dropped_features = ['residual sugar', 'free sulfur dioxide']
X_train1 = X_train.drop(dropped_features, axis=1)
X_test1 = X_test.drop(dropped_features, axis=1)
tree=RandomForestClassifier(n_estimators=500,max_depth=14, random_state=0, min_samples_split=2, max_features=1).fit(X_train1,y_train)
# Cross-validate on the reduced feature set. Scoring on the full X (as before)
# refits the model with every feature, so the reported accuracy never
# reflected the dropped columns.
scores = cross_val_score(tree, X.drop(dropped_features, axis=1), y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.2f%%, Standard Deviation: (%.2f%%)" % (scores.mean()*100.0, scores.std()*100.0))

## 6.2) PIPELINE

In [None]:
# AdaBoost (SAMME) over the tuned random forest.
from sklearn.ensemble import AdaBoostClassifier
# random_state is set on AdaBoost itself: the booster re-seeds each base
# estimator from its own random_state, so seeding only the inner forest does not
# make the results repeatable.
clf = AdaBoostClassifier(RandomForestClassifier(n_estimators=500,max_depth=14, random_state=0, min_samples_split=2, max_features=1),n_estimators=50,learning_rate=1.0, algorithm='SAMME', random_state=0)
clf.fit(X_train, y_train)
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.3f%%, Standard Deviation: (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))

In [None]:
# Standardise the features, then fit the tuned random forest, as one pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

tuned_forest = RandomForestClassifier(n_estimators=500, max_depth=14, random_state=0, min_samples_split=2, max_features=1)
pipeline = make_pipeline(StandardScaler(), tuned_forest)
pipeline.fit(X_train, y_train)

# Cross-validated accuracy of the whole pipeline on the full data set.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.3f%%, Standard Deviation: (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))

## 6.3) ADDING NEW DATASET TO INCREASE DATASET SIZE

In [None]:
# Load the second red-wine file and bring it to the same layout as df:
# drop the 'Unnamed: 0' column, swap dotted column names for spaced ones and
# shift quality down by 3.
dotted_to_spaced = {'fixed.acidity':'fixed acidity', 'volatile.acidity':'volatile acidity', 'citric.acid':'citric acid', 'residual.sugar':'residual sugar', 'free.sulfur.dioxide':'free sulfur dioxide', 'total.sulfur.dioxide':'total sulfur dioxide'}
df_new = pd.read_csv("../input/red-wine-dataset/wineQualityReds.csv")
df_new = df_new.drop(columns=['Unnamed: 0'])
df_new['quality'] = df_new['quality'].sub(3)
df_new = df_new.rename(columns=dotted_to_spaced)
df_new.head()

In [None]:
# Stack the two data sets. ignore_index=True gives the result a fresh 0..n-1
# index; without it both inputs keep their own labels, so the same label
# appears twice and label-based lookups become ambiguous.
# NOTE(review): the two files may contain the same wines -- check for duplicate
# rows before trusting accuracy measured on the combined data.
df_combine = pd.concat([df,df_new], ignore_index=True)
df_combine.head()

In [None]:
# Print column names, non-null counts and dtypes of the combined frame.
df_combine.info()

In [None]:
# Copy the combined data and add a readable label for each shifted quality score.
# The mapped column is assigned back instead of calling replace(..., inplace=True)
# on a column selection: that chained form operates on a temporary object under
# pandas Copy-on-Write and would leave the column unchanged.
df_combine_category=df_combine.copy()
df_combine_category["Quality Category"] = df_combine_category["quality"].replace({0: "Terrible", 1: "Very Poor", 2: "Poor", 3: "Good", 4: "Very Good", 5: "Excellent"})


In [None]:
# Quality distribution of the combined data set:
# pie chart of category shares (left) and annotated count plot (right).
f,ax=plt.subplots(1,2,figsize=(22,8))

ax[0] = df_combine_category["Quality Category"].value_counts().plot.pie(autopct='%1.1f%%', ax=ax[0], fontsize=13, colors = ['#b5c6e0','#becee4', '#c7d5e7', '#d0ddeb', '#d9e5ee','#ebf4f5'])
ax[0].set_title("Distribution of Wine Quality in Larger Dataset", fontsize=20)
ax[0].legend(bbox_to_anchor=(1, 1), fontsize=12)

# Target the right-hand axes explicitly rather than relying on whichever axes
# pyplot currently considers "current". Bars are ordered by descending frequency.
ax[1] = sns.countplot(x="Quality Category", data=df_combine_category, order = df_combine_category['Quality Category'].value_counts().index, palette=['#b5c6e0','#becee4', '#c7d5e7', '#d0ddeb', '#d9e5ee','#ebf4f5'], ax=ax[1])
ax[1].set_title("Frequency distribution of quality", fontsize=20)

# Label each bar with its percentage of all rows.
n_rows = len(df_combine_category['Quality Category'])
for p in ax[1].patches:
    ax[1].annotate('{:.1f}%'.format(100*p.get_height()/n_rows), (p.get_x()+0.1, p.get_height()+5))

ax[1].tick_params(axis='x', labelrotation=70, labelsize=12)
# Called after drawing, so this context applies to figures created later.
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":20,"axes.labelsize":14})

plt.show()

In [None]:
# Features/target from the combined data and the tuned (not yet fitted) forest.
X = df_combine.drop(['quality'], axis=1)
y = df_combine['quality']

# random_state=0 matches every other split in this notebook; without it the
# hold-out accuracy reported below changes on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
forest = RandomForestClassifier(n_estimators=500,max_depth=14, random_state=0, min_samples_split=2, max_features=1)

In [None]:
# Fit the tuned forest on the combined training split and score the hold-out split.
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)

# The tree count is read from the model so the message cannot drift from the
# configuration (it previously claimed 1000 trees).
print(f'Model accuracy score with {forest.n_estimators} decision-trees : {accuracy_score(y_test, y_pred):0.4f}')

# Cross-validated accuracy over the full combined data set.
cv = KFold(n_splits=10, random_state=0, shuffle=True)
scores = cross_val_score(forest, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Mean Accuracy: %.3f%%, Standard Deviation: (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))