**Importing Necessary Libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

**Importing dataset and examining it**

In [2]:
# Load the raw data and take a first look at its contents, size, dtypes and summary statistics.
dataset = pd.read_csv("Beverage.csv")
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
dataset.info()
print(dataset.describe())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol quality  
0      8.8  Normal  
1      9.5  Normal  
2     10.1  Normal  
3 

**Converting the categorical quality label into a binary numerical label**

In [3]:
def converter(column):
    """Map a single quality label to a binary class.

    Returns 1 when the value equals 'Excellent' and 0 for any other value.
    """
    return 1 if column == 'Excellent' else 0

# Encode the label as 1 ('Excellent') / 0 (anything else).
# The dtype guard keeps this cell idempotent: once the column is numeric,
# applying converter again would map every row to 0 and wipe out the positive class.
if not pd.api.types.is_numeric_dtype(dataset['quality']):
    dataset['quality'] = dataset['quality'].apply(converter)

**Dividing dataset into label and feature sets**

In [4]:
# Separate the label column from the remaining feature columns,
# then report the type and shape of each part.
Y = dataset['quality'] # Labels
X = dataset.drop(columns='quality') # Features
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(4898, 11)
(4898,)


**Normalizing numerical features so that each feature has mean 0 and variance 1**

In [5]:
# Fit the scaler on the full feature set, then standardise those same features.
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

**Implementing Random Forest Classifier**

In [6]:
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# SMOTE is a step of the imblearn Pipeline, so it is part of what GridSearchCV fits on each split.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1) )
    ])
grid_param = {'classification__n_estimators': [100, 150, 200, 250, 300]}

# Candidates are ranked by precision, averaged over 5 cross-validation folds
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='precision', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

{'classification__n_estimators': 100}
0.5405981710772059


In [7]:
# Building random forest using the tuned parameter
# (n_estimators=100 is the value reported by the grid search in the previous cell).
# max_features='sqrt' replaces 'auto': identical meaning for classifiers, but 'auto'
# was removed in scikit-learn 1.3.
rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features='sqrt', random_state=1)
rfc.fit(X_scaled,Y)
# Label each importance with its original column name and rank from most to least important
featimp = pd.Series(rfc.feature_importances_, index=list(X)).sort_values(ascending=False)
print(featimp)

alcohol                 0.145159
density                 0.122280
volatile acidity        0.090512
residual sugar          0.088370
chlorides               0.087347
pH                      0.084545
free sulfur dioxide     0.084179
total sulfur dioxide    0.084078
citric acid             0.075975
sulphates               0.072697
fixed acidity           0.064858
dtype: float64


In [8]:
# Selecting features with higher significance and redefining feature set
top_features = ['alcohol', 'density', 'volatile acidity']
X_ = dataset[top_features]

# Refit the scaler on the reduced feature set and standardise it
feature_scaler = StandardScaler().fit(X_)
X_scaled_ = feature_scaler.transform(X_)

In [9]:
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# This run uses only the reduced feature set (X_scaled_) and a smaller n_estimators grid.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1) )
    ])
grid_param = {'classification__n_estimators': [10, 20, 30, 40, 50]}

# Candidates are ranked by precision, averaged over 5 cross-validation folds
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='precision', cv=5)

gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)


{'classification__n_estimators': 40}
0.4236757242292713


**Implementing PCA to create principal components**

In [10]:
# Fit a 6-component PCA on the standardised features and project them onto those components
pca = PCA(n_components = 6).fit(X_scaled)
x_pca = pca.transform(X_scaled)
# Share of total variance captured by each component, and by all six together
variance_ratios = pca.explained_variance_ratio_
print("Variance explained by each of the n_components: ", variance_ratios)
print("Total variance explained by the n_components: ", sum(variance_ratios))

Variance explained by each of the n_components:  [0.29293217 0.14320363 0.11106103 0.09259294 0.08848496 0.08534014]
Total variance explained by the n_components:  0.813614873394286


**Creating Random Forest Classifier using principal components**

In [11]:
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# This run trains on the principal components (x_pca) instead of the original features.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1) )
    ])
grid_param = {'classification__n_estimators': [100, 150, 200, 250, 300]}

# Candidates are ranked by precision, averaged over 5 cross-validation folds
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='precision', cv=5)

gd_sr.fit(x_pca, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)


{'classification__n_estimators': 150}
0.4808906922331871
