In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
data = pd.read_csv('WineQT.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


### Data Splitting

In [3]:
# inut features
X = data.drop('quality',axis=1)
# output / label
Y = data['quality']

In [4]:
X.shape


(1143, 12)

In [5]:
Y.shape


(1143,)

### Break data into train and test set


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [7]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(914, 12) (229, 12) (914,) (229,)


### Feature Scaling


In [8]:
from sklearn.preprocessing import StandardScaler,MaxAbsScaler,MinMaxScaler

scalar = StandardScaler()

X_train = scalar.fit_transform(X_train)

X_test = scalar.transform(X_test) # we have computed U = 0 and sigma=1 

### Data Modeling


In [9]:
from sklearn.tree import DecisionTreeClassifier
DTmodel = DecisionTreeClassifier() # by default it uses GINI index

DTmodel.fit(X_train, y_train)

### Prediction

In [10]:
DTmodel.predict(X_test)

array([4, 6, 6, 5, 7, 7, 5, 5, 5, 5, 7, 6, 6, 6, 6, 6, 6, 4, 5, 7, 7, 5,
       5, 6, 4, 5, 7, 7, 5, 6, 5, 6, 8, 5, 5, 5, 5, 6, 6, 6, 7, 6, 6, 6,
       5, 5, 5, 7, 5, 7, 6, 5, 5, 7, 6, 6, 5, 7, 7, 5, 5, 6, 7, 7, 5, 5,
       8, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 5, 6, 6, 5, 5, 6, 5, 6,
       5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 6, 5, 6, 7, 5, 6, 5, 5, 6, 6, 6, 7,
       7, 5, 5, 5, 5, 6, 5, 8, 5, 5, 5, 4, 5, 6, 5, 6, 3, 6, 5, 6, 5, 5,
       7, 6, 5, 4, 4, 6, 5, 6, 7, 5, 5, 5, 7, 5, 5, 5, 5, 5, 7, 5, 6, 4,
       6, 5, 5, 5, 7, 6, 5, 6, 4, 6, 7, 5, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5,
       7, 6, 5, 5, 6, 5, 7, 6, 5, 5, 6, 3, 6, 6, 6, 6, 7, 5, 6, 6, 4, 6,
       7, 6, 5, 6, 5, 6, 7, 7, 5, 5, 6, 8, 5, 7, 5, 6, 5, 6, 6, 5, 6, 5,
       7, 6, 5, 5, 4, 7, 6, 6, 6], dtype=int64)

### Model Evalution

In [11]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, DTmodel.predict(X_test)))

[[ 0  0  0  0  0  0]
 [ 0  1  2  2  1  0]
 [ 1  7 63 24  1  0]
 [ 1  2 32 48 13  3]
 [ 0  0  0  9 17  0]
 [ 0  0  0  0  1  1]]


In [12]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,DTmodel.predict(X_test))

0.5676855895196506

In [13]:
from sklearn.metrics import mean_absolute_error , mean_squared_error
mse = mean_squared_error(y_test,DTmodel.predict(X_test))
mae = mean_absolute_error(y_test,DTmodel.predict(X_test))
print(mse,mae)



0.6200873362445415 0.4890829694323144


### MOdel Training using RandomForests

In [14]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

model.fit(X_train, y_train)

In [15]:
model.predict(X_test)

array([5, 6, 5, 5, 7, 6, 5, 5, 5, 5, 7, 6, 6, 6, 5, 5, 6, 5, 5, 7, 6, 5,
       5, 7, 5, 5, 7, 6, 5, 6, 6, 5, 7, 6, 5, 5, 6, 6, 6, 5, 6, 6, 6, 5,
       5, 5, 5, 6, 5, 6, 7, 5, 6, 7, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6,
       6, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 6, 5, 6, 5, 6, 6, 5, 5, 6, 5, 5,
       5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 6, 6, 7,
       7, 5, 5, 6, 5, 6, 6, 6, 6, 5, 5, 5, 5, 6, 5, 6, 6, 6, 5, 6, 5, 5,
       7, 6, 7, 5, 5, 5, 5, 6, 7, 5, 5, 5, 6, 5, 5, 5, 5, 5, 6, 5, 6, 5,
       6, 5, 5, 5, 7, 6, 5, 5, 6, 6, 7, 5, 6, 6, 6, 5, 6, 6, 6, 5, 5, 5,
       6, 6, 6, 5, 6, 5, 6, 5, 5, 5, 6, 5, 6, 6, 6, 6, 7, 5, 5, 6, 5, 6,
       7, 6, 6, 6, 5, 6, 7, 7, 5, 5, 6, 6, 5, 6, 6, 6, 5, 6, 6, 5, 7, 6,
       6, 5, 5, 5, 5, 6, 6, 6, 6], dtype=int64)

In [16]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, model.predict(X_test)))

[[ 0  3  3  0  0]
 [ 0 77 18  1  0]
 [ 0 28 67  4  0]
 [ 0  0 11 15  0]
 [ 0  0  2  0  0]]


In [17]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,model.predict(X_test))

0.6943231441048034

In [18]:
from sklearn.metrics import mean_absolute_error , mean_squared_error
mse = mean_squared_error(y_test,model.predict(X_test))
mae = mean_absolute_error(y_test,model.predict(X_test))
print(mse,mae)



0.38427947598253276 0.3318777292576419


### important :- n_estimators = 100 by default 

In [19]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50) # n_estimators = are the number of Decision tree use in the RandomForests 

model.fit(X_train, y_train)
from sklearn.metrics import accuracy_score

accuracy_score(y_test,model.predict(X_test))

0.7161572052401747

## Hyperparameter Tuning For RandomForest

In [20]:
param_grid = {
    'n_estimators': [50, 100, 200, 500],  # Number of trees
    'max_depth': [None, 10, 20, 30],      # Max depth of trees
    'min_samples_split': [2, 5, 10],      # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 4],        # Minimum number of samples per leaf
    'bootstrap': [True, False]            # Whether to use bootstrap samples
}


In [23]:
from sklearn.model_selection import  GridSearchCV, RandomizedSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [24]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 500}

In [25]:
best_rf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_rf.fit(X_train, y_train)

# Evaluate on test set
accuracy = best_rf.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.6943
