In [1]:
# !pip install xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Wine dataset hosted on GitHub; Customer_Segment is the classification target.
WINE_CSV_URL = 'https://raw.githubusercontent.com/aishwaryamate/Datasets/main/Wine.csv'
df = pd.read_csv(WINE_CSV_URL)
df

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


## Model Building

In [3]:
# Features are every column except the last; the target is Customer_Segment.
feature_columns = df.columns[:-1]
x = df[feature_columns]
y = df.loc[:, 'Customer_Segment']

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Baseline: a single decision tree with default hyperparameters.
# random_state is pinned because scikit-learn permutes features at each split,
# so an unseeded tree can pick different (equally good) splits and report a
# different test score on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xtrain, ytrain)
ypred = dt.predict(xtest)
print(f"Train: {dt.score(xtrain,ytrain)}")
print(f"Test: {dt.score(xtest,ytest)}")
print(classification_report(ytest, ypred))

Train: 1.0
Test: 0.9629629629629629
              precision    recall  f1-score   support

           1       1.00      0.95      0.97        19
           2       0.91      1.00      0.95        21
           3       1.00      0.93      0.96        14

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.97      0.96      0.96        54



- `dt.score(xtrain, ytrain)` : This calculates the accuracy score of the trained decision tree classifier (dt) on the training dataset (xtrain, ytrain). It uses the model to predict the output based on the features in xtrain and compares these predictions to the actual labels in ytrain. The score() method returns the accuracy of the model on the training data.

- `dt.score(xtest, ytest)` : Similarly, this calculates the accuracy score of the trained decision tree classifier (dt) on the testing dataset (xtest, ytest). It uses the model to predict the output based on the features in xtest and compares these predictions to the actual labels in ytest. The score() method returns the accuracy of the model on the testing data.

## Automating Model building

In [6]:
def prediction(model):
    """Fit ``model`` on the training split and print an evaluation summary.

    Reads the notebook-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``
    variables, so it always evaluates against whatever split is current.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``.

    Prints the train score, the test score and the classification report
    for the test split. Returns ``None``.
    """
    model.fit(xtrain, ytrain)
    test_predictions = model.predict(xtest)
    train_accuracy = model.score(xtrain, ytrain)
    test_accuracy = model.score(xtest, ytest)
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    print(classification_report(ytest, test_predictions))

In [7]:
# Same default decision tree, evaluated through the helper.
# Seeded so the reported accuracy does not change between runs.
prediction(DecisionTreeClassifier(random_state=42))

Train Accuracy: 1.0
Test Accuracy: 0.9444444444444444
              precision    recall  f1-score   support

           1       0.94      0.89      0.92        19
           2       0.91      1.00      0.95        21
           3       1.00      0.93      0.96        14

    accuracy                           0.94        54
   macro avg       0.95      0.94      0.95        54
weighted avg       0.95      0.94      0.94        54



In [8]:
# Logistic regression with scikit-learn's default settings.
log_reg = LogisticRegression()
prediction(log_reg)

Train Accuracy: 0.9596774193548387
Test Accuracy: 0.9814814814814815
              precision    recall  f1-score   support

           1       1.00      0.95      0.97        19
           2       0.95      1.00      0.98        21
           3       1.00      1.00      1.00        14

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



In [9]:
# Gradient boosting with default hyperparameters, as a reference point before tuning.
gbc_default = GradientBoostingClassifier()
prediction(gbc_default)

Train Accuracy: 1.0
Test Accuracy: 0.9074074074074074
              precision    recall  f1-score   support

           1       0.86      1.00      0.93        19
           2       0.90      0.90      0.90        21
           3       1.00      0.79      0.88        14

    accuracy                           0.91        54
   macro avg       0.92      0.90      0.90        54
weighted avg       0.91      0.91      0.91        54



## Fine tuning Model:

In [10]:
# Search space for tuning the gradient boosting model: 3 * 5 * 4 = 60 combinations.
params = dict(
    n_estimators=[90, 110, 150],
    learning_rate=[0.5, 0.6, 0.7, 0.9, 1],
    max_depth=[3, 5, 7, 11],
)

The `params` dictionary lists the hyperparameters, and the candidate values for each, that will be searched for the gradient boosting model (Gradient Boosting Machine, GBM). These hyperparameters strongly influence the performance and behavior of the model.

Here's an explanation of the parameters included in the `params` dictionary:

- `n_estimators`: This parameter determines the number of boosting stages (trees) that will be used in the ensemble. It represents the number of sequential trees to be built. In the context of boosting algorithms like AdaBoost or Gradient Boosting Machines, more estimators generally allow the model to learn more complex relationships in the data. The values provided here—90, 110, and 150—represent the different choices for the number of estimators to be considered.

- `learning_rate`: This hyperparameter controls the contribution of each tree in the sequence to the model. It scales the contribution of each tree by multiplying the predictions of each tree by the learning rate. Smaller learning rates generally require more trees to capture complex relationships but may improve generalization. The values provided—0.5, 0.6, 0.7, 0.9, and 1—represent different learning rates that can be tested.

- `max_depth`: This parameter controls the maximum depth of each individual tree in the boosting sequence. A larger `max_depth` allows the trees to grow deeper and potentially capture more complex patterns in the data. However, larger depths can also lead to overfitting. The values provided—3, 5, 7, and 11—represent different choices for the maximum depth of the trees.

The purpose of defining these parameters within the dictionary is typically for hyperparameter tuning or performing a grid search or randomized search over these hyperparameters. By specifying different values for these parameters, a data scientist or a machine learning practitioner can explore different combinations of these hyperparameters to find the set that yields the best model performance. This process helps in finding the optimal combination of hyperparameters for the given dataset, improving the model's predictive accuracy and generalization to new data.

`learning_rate`


The learning rate refers to a hyperparameter that determines the step size or rate at which a model learns or updates its parameters during training. It's a value that scales the magnitude of the updates made to the model's parameters in response to the calculated errors.

In simpler terms, think of the learning rate as the size of the steps a model takes while learning from the data. A larger learning rate means the model learns faster, taking larger steps, which might help it converge to a solution more quickly. However, using too large of a learning rate can cause the model to overshoot or miss the optimal solution and could lead to instability or divergence.

On the other hand, a smaller learning rate means the model learns more slowly, taking smaller steps. This might make the learning process more precise and stable, potentially allowing the model to find a better solution. However, using an excessively small learning rate might make the training process very slow or get stuck in local minima, affecting the model's ability to learn effectively.

Choosing an appropriate learning rate is crucial because it directly impacts how effectively the model learns from the data. It's often determined through experimentation and tuning, finding a balance where the model learns efficiently without compromising accuracy or stability. Adjusting the learning rate is part of the hyperparameter tuning process in machine learning to optimize the performance of the model on unseen or test data.

In [11]:
# Exhaustive search over `params` with GridSearchCV's default cross-validation.
# The estimator is seeded so that best_params_ is the same every time this
# cell is re-run; an unseeded booster can make the winning combination change.
grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid=params,
    verbose=1,
)

grid.fit(xtrain,ytrain)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


verbose=1: This is an optional parameter that controls the verbosity of the output during the fitting process. Setting it to 1 provides additional information about the progress of the grid search.

In [12]:
# Hyperparameter combination that scored best during the cross-validated search.
grid.best_params_

{'learning_rate': 0.9, 'max_depth': 3, 'n_estimators': 110}

In [13]:
# Re-train with the combination the grid search actually selected.
# Unpacking best_params_ keeps this cell in sync with the search; the previous
# hand-typed call used n_estimators=90 although the search reported 110.
prediction(GradientBoostingClassifier(**grid.best_params_))

Train Accuracy: 1.0
Test Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        19
           2       1.00      1.00      1.00        21
           3       1.00      1.00      1.00        14

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



In [14]:
# Distinct class labels present in the target column.
pd.unique(df['Customer_Segment'])

array([1, 2, 3], dtype=int64)

The code below raises an error:

`prediction(XGBClassifier())`

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1 2 3]

- The XGBClassifier assumes a zero-based indexing for class labels, where the classes should be represented as integers from 0 to (number of classes - 1). This means that if your classes in the target variable y start from 1, 2, 3, etc., the classifier will expect them to be in the range 0, 1, 2, etc., respectively.
- scikit-learn's own classifiers accept arbitrary class labels (the models above were trained on the labels 1, 2, 3 without any error), but XGBoost's scikit-learn wrapper expects the labels to run from 0 to n-1, where n is the number of unique classes in the dataset. Encoding the target this way keeps it consistent and compatible across different algorithms and libraries.

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# Convert the class labels in the target variable y so that they start from 0 (here 1, 2, 3 become 0, 1, 2).

In [17]:
# Target before encoding: the labels are still 1, 2 and 3.
y

0      1
1      1
2      1
3      1
4      1
      ..
173    3
174    3
175    3
176    3
177    3
Name: Customer_Segment, Length: 178, dtype: int64

In [18]:
# Learn the label set first, then map each label onto consecutive integers from 0.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [19]:
# ytrain was produced by the earlier split, so it still holds the original
# 1/2/3 labels; the data must be split again to pick up the encoded target.
ytrain

138    3
104    2
78     2
36     1
93     2
      ..
71     2
106    2
14     1
92     2
102    2
Name: Customer_Segment, Length: 124, dtype: int64

In [20]:
# Split again so ytrain/ytest carry the encoded labels; test size and seed are
# unchanged from the first split.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
# After re-splitting, ytrain contains the encoded labels 0, 1 and 2.
ytrain

array([2, 1, 1, 0, 1, 0, 2, 1, 1, 2, 0, 0, 0, 2, 0, 0, 1, 2, 1, 0, 2, 1,
       0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 2, 1, 1, 1, 0, 1, 1, 1, 2, 2, 0,
       1, 2, 2, 1, 1, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, 2, 0, 2, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 2, 1, 1, 1, 2, 2, 1, 0, 0, 1, 2, 2, 0, 1, 2,
       2, 2, 2, 1, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 2, 2, 0, 0, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 1, 1], dtype=int64)

In [22]:
# XGBoost with default hyperparameters, now that the labels start at 0.
xgb_default = XGBClassifier()
prediction(xgb_default)

Train Accuracy: 1.0
Test Accuracy: 0.9629629629629629
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.95      0.95      0.95        21
           2       1.00      0.93      0.96        14

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.96      0.96      0.96        54



In [23]:
# Hyperparameter search space for tuning XGBClassifier.
# learning_rate (XGBoost's `eta`) is documented with the range [0, 1], so the
# former out-of-range candidates 2, 3 and 4 are replaced by 0.2, 0.3 and 0.4.
# The grid still has 3 * 5 * 4 = 60 combinations.
params = {
    'n_estimators' : [90,110,150],
    'learning_rate' : [0.1,0.2,0.3,0.4,1],
    'max_depth' : [3,5,7,11]
}

In [24]:
# Grid search over the XGBoost search space defined in `params`.
grid = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=params,
    verbose=1,
)
grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [25]:
# Best XGBoost hyperparameter combination found by the search.
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 90}

In [26]:
# Re-train XGBoost with the combination chosen by the grid search.
# Unpacking best_params_ instead of re-typing the values keeps this cell
# correct whenever the search space or the data changes.
prediction(XGBClassifier(**grid.best_params_))

Train Accuracy: 1.0
Test Accuracy: 0.9629629629629629
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.95      0.95      0.95        21
           2       1.00      0.93      0.96        14

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.96      0.96      0.96        54



## Stacking

In [27]:
# Base learners for stacking, as (name, estimator) pairs: one logistic
# regression and two decision trees (default criterion and entropy).
models = [
    ('lg', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    ('dt1', DecisionTreeClassifier(criterion='entropy')),
]

In [28]:
# Display the (name, estimator) pairs that will be passed to the stacking model.
models

[('lg', LogisticRegression()),
 ('dt', DecisionTreeClassifier()),
 ('dt1', DecisionTreeClassifier(criterion='entropy'))]

In [29]:
# Stack the base learners and let a shallow random forest combine their outputs.
stacked_model = StackingClassifier(
    estimators=models,
    final_estimator=RandomForestClassifier(max_depth=3),
)
prediction(stacked_model)

Train Accuracy: 0.9838709677419355
Test Accuracy: 0.9814814814814815
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        19
           1       0.95      1.00      0.98        21
           2       1.00      1.00      1.00        14

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54

