# **Telco Customer Churn Prediction using Random Forest Model**
---
Dataset link : https://www.kaggle.com/datasets/yeanzc/telco-customer-churn-ibm-dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import random
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score

# **Read the** **Data**

In [None]:
churn=pd.read_excel('/content/Telco_customer_churn.xlsx')    #path of data.csv

# **Explore , understand and get values of the dataset to plan data cleaning**


In [None]:
churn.head(10)

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices
5,4190-MFLUW,1,United States,California,Los Angeles,90020,"34.066367, -118.309868",34.066367,-118.309868,Female,...,Month-to-month,No,Credit card (automatic),55.2,528.35,Yes,1,78,5925,Competitor offered higher download speeds
6,8779-QRDMV,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582,Male,...,Month-to-month,Yes,Electronic check,39.65,39.65,Yes,1,100,5433,Competitor offered more data
7,1066-JKSGK,1,United States,California,Los Angeles,90024,"34.066303, -118.435479",34.066303,-118.435479,Male,...,Month-to-month,No,Mailed check,20.15,20.15,Yes,1,92,4832,Competitor made better offer
8,6467-CHFZW,1,United States,California,Los Angeles,90028,"34.099869, -118.326843",34.099869,-118.326843,Male,...,Month-to-month,Yes,Electronic check,99.35,4749.15,Yes,1,77,5789,Competitor had better devices
9,8665-UTDHZ,1,United States,California,Los Angeles,90029,"34.089953, -118.294824",34.089953,-118.294824,Male,...,Month-to-month,No,Electronic check,30.2,30.2,Yes,1,97,2915,Competitor had better devices


# **Identify any potential problems, such as missing data or data types  are not compatible**


In [None]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

# **Check for missing values in each column**


In [None]:
missing_values = churn.isnull().sum()

# Determine the maximum column width
max_width = max(len(str(column)) for column in missing_values.index)

for column, missing in missing_values.items():
    print("{:<{width}}{}".format(column, missing > 0, width=max_width))

CustomerID       False
Count            False
Country          False
State            False
City             False
Zip Code         False
Lat Long         False
Latitude         False
Longitude        False
Gender           False
Senior Citizen   False
Partner          False
Dependents       False
Tenure Months    False
Phone Service    False
Multiple Lines   False
Internet Service False
Online Security  False
Online Backup    False
Device ProtectionFalse
Tech Support     False
Streaming TV     False
Streaming Movies False
Contract         False
Paperless BillingFalse
Payment Method   False
Monthly Charges  False
Total Charges    False
Churn Label      False
Churn Value      False
Churn Score      False
CLTV             False
Churn Reason     True


# **Dataset Shape**


In [None]:
churn.shape

(7043, 33)

# **Check if there's a Duplicated Rows**


In [None]:
churn.duplicated().sum()

0

# **Count Missing Values on 'Churn Reason' column**


In [None]:
churn['Churn Reason'].isnull().sum()

5174

# **Remove unwanted columns 'Churn Reason'**


In [None]:
#The axis=1 argument tells Pandas to drop the columns, rather than the rows.
churn = churn.drop(['Churn Reason','Churn Value'], axis=1)

# **Verify that the removal was successful**

In [None]:
if 'Churn Reason' not in churn and 'Churn Value' not in churn :
    print('The Churn Reason and Churn Value columns have been removed.')

else:
    print('The Churn Reason and Churn Value columns have not been removed.')

The Churn Reason and Churn Value columns have been removed.


# **Verify that dataset has no null values**

In [None]:
churn.isnull().sum().sum()

0

# **Data Type Conversion**

In [None]:
encoder = preprocessing.LabelEncoder()

for i in churn.columns:
  if churn[i].dtype == 'object':
    churn[i] = churn[i].astype(str)
    churn[i] = encoder.fit_transform(churn[i])

# **Check if all columns are numeric**


In [None]:
if churn.select_dtypes(include=['number']).shape[1] == churn.shape[1]:
    print("All columns are numeric.")
else:
    print("Dataset contains non-numeric columns.")

All columns are numeric.


# **Dataset after preprocessing**

In [None]:
churn

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Score,CLTV
0,2564,1,0,0,562,90003,327,33.964131,-118.272783,1,...,0,0,0,1,3,53.85,157,1,86,3239
1,6511,1,0,0,562,90005,405,34.059281,-118.307420,0,...,0,0,0,1,2,70.70,925,1,67,2701
2,6551,1,0,0,562,90006,393,34.048013,-118.293953,0,...,2,2,0,1,2,99.65,6104,1,86,5372
3,5604,1,0,0,562,90010,410,34.062125,-118.315709,0,...,2,2,0,1,2,104.80,2646,1,84,5003
4,174,1,0,0,562,90015,385,34.039224,-118.266293,1,...,2,2,0,1,0,103.70,4265,1,89,5340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1758,1,0,0,521,92285,587,34.341737,-116.539416,0,...,1,1,2,1,0,21.15,770,0,45,5306
7039,4853,1,0,0,2,92301,636,34.667815,-117.536183,1,...,2,2,1,1,3,84.80,1597,0,59,2140
7040,1525,1,0,0,22,92304,625,34.559882,-115.637164,0,...,2,2,1,1,1,103.20,5698,0,71,5560
7041,3367,1,0,0,26,92305,512,34.167800,-116.864330,0,...,0,0,0,1,2,29.60,2994,0,59,2793


# **Split the DataFrame into a training set=75% and a testing set=25%**


In [None]:

#The x variable --> independent variables,
X = churn.drop('Churn Label', axis=1)

#The y variable --> dependent'Target' variable.
y = churn['Churn Label']

#The random_state parameter is used to ensure that the data is split randomly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Print the shape of the training set-->(row,column)
print(X_train.shape)

# Print the shape of the testing set-->(row,column)
print(X_test.shape)


(5282, 30)
(1761, 30)


# **Random Forest model**

In [None]:
clf_rf = RandomForestClassifier()

#fit--> creating a number of decision trees and  trained on a subset of the training data.
#The decision trees are combined to make a prediction.
clf_rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf_rf.predict(X_test)

# **Evaluate the performance of the model**

In [None]:
# Accuracy score: Compare the predicted labels (y_pred) with the actual labels (y_test).
accuracy = round(accuracy_score(y_test, y_pred)*100,2)
precision = round(precision_score(y_test, y_pred)*100,2)
recall = round(recall_score(y_test, y_pred)*100,2)
F1_score = round(f1_score(y_test, y_pred)*100,2)

print("Accuracy:", accuracy,"%")
print("Precision:", precision,"%")
print("Recall:", recall,"%")
print("F1-score:", F1_score,"%")

Accuracy: 92.5 %
Precision: 88.63 %
Recall: 84.37 %
F1-score: 86.45 %


# **Feature Selection by PSO Algorithm**
--------------------------------------

# **Calculate Accuracy for each chromosome**

In [None]:
def calculate_accuracy(X_selected):
    X=churn[X_selected]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    clf_rf = RandomForestClassifier()
    clf_rf.fit(X_train, y_train)
    accuracy=round(clf_rf.score(X_test,y_test)*100,2)
    print('Accuracy:',accuracy,"%")
    return accuracy

# **Convert Features to list**

In [None]:
features=churn.drop(labels= 'Churn Label', axis= 1).columns.values.tolist()

# **Features Names**

In [None]:
print(features)

['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code', 'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method', 'Monthly Charges', 'Total Charges', 'Churn Score', 'CLTV']


# **Choose Random Features For Each Chromosome**

In [None]:
num_rows, features_num = X.shape
print(features_num)

chromosome_size=10
features_probability=[0,1]
#All chromosomes with features probability , cont
chromosomes=[]
#chromosomes
for i in range(chromosome_size):
    chromosome_with_features_probability=[]
    #feature/column
    for i in range(features_num):
        item = random.choice(tuple(features_probability))
        chromosome_with_features_probability.append(item)
    chromosomes.append(chromosome_with_features_probability)

30


# **Take The Selected Features Only**

In [None]:
def data(chromosome):
    chromosomes_with_selected_features=[]
    for i in range(len(chromosome)):
        if chromosome[i]==1:
                chromosomes_with_selected_features.append(features[i])
    return chromosomes_with_selected_features

# **Calculate personal best accuracy for  chromosomes**

In [None]:
old_fitness_values=[]

def calculate_fitness_values():
    for i in range(len(chromosomes)):
         old_fitness_values.append(calculate_accuracy(data(chromosomes[i])))

calculate_fitness_values()

Accuracy: 76.09 %
Accuracy: 92.45 %
Accuracy: 76.43 %
Accuracy: 72.46 %
Accuracy: 73.99 %
Accuracy: 78.53 %
Accuracy: 91.94 %
Accuracy: 90.06 %
Accuracy: 78.59 %
Accuracy: 77.85 %


# **Calculate Personal Best**

Store its value in The Original list"chromosome"

In [None]:
def Calculate_PersonalBest(chromosomes1):
    new_fitness_values=[]
    for i in range(len(chromosomes1)):
        new_fitness_values.append(calculate_accuracy(data(chromosomes1[i])))
    for j in range(len(new_fitness_values)):
        if(new_fitness_values[j]>old_fitness_values[j]):
            chromosomes[j]=chromosomes1[j]
            old_fitness_values[j]=new_fitness_values[j]
    return new_fitness_values

# **Calculate Velocity**

In [None]:
def checkvelocity(globalbest):
    velocity=[]
    for j in range(len(chromosomes)):
        velocity.append(list(0+1*(np.random.random(1)[0])*(np.array(chromosomes[j])-np.array(chromosomes[j]))+1*(np.random.random(1)[0])*(np.array(globalbest)-np.array(chromosomes[j]))))
    #print(velocity)
    return velocity

# **New Position**

In [None]:
def addingchromosomes(velocity):
    chromosomes2=[]
    for i in range(len(velocity)):
        nextchromo=[]
        for j in range(len(velocity[i])):
            nextchromo.append(chromosomes[i][j]+velocity[i][j])
        chromosomes2.append(nextchromo)
    return chromosomes2

# **Normalization**

In [None]:
def normalize(chromosomes2):
    for l in range(len(chromosomes2)):
        for m in range(len(chromosomes2[l])):
            if chromosomes2[l][m]>0.5:
                chromosomes2[l][m]=1
            else:
                chromosomes2[l][m]=0
    return chromosomes2

# **Calculate Global Best**

In [None]:
max(old_fitness_values)
ind = old_fitness_values.index(max(old_fitness_values))
globalbest=chromosomes[ind]
for i in range(5):
    chromosomes2=[]
    personal=[]
    velocity=checkvelocity(globalbest)
    chromosomes2=addingchromosomes(velocity)
    chromosomes2=normalize(chromosomes2)
    personal=Calculate_PersonalBest(chromosomes2)
    globalbest=[]
    max(old_fitness_values)
    ind = old_fitness_values.index(max(old_fitness_values))
    globalbest=chromosomes[ind]

Accuracy: 92.84 %
Accuracy: 92.62 %
Accuracy: 92.9 %
Accuracy: 92.62 %
Accuracy: 92.79 %
Accuracy: 92.96 %
Accuracy: 92.79 %
Accuracy: 92.45 %
Accuracy: 79.1 %
Accuracy: 77.97 %
Accuracy: 92.45 %
Accuracy: 92.28 %
Accuracy: 92.73 %
Accuracy: 92.5 %
Accuracy: 92.28 %
Accuracy: 92.62 %
Accuracy: 92.5 %
Accuracy: 92.39 %
Accuracy: 92.45 %
Accuracy: 77.0 %
Accuracy: 92.33 %
Accuracy: 92.79 %
Accuracy: 92.67 %
Accuracy: 93.02 %
Accuracy: 92.5 %
Accuracy: 92.45 %
Accuracy: 92.5 %
Accuracy: 92.67 %
Accuracy: 92.96 %
Accuracy: 92.67 %
Accuracy: 92.45 %
Accuracy: 92.62 %
Accuracy: 92.67 %
Accuracy: 92.67 %
Accuracy: 92.67 %
Accuracy: 92.79 %
Accuracy: 92.62 %
Accuracy: 92.5 %
Accuracy: 92.84 %
Accuracy: 92.79 %
Accuracy: 92.79 %
Accuracy: 92.73 %
Accuracy: 92.96 %
Accuracy: 92.45 %
Accuracy: 92.84 %
Accuracy: 92.56 %
Accuracy: 92.5 %
Accuracy: 92.84 %
Accuracy: 92.9 %
Accuracy: 92.73 %


# **Max Fitness Value**

In [None]:
max(old_fitness_values)

93.02

# **Global Best position**

In [None]:
ind = old_fitness_values.index(max(old_fitness_values))
print(ind)
globalbest=chromosomes[ind]

3


# **Selected Features**

In [None]:
Selected_Features=data(globalbest)
print(Selected_Features)

['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code', 'Latitude', 'Longitude', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months', 'Phone Service', 'Internet Service', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Payment Method', 'Total Charges', 'Churn Score']


# **Evalution after Feature Selection**

In [None]:
#The x variable --> independent variables
X1=churn[Selected_Features]

#The y variable --> dependent'Target' variable.
y1 = churn['Churn Label']

#The random_state parameter is used to ensure that the data is split randomly.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.25, random_state=42)

# Print the shape of the training set-->(row,column)
print(X_train1.shape)

# Print the shape of the testing set-->(row,column)
print(X_test1.shape)

(5282, 20)
(1761, 20)


In [None]:
clf_rf = RandomForestClassifier()

#fit--> creating a number of decision trees and  trained on a subset of the training data.
#The decision trees are combined to make a prediction.
clf_rf.fit(X_train1, y_train1)

# Make predictions on the test set
y_pred1 = clf_rf.predict(X_test1)

In [None]:
# Accuracy score: Compare the predicted labels (y_pred) with the actual labels (y_test).
accuracy1 = round(accuracy_score(y_test1, y_pred1)*100,2)
precision1 = round(precision_score(y_test1, y_pred1)*100,2)
recall1 = round(recall_score(y_test1, y_pred1)*100,2)
F1_score1 = round(f1_score(y_test1, y_pred1)*100,2)

print("Accuracy:", accuracy1,"%")
print("Precision:", precision1,"%")
print("Recall:", recall1,"%")
print("F1-score:", F1_score1,"%")

Accuracy: 93.13 %
Precision: 89.38 %
Recall: 85.97 %
F1-score: 87.64 %


# **HyperParameter Optimization**
---

In [None]:
pip install pso-optimizer

Collecting pso-optimizer
  Downloading pso_optimizer-1.0.1-py3-none-any.whl (5.1 kB)
Installing collected packages: pso-optimizer
Successfully installed pso-optimizer-1.0.1


# **Search for best HyperParameter**

In [24]:
# RANDOM FOREST
criterion_map_rf = {
     0: "gini",
     1: "entropy",
     2: "log_loss"
 }

min_samples_split_map_rf = {
     0: 2,
     1: 5,
     2: 10
 }

max_features_map_rf = {
    0: "sqrt",
    1: "log2",
    2: None
}


import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from joblib import Parallel, delayed
from pso_optimizer.hyperparameter_mappings import (
    criterion_map_rf,
    max_features_map_rf,
    min_samples_split_map_rf,
)

class PSOOptimizer:

    def __init__(self, estimator, random_seed=42):
        self.estimator = estimator
        self.random_seed = random_seed

    def pso_hyperparameter_optimization(self, X_train, X_test, y_train, y_test, num_particles, num_iterations, c1=1, c2=1, num_jobs=-1, w=0):
        if self.random_seed is not None:
            np.random.seed(self.random_seed)
        hyperparameter_space = self._get_hyperparameter_space()

        # Initialize the population of particles
        population = []
        for _ in range(num_particles):
            hyperparameters = [np.random.choice(hyperparameter_space[param]) for param in hyperparameter_space]
            population.append(hyperparameters)

        # Initialize velocity and best position
        velocity = [[0] * len(hyperparameter_space) for _ in range(num_particles)]
        best_position = population.copy()
        global_best_fitness = -float("inf")
        global_best_position = []

        # PSO optimization loop
        for _ in range(num_iterations):
            fitness = Parallel(n_jobs=num_jobs)(
                delayed(self.evaluate_fitness)(X_train, X_test, y_train, y_test, particle)
                for particle in population
            )

            for j, particle in enumerate(population):
                if fitness[j] > self.evaluate_fitness(X_train, X_test, y_train, y_test, best_position[j]):
                    best_position[j] = particle

            if max(fitness) > global_best_fitness:
                global_best_fitness = max(fitness)
                global_best_position = population[fitness.index(max(fitness))]

            for j, particle in enumerate(population):
                r1 = np.random.uniform(0, 1)
                r2 = np.random.uniform(0, 1)
                velocity[j] = [w * velocity[j][k] + c1 * r1 * (best_position[j][k] - particle[k]) + c2 * r2 * (global_best_position[k] - particle[k]) for k in range(len(hyperparameter_space))]

                for k in range(len(hyperparameter_space)):
                    particle[k] += velocity[j][k]
                    particle[k] = max(min(particle[k], max(hyperparameter_space[list(hyperparameter_space.keys())[k]])),
                        min(hyperparameter_space[list(hyperparameter_space.keys())[k]]))

        return global_best_position, global_best_fitness

    def evaluate_fitness(self, X_train, X_test, y_train, y_test, hyperparameters):
        # Unpack hyperparameters
        estimator_instance = self._create_estimator(hyperparameters)

        estimator_instance.fit(X_train, y_train)
        y_pred = estimator_instance.predict(X_test)
        accuracy_pso = accuracy_score(y_test, y_pred)
        return accuracy_pso

    def _create_estimator(self, hyperparameters):
        # Train the estimator using the hyperparameters
        estimator_instance = None

        if self.estimator == "RF":
            n_estimators_values, max_depth_values, criterion_values, min_samples_split_values, min_samples_leaf_values, min_weight_fraction_leaf_values, max_features_values = hyperparameters
            estimator_instance = RandomForestClassifier(random_state=self.random_seed, n_estimators=int(n_estimators_values), max_depth=int(max_depth_values),
                                                        criterion=criterion_map_rf[round(criterion_values)],
                                                        min_samples_split=min_samples_split_map_rf[round(min_samples_split_values)],
                                                        min_samples_leaf=int(min_samples_leaf_values),
                                                        min_weight_fraction_leaf=min_weight_fraction_leaf_values,
                                                        max_features=max_features_map_rf[round(max_features_values)]
                                                        )

        else:
            raise ValueError("Estimator not supported.")

        return estimator_instance

    def get_hyperparameters(self, best_hyperparameters):
        if self.estimator == "RF":
            print("Optimal hyperparameters:")
            print(f"n_estimators_value: {best_hyperparameters[0]}")
            print(f"max_depth_value: {best_hyperparameters[1]}")
            print(f"Criterion: {criterion_map_rf[best_hyperparameters[2]]}")
            print(f"min_samples_split_values: {min_samples_split_map_rf[best_hyperparameters[3]]}")
            print(f"min_samples_leaf_values: {best_hyperparameters[4]}")
            print(f"min_weight_fraction_leaf_values: {best_hyperparameters[5]}")
            print(f"max_features_values: {max_features_map_rf[best_hyperparameters[6]]}")
        else:
            raise ValueError("Estimator not supported")



    def _get_hyperparameter_space(self):
        # Define hyperparameter space based on the estimator
        if self.estimator == "RF":
            hyperparameter_space = {
                "n_estimators_values": [100, 200,300,400, 500],
                "max_depth_values": [5, 10, 20],
                "criterion_values": [0, 1, 2],
                "min_samples_split_values": [0, 1, 2],
                "min_samples_leaf_values": [1, 2, 3, 4, 5],
                "min_weight_fraction_leaf_values": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
                "max_features_values": [0, 1, 2]
            }
        else:
            raise ValueError("Estimator not supported.")
        return hyperparameter_space



In [26]:
optimizer = PSOOptimizer(estimator="RF")

# The x variable --> independent variables
X2 = churn.drop('Churn Label', axis=1)

# The y variable --> dependent'Target' variable
y2 = churn['Churn Label']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.25, random_state=42)

# Perform hyperparameter optimization
best_hyperparameters, best_score = optimizer.pso_hyperparameter_optimization(X_train2, X_test2, y_train2, y_test2, num_particles=10, num_iterations=5)

print("best score = ",best_score)
# Print best hyperparameters
optimizer.get_hyperparameters(best_hyperparameters=best_hyperparameters)


best score =  0.9273140261215219
Optimal hyperparameters:
n_estimators_value: 100.0
max_depth_value: 10.0
Criterion: entropy
min_samples_split_values: 2
min_samples_leaf_values: 4.0
min_weight_fraction_leaf_values: 0.0
max_features_values: sqrt


# **Radom Forest Model**

In [27]:
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=4,
    min_weight_fraction_leaf=0.0,  # Corrected the syntax and type
    max_features='sqrt'
)

#fit--> creating a number of decision trees and  trained on a subset of the training data.
#The decision trees are combined to make a prediction.
clf_rf.fit(X_train2, y_train2)

# Make predictions on the test set
y_pred2= clf_rf.predict(X_test2)

# **Evaluation after Hyperparameter optimization**

In [28]:
# Accuracy score: Compare the predicted labels (y_pred) with the actual labels (y_test).
accuracy2 = round(accuracy_score(y_test2, y_pred2)*100,2)
precision2 = round(precision_score(y_test2, y_pred2)*100,2)
recall2 = round(recall_score(y_test2, y_pred2)*100,2)
F1_score2 = round(f1_score(y_test2, y_pred2)*100,2)

print("Accuracy:", accuracy2,"%")
print("Precision:", precision2,"%")
print("Recall:", recall2,"%")
print("F1-score:", F1_score2,"%")

Accuracy: 92.84 %
Precision: 88.14 %
Recall: 86.37 %
F1-score: 87.25 %
