<a href="https://colab.research.google.com/github/ahmad-smasri/CODSOFT/blob/main/Project3_Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Customer Churn Prediction

## Libraries to be Used

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Reading CSV File

In [None]:
data_file = pd.read_csv("/content/drive/MyDrive/Project 3 Dataset/Churn_Modelling.csv")
data = pd.DataFrame(data_file)

## Removing unneeded columns

In [None]:
needed_data = data.drop(["RowNumber", "CustomerId", "Surname"], axis =1)
print(needed_data)

      CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619    France  Female   42       2       0.00              1   
1             608     Spain  Female   41       1   83807.86              1   
2             502    France  Female   42       8  159660.80              3   
3             699    France  Female   39       1       0.00              2   
4             850     Spain  Female   43       2  125510.82              1   
...           ...       ...     ...  ...     ...        ...            ...   
9995          771    France    Male   39       5       0.00              2   
9996          516    France    Male   35      10   57369.61              1   
9997          709    France  Female   36       7       0.00              1   
9998          772   Germany    Male   42       3   75075.31              2   
9999          792    France  Female   28       4  130142.79              1   

      HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0   

## Encoding Geography and Gender Categories

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
le = LabelEncoder()

#Geography
# Fit the encoder to the categorical data
le.fit(needed_data["Geography"])

# Transform the categorical data into numerical labels
encoded_geo = le.transform(needed_data["Geography"])
needed_data["Geography"] = encoded_geo

#Gender
# Fit the encoder to the categorical data
le.fit(needed_data["Gender"])

# Transform the categorical data into numerical labels
encoded_gen = le.transform(needed_data["Gender"])
needed_data["Gender"] = encoded_gen

print(needed_data)


      CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619          0       0   42       2       0.00              1   
1             608          2       0   41       1   83807.86              1   
2             502          0       0   42       8  159660.80              3   
3             699          0       0   39       1       0.00              2   
4             850          2       0   43       2  125510.82              1   
...           ...        ...     ...  ...     ...        ...            ...   
9995          771          0       1   39       5       0.00              2   
9996          516          0       1   35      10   57369.61              1   
9997          709          0       0   36       7       0.00              1   
9998          772          1       1   42       3   75075.31              2   
9999          792          0       0   28       4  130142.79              1   

      HasCrCard  IsActiveMember  EstimatedSalary  E

## Preprocessing and Splitting Data

### Distributing Data columns between features and target

In [None]:
#Features
feature_data = needed_data[["CreditScore", "Geography", "Gender", "Age", "Tenure", "Balance",
                            "Balance", "NumOfProducts", "HasCrCard", "IsActiveMember", "EstimatedSalary"]]
#Target
Customer_Churn = needed_data[["Exited"]]

619

### Preprocessing Data

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

#Fitting features
scaler.fit(feature_data)

#Transforming scaled data
feature_data = scaler.transform(feature_data)

### Data Splitting

In [170]:
from sklearn.model_selection import train_test_split

#Splitting Data
X_train, X_test, Y_train, Y_test = train_test_split(feature_data, Customer_Churn, test_size=0.25, random_state=42)
print(Y_test)

      Exited
6252       0
4684       0
1731       0
4742       0
4521       0
...      ...
4862       0
7025       1
7647       0
7161       0
73         0

[2500 rows x 1 columns]


## Train and Test

### Accuracy key indicator

In [171]:
from sklearn.metrics import r2_score


def predict(dataset, target, model):
  # Predict the target column using the model
  predictions = model.predict(dataset).round(0)
  # Create a new Dataframe to compare the target values and the predicted values
  output = dataset.copy()
  # Adding the target values in a 'target' column
  output['Actual'] = target
  # Adding the predicted values in a 'prediction' column
  output['Prediction'] = predictions
  # Calculate the absolute errors
  errors = abs(output['Actual'] - output['Prediction'])
  # Calculate the mean absolute error (mae)
  loss = np.mean(errors)
  # Print out the mean absolute error (mae)
  print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
  # Calculate mean absolute percentage error (MAPE)
  mape = 100 * (errors / target.shape[0])
  # Calculate and display accuracy
  Error = 100 * loss/target.shape[0]
  print('Accuracy:', 100 - round(Error, 2), '%.')
  #R2
  R2=np.corrcoef(output['Actual'], output['Prediction'])[0, 1]**2
  print('R2 :', round(R2*100, 2), '%.')

  # Returning the output Dataframe'''
  return output

### 1. Using Logistic Regression

In [177]:
from sklearn.linear_model import LogisticRegression

# Instantiate model
logreg = LogisticRegression(max_iter=1000)

# Train the model on training data
logreg.fit(X_train, Y_train)

# Testing the model on the test data using the test target and displaying it
test_output_log = predict(X_test, Y_test, logreg)
test_output_log

  y = column_or_1d(y, warn=True)


Mean Absolute Error: 0.2 degrees.
Accuracy: 99.99 %.
R2 : 1.83 %.


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,Balance.1,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Actual,Prediction
6252,596,1,1,32,3,96709.07,96709.07,2,0,0,41788.37,0,0
4684,623,0,1,43,1,0.00,0.00,2,1,1,146379.30,0,0
1731,601,2,0,44,4,0.00,0.00,2,1,0,58561.31,0,0
4742,506,1,1,59,8,119152.10,119152.10,2,1,1,170679.74,0,1
4521,560,2,0,27,7,124995.98,124995.98,1,1,1,114669.79,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4862,645,2,0,55,1,133676.65,133676.65,1,0,1,17095.49,0,0
7025,569,2,0,51,3,0.00,0.00,3,1,0,75084.96,1,0
7647,768,0,1,25,0,78396.08,78396.08,1,1,1,8316.19,0,0
7161,690,0,0,36,6,110480.48,110480.48,1,0,0,81292.33,0,0


### 2. Using Random Forest

In [178]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf1 = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf1.fit(X_train, Y_train)

# Testing the model on the test data using the test target and displaying it
test_output_rf1 = predict(X_test, Y_test, rf1)
test_output_rf1


  rf1.fit(X_train, Y_train)


Mean Absolute Error: 0.14 degrees.
Accuracy: 99.99 %.
R2 : 26.66 %.


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,Balance.1,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Actual,Prediction
6252,596,1,1,32,3,96709.07,96709.07,2,0,0,41788.37,0,0.0
4684,623,0,1,43,1,0.00,0.00,2,1,1,146379.30,0,0.0
1731,601,2,0,44,4,0.00,0.00,2,1,0,58561.31,0,0.0
4742,506,1,1,59,8,119152.10,119152.10,2,1,1,170679.74,0,0.0
4521,560,2,0,27,7,124995.98,124995.98,1,1,1,114669.79,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4862,645,2,0,55,1,133676.65,133676.65,1,0,1,17095.49,0,0.0
7025,569,2,0,51,3,0.00,0.00,3,1,0,75084.96,1,1.0
7647,768,0,1,25,0,78396.08,78396.08,1,1,1,8316.19,0,0.0
7161,690,0,0,36,6,110480.48,110480.48,1,0,0,81292.33,0,0.0


### 3. Using Gradient Boosting

In [175]:
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate model
gb = GradientBoostingRegressor(n_estimators=1000, random_state=42)

# Train the model on training data
gb.fit(X_train, Y_train)

# Testing the model on the test data using the test target and displaying it
test_output_gb = predict(X_test, Y_test, gb)
test_output_gb

  y = column_or_1d(y, warn=True)


Mean Absolute Error: 0.14 degrees.
Accuracy: 99.99 %.
R2 : 27.08 %.


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,Balance.1,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Actual,Prediction
6252,596,1,1,32,3,96709.07,96709.07,2,0,0,41788.37,0,0.0
4684,623,0,1,43,1,0.00,0.00,2,1,1,146379.30,0,0.0
1731,601,2,0,44,4,0.00,0.00,2,1,0,58561.31,0,0.0
4742,506,1,1,59,8,119152.10,119152.10,2,1,1,170679.74,0,0.0
4521,560,2,0,27,7,124995.98,124995.98,1,1,1,114669.79,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4862,645,2,0,55,1,133676.65,133676.65,1,0,1,17095.49,0,1.0
7025,569,2,0,51,3,0.00,0.00,3,1,0,75084.96,1,1.0
7647,768,0,1,25,0,78396.08,78396.08,1,1,1,8316.19,0,0.0
7161,690,0,0,36,6,110480.48,110480.48,1,0,0,81292.33,0,0.0
