# CUSTOMER CHURN PREDICTION


# PROBLEM STATEMENT: Develop a model to predict customer churn for a subscription-based service or business. Use historical customer data, including features like usage behavior and customer demographics, and try algorithms like Logistic Regression, Random Forests, or Gradient Boosting to predict churn.


In [1]:
import pandas as pd

In [4]:
# Load the churn dataset from a local CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your own
# copy of Churn_Modelling.csv when running this notebook elsewhere.
file_path = 'C:/Users/Adarsh Poojary/CODSOFT/ML_TASK_2/Churn_Modelling.csv'
data = pd.read_csv(file_path)

In [5]:
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Display the whole DataFrame; the notebook renders a truncated view
# (leading and trailing rows separated by an ellipsis row).
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler

In [8]:
# Discard the columns that only identify a record (row number, customer id,
# surname) so they are not fed to the models as features.
# NOTE(review): the rendered table shows a leading 'Unnamed: 0' header; this is
# presumably just the displayed index, but if it is a real CSV column it should
# be dropped here as well - confirm against the file.
data_cleaned = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [9]:
# Integer-encode the two text columns so the estimators can consume them.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> category mapping (`classes_` / `inverse_transform`) stays available
# for both columns; a single shared encoder only remembers the column it was
# fitted on last.
# NOTE(review): integer codes give Geography a numeric order that linear models
# will treat as meaningful - consider one-hot encoding; confirm intent.
label_encoders = {}
for column in ('Geography', 'Gender'):
    label_encoder = LabelEncoder()
    data_cleaned[column] = label_encoder.fit_transform(data_cleaned[column])
    label_encoders[column] = label_encoder
# `label_encoder` is left bound to the Gender encoder, matching its previous
# final state for any cell that still refers to it.

In [10]:
# Target vector: the 'Exited' column is what the models are trained to predict.
y = data_cleaned['Exited']
# Feature matrix: every remaining column of the cleaned frame.
X = data_cleaned.drop('Exited', axis=1)

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [13]:
# Create the three candidate classifiers with a shared fixed seed so that
# repeated runs of the notebook give the same fitted models.
logreg, rf, gb = (
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
)

In [27]:
# Fit each classifier on the training split and score it on the held-out test
# split, collecting the metrics per model name.
models = [logreg, rf, gb]
model_names = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']
# Maps model name -> {'Accuracy': float, 'Classification Report': str}.
results = {}
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Fraction of test rows whose predicted label matches the true label.
    accuracy = accuracy_score(y_test, y_pred)
    # Text table of per-class precision, recall, f1-score and support.
    # NOTE: the classes are imbalanced (the recorded reports show 1607 vs 393
    # test rows), so read accuracy alongside the per-class recall.
    report = classification_report(y_test, y_pred)
    results[name] = {'Accuracy': accuracy, 'Classification Report': report}

In [32]:
# Show the accuracy and classification report collected for each model.
results

{'Logistic Regression': {'Accuracy': 0.815,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.83      0.97      0.89      1607\n           1       0.60      0.18      0.28       393\n\n    accuracy                           0.81      2000\n   macro avg       0.71      0.58      0.59      2000\nweighted avg       0.78      0.81      0.77      2000\n'},
 'Random Forest': {'Accuracy': 0.864,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.88      0.96      0.92      1607\n           1       0.75      0.47      0.57       393\n\n    accuracy                           0.86      2000\n   macro avg       0.81      0.71      0.75      2000\nweighted avg       0.85      0.86      0.85      2000\n'},
 'Gradient Boosting': {'Accuracy': 0.8655,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.88      0.96      0.92      1607\n   