In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
from google.colab import files
uploaded = files.upload()

Saving Churn_Modelling.csv to Churn_Modelling (1).csv


In [None]:
# Load customer data
df = pd.read_csv("Churn_Modelling.csv")

In [None]:
df.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [None]:
# Drop unnecessary columns. Assuming 'RowNumber' and 'CustomerId' are not relevant for analysis
df = df.drop(['RowNumber', 'CustomerId'], axis=1, errors='ignore')

In [None]:
# Check for missing values
df.isnull().sum()

Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
# If there are missing values, you could fill them with the mean (for numerical columns) or mode (for categorical columns)
df['CreditScore'].fillna(df['CreditScore'].mean(), inplace=True)

In [None]:
# Check for and handle duplicates if necessary
df.drop_duplicates(inplace=True)

In [None]:
# Optionally, convert categorical features to numerical using one-hot encoding if needed for your analysis
df = pd.get_dummies(df, columns=['Geography', 'Gender'])

In [None]:
# Create new features (example)
df['BalanceSalaryRatio'] = df['Balance'] / df['EstimatedSalary']
df['TenureByAge'] = df['Tenure'] / df['Age']

In [None]:
# Assuming 'Surname' is not relevant, drop it
df = df.drop(['Surname'], axis=1, errors='ignore')

In [None]:
# Encode categorical features (assuming 'Exited' is the target variable)
label_encoder = LabelEncoder()
df['Exited'] = label_encoder.fit_transform(df['Exited'])

In [None]:
# Define churn as a binary variable (churned = 1, active = 0)
df['Churn'] = df['Exited']  # Assuming 'Exited' indicates churn (1) or not (0)

In [None]:
# Split the data into training and testing sets
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Make predictions
logistic_predictions = logistic_model.predict(X_test)
random_forest_predictions = random_forest_model.predict(X_test)
gradient_boosting_predictions = gradient_boosting_model.predict(X_test)

In [None]:
# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Initialize models
logistic_model = LogisticRegression()
random_forest_model = RandomForestClassifier()
gradient_boosting_model = GradientBoostingClassifier()

In [None]:
# Train models
logistic_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)

In [None]:
# Evaluate models
def evaluate_model(y_true, y_pred):
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  roc_auc = roc_auc_score(y_true, y_pred)
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)
  print("ROC AUC Score:", roc_auc)

In [None]:
print("Logistic Regression:")
evaluate_model(y_test, logistic_predictions)

print("\nRandom Forest:")
evaluate_model(y_test, random_forest_predictions)

print("\nGradient Boosting:")
evaluate_model(y_test, gradient_boosting_predictions)

Logistic Regression:
Accuracy: 0.8015
Precision: 0.47058823529411764
Recall: 0.08142493638676845
ROC AUC Score: 0.529511472549327

Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
ROC AUC Score: 1.0

Gradient Boosting:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
ROC AUC Score: 1.0


In [None]:
X = df.drop('Churn', axis=1) # Use 'df' instead of 'data'
y = df['Churn'] # Use 'df' instead of 'data'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train models (already done in preceding code)
logistic_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)

In [None]:
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", GridSearchCV(RandomForestClassifier(), param_grid={"n_estimators": [100, 200], "max_depth": [5, 10]})),
    ("Gradient Boosting", GridSearchCV(GradientBoostingClassifier(), param_grid={"learning_rate": [0.1, 0.05], "n_estimators": [100, 200]})),
]

best_model = None
best_score = 0  # Replace with your desired metric (e.g., accuracy, AUC-ROC)

for name, model in models:
    # Fit all models, regardless of whether they are wrapped in GridSearchCV
    model.fit(X_train, y_train)

    if isinstance(model, GridSearchCV):
        model = model.best_estimator_  # Use the best model from GridSearch

    # Pass y_test and model predictions to evaluate_model
    y_pred = model.predict(X_test)
    evaluate_model(y_test, y_pred)  # Changed this line

    # Update best model based on chosen metric
    current_score = accuracy_score(y_test, y_pred)  # Replace with your metric
    if current_score > best_score:
        best_model = model
        best_score = current_score

# Print the chosen model
print("\nBest Performing Model:")
print(f"\tName: {best_model.__class__.__name__}")


Accuracy: 0.8015
Precision: 0.47058823529411764
Recall: 0.08142493638676845
ROC AUC Score: 0.529511472549327
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
ROC AUC Score: 1.0
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
ROC AUC Score: 1.0

Best Performing Model:
	Name: RandomForestClassifier
