<a href="https://colab.research.google.com/github/abdoubsa/Churn-detection-model/blob/main/churn_detection_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the labelled and unlabelled splits from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Keep a handle on the raw label column for the inspection cell below.
target = train.loc[:, 'Churn']

# Peek at the first rows to sanity-check the parse.
print(train.head())


   id  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0   0    Male              0     Yes         No      51          Yes   
1   1  Female              0      No         No      11          Yes   
2   2  Female              0      No         No      16          Yes   
3   3  Female              0      No         No      34          Yes   
4   4  Female              1     Yes         No      43          Yes   

  MultipleLines InternetService       OnlineSecurity  ...  \
0           Yes             DSL                   No  ...   
1            No             DSL                  Yes  ...   
2            No              No  No internet service  ...   
3           Yes             DSL                   No  ...   
4           Yes     Fiber optic                   No  ...   

      DeviceProtection          TechSupport          StreamingTV  \
0                   No                  Yes                  Yes   
1                   No                  Yes                   No

In [None]:
# Dimensions of both splits.
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Class balance of the raw label column.
print(target.value_counts())

print("\nTrain info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
train.info()

# Ten columns with the most missing values.
print("\nMissing values in train:")
print(train.isnull().sum().sort_values(ascending=False).head(10))

Train shape: (5634, 21)
Test shape: (1409, 20)
Churn
No     4139
Yes    1495
Name: count, dtype: int64

Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                5634 non-null   int64  
 1   gender            5634 non-null   object 
 2   SeniorCitizen     5634 non-null   int64  
 3   Partner           5634 non-null   object 
 4   Dependents        5634 non-null   object 
 5   tenure            5634 non-null   int64  
 6   PhoneService      5634 non-null   object 
 7   MultipleLines     5634 non-null   object 
 8   InternetService   5634 non-null   object 
 9   OnlineSecurity    5634 non-null   object 
 10  OnlineBackup      5634 non-null   object 
 11  DeviceProtection  5634 non-null   object 
 12  TechSupport       5634 non-null   object 
 13  StreamingTV       5634 non-null   object 
 14  StreamingMovies   56

In [None]:
# Clean data
# Work on copies so the raw frames stay untouched and this cell can be re-run.
train_clean = train.copy()
test_clean = test.copy()

# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN.
train_clean['TotalCharges'] = pd.to_numeric(train_clean['TotalCharges'], errors='coerce')
test_clean['TotalCharges'] = pd.to_numeric(test_clean['TotalCharges'], errors='coerce')

# Report how many training rows have no usable TotalCharges, then drop them.
print(train_clean['TotalCharges'].isna().sum())
train_clean = train_clean.dropna(subset=['TotalCharges'])

# Test rows are imputed with the column mean instead of being dropped.
# The result is assigned back: the chained form
# `df[col].fillna(value, inplace=True)` operates on a temporary object and
# stops having any effect in pandas 3.0.
test_clean['TotalCharges'] = test_clean['TotalCharges'].fillna(test_clean['TotalCharges'].mean())

# Feature engineering
# Bin edges and labels for TenureGroup. With right-closed bins (and the lowest
# edge included) the intervals are [0, 12], (12, 24], (24, 48], (48, 60],
# (60, 72], which is what the labels say. Left-closed bins would instead put
# tenure == 72 outside every interval.
TENURE_BINS = [0, 12, 24, 48, 60, 72]
TENURE_LABELS = ['0-12', '13-24', '25-48', '49-60', '61-72']


def _add_engineered_features(frame):
    """Return a copy of ``frame`` with AvgMonthlySpend and TenureGroup added.

    AvgMonthlySpend is TotalCharges / tenure; a zero tenure yields inf (or NaN
    for 0/0), and both are replaced by 0. TenureGroup buckets tenure with
    TENURE_BINS / TENURE_LABELS; values outside the bins are labelled 'Other'.
    """
    avg_monthly_spend = (
        (frame['TotalCharges'] / frame['tenure'])
        .replace([float('inf'), -float('inf')], 0)
        .fillna(0)
    )
    tenure_group = pd.cut(
        frame['tenure'],
        bins=TENURE_BINS,
        labels=TENURE_LABELS,
        right=True,
        include_lowest=True,
    )
    # 'Other' has to be registered as a category before it can be used as a fill value.
    tenure_group = tenure_group.cat.add_categories('Other').fillna('Other')
    return frame.assign(AvgMonthlySpend=avg_monthly_spend, TenureGroup=tenure_group)


# Apply the identical transformation to both splits.
train_clean = _add_engineered_features(train_clean)
test_clean = _add_engineered_features(test_clean)

# Encode the label as 1 (Yes) / 0 (No) and drop rows whose label did not map.
train_clean['Churn'] = train_clean['Churn'].map({'Yes': 1, 'No': 0})
train_clean = train_clean.dropna(subset=['Churn'])

print(train_clean['Churn'].value_counts())

8
Churn
0    4131
1    1495
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_clean['TotalCharges'].fillna(test_clean['TotalCharges'].mean(), inplace=True)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Baseline model: one-hot encoding + random forest on every cleaned feature.

# Build the design matrices from the cleaned frames inside this cell so it runs
# on a fresh kernel (it previously relied on X_train / X_test / y_train already
# existing and failed with a NameError). The id column is dropped because it
# is only a row identifier; Churn is the label.
X_train = train_clean.drop(columns=['id', 'Churn'])
y_train = train_clean['Churn']
X_test = test_clean.drop(columns=['id'])

# Identify categorical columns. 'category' is included so that the engineered
# TenureGroup column is one-hot encoded too; otherwise it would reach the
# forest as raw strings through remainder='passthrough'.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Identify numerical columns (informational; they are passed through untouched).
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encode categorical features; categories unseen at fit time are
# encoded as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # Keep other columns as-is
)

# Pipeline: preprocessing + model, so the encoder is fitted on training rows only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hold out 20% of the training data for local validation.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Train the model
pipeline.fit(X_train_split, y_train_split)

# Evaluate on the validation split
y_pred = pipeline.predict(X_val_split)
print(classification_report(y_val_split, y_pred))

# Predict on the test set
y_test_pred = pipeline.predict(X_test)

# Build the submission frame, converting predictions back to the Yes/No labels.
submission = test_clean[['id']].copy()
submission['Churn'] = y_test_pred
submission['Churn'] = submission['Churn'].map({1: 'Yes', 0: 'No'})

# Save to CSV
submission.to_csv("submission.csv", index=False)


NameError: name 'X_train' is not defined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier # Added VotingClassifier
from sklearn.linear_model import LogisticRegression # Added LogisticRegression


# Soft-voting ensemble (XGBoost + random forest + logistic regression) trained
# on a hand-picked feature subset. Variable names and messages say "stack",
# but the model is a VotingClassifier, not a stacking model.

# Feature subset, including the engineered AvgMonthlySpend and TenureGroup.
feature_cols = ["tenure", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
                "Contract", "MonthlyCharges", "TotalCharges", "StreamingTV", "StreamingMovies", "AvgMonthlySpend", "TenureGroup"]

# .copy() so the column edits below never write into train_clean / test_clean.
train_clean_sub = train_clean[feature_cols + ["Churn"]].copy()
test_clean_sub = test_clean[feature_cols].copy()

# Collapse "No internet service" / "No phone service" into plain "No".
# NOTE(review): 'StreamingTV' is in the feature subset but not in this list, so
# it keeps its "No internet service" level — confirm that this is intentional.
cols_to_fix = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'StreamingMovies', 'MultipleLines']
no_service_map = {'No internet service': 'No', 'No phone service': 'No'}

# Apply the same recoding to both splits in one place, before any fitting, so
# train and test preprocessing cannot drift apart.
for frame in (train_clean_sub, test_clean_sub):
    for col in cols_to_fix:
        frame[col] = frame[col].replace(no_service_map)

# Defensive: make sure the test TotalCharges column is numeric and complete
# even if it arrives here unparsed or with gaps.
test_clean_sub['TotalCharges'] = pd.to_numeric(test_clean_sub['TotalCharges'], errors='coerce')
test_clean_sub['TotalCharges'] = test_clean_sub['TotalCharges'].fillna(test_clean_sub['TotalCharges'].mean())

# Split features / label, then hold out a stratified 20% for validation.
X = train_clean_sub.drop("Churn", axis=1)
y = train_clean_sub["Churn"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=43, stratify=y)

# Categorical ('object' and 'category' dtypes) and numerical columns.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessor: one-hot encode the categorical columns, pass the rest through.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Base models.
# scale_pos_weight is the negative/positive ratio of the training labels.
# `use_label_encoder` is not passed: it is deprecated in XGBoost and ignored
# (with a warning) by current releases.
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric='logloss',
    random_state=42
)

rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    class_weight='balanced',
    random_state=42
)

lr = LogisticRegression(
    max_iter=1000,
    solver='liblinear'
)

# Combine them into a soft voting classifier (averages predicted probabilities).
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb),
        ('rf', rf),
        ('lr', lr)
    ],
    voting='soft',
    n_jobs=-1
)

# Final pipeline: preprocessor + voting ensemble.
pipeline_stack = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', voting_clf)
])

# Train the ensemble.
pipeline_stack.fit(X_train, y_train)

# Evaluate on the held-out validation split.
y_pred_stack = pipeline_stack.predict(X_val)
print("🧠 Accuracy (Stacked):", accuracy_score(y_val, y_pred_stack))
print(classification_report(y_val, y_pred_stack))

# Predict on the (already preprocessed) test set.
X_test = test_clean_sub
y_test_pred_stack = pipeline_stack.predict(X_test)

# Build and save the submission, mapping 1/0 back to Yes/No.
submission_stack = pd.DataFrame({
    'id': test_clean['id'],
    'Churn': ['Yes' if val == 1 else 'No' for val in y_test_pred_stack]
})

submission_stack.to_csv("submission_stacked.csv", index=False)
print("✅ Stacked submission saved as 'submission_stacked.csv'")

🧠 Accuracy (Stacked): 0.7868561278863233
              precision    recall  f1-score   support

           0       0.88      0.82      0.85       827
           1       0.58      0.70      0.64       299

    accuracy                           0.79      1126
   macro avg       0.73      0.76      0.74      1126
weighted avg       0.80      0.79      0.79      1126

✅ Stacked submission saved as 'submission_stacked.csv'


In [None]:
# Display the `submission` frame (the one written to submission.csv by the
# random-forest cell); pandas abbreviates the printout to head and tail rows.
print(str(submission))

        id Churn
0        0    No
1        1    No
2        2    No
3        3   Yes
4        4    No
...    ...   ...
1404  1404    No
1405  1405    No
1406  1406   Yes
1407  1407    No
1408  1408   Yes

[1409 rows x 2 columns]
