# Libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib


# Load and explore data

In [7]:
# Load dataset 
data = pd.read_csv('Telco-Customer-Churn.csv')

# Quick look at data
print(data.head())
print(data.info())


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

# Basic preprocessing

In [10]:
# Drop non-predictive column
data.drop('customerID', axis=1, inplace=True)

# Encode target column
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Identify numerical and categorical features
numerical_features = data.select_dtypes(include=['int64', 'float64']).drop('Churn', axis=1).columns
categorical_features = data.select_dtypes(include=['object']).columns

print("Numerical features:", list(numerical_features))
print("Categorical features:", list(categorical_features))


Numerical features: ['SeniorCitizen', 'tenure', 'MonthlyCharges']
Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges']


# Split data into train and test sets

In [13]:
X = data.drop('Churn', axis=1)
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (5634, 19)
Test set size: (1409, 19)


# Build preprocessing pipeline

In [18]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])


# Build full pipeline with Logistic Regression

In [21]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])


# Hyperparameter tuning with GridSearchCV

In [24]:
param_grid = {
    'classifier__C': [0.01, 0.1, 1.0, 10.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

print("Training Logistic Regression with GridSearchCV...")
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)


Training Logistic Regression with GridSearchCV...
Best parameters found: {'classifier__C': 1.0}


# Evaluate on test set

In [27]:
y_pred = grid_search.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy on test set:", accuracy_score(y_test, y_pred))



Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      1036
           1       0.70      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409

Accuracy on test set: 0.8239886444286728


# Export the trained pipeline

In [30]:
joblib.dump(grid_search.best_estimator_, 'logistic_churn_pipeline.joblib')
print("\n Pipeline exported successfully as logistic_churn_pipeline.joblib")



 Pipeline exported successfully as logistic_churn_pipeline.joblib
