In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport


dataset_path = "/Users/bhavanahiremath/Downloads/telco-customer-churn.csv"
dataset = pd.read_csv(dataset_path)
print(f"Dataset loaded successfully with {dataset.shape[0]} rows and {dataset.shape[1]} columns.")

# Replace '?' with NaN and check for missing values
dataset.replace('?', np.nan, inplace=True)
missing_counts = dataset.isnull().sum()
print("Missing values per column:")
print(missing_counts[missing_counts > 0])

numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
for column in numeric_columns:
    column_data = pd.to_numeric(dataset[column], errors='coerce').dropna()
    z_scores = stats.zscore(column_data)
    outliers = column_data[(z_scores > 2.5) | (z_scores < -2.5)]
    print(f"{column}: {len(outliers)} outliers detected")

# Convert binary columns to numeric
binary_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
dataset[binary_columns] = dataset[binary_columns].replace({'Yes': 1, 'No': 0, 'Male': 0, 'Female': 1})

# Convert multi-class categorical columns to numeric
categorical_columns = ['InternetService', 'Contract', 'PaymentMethod', 'MultipleLines',
                       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies']
dataset[categorical_columns] = dataset[categorical_columns].apply(lambda col: col.astype('category').cat.codes)

# Define features and target variable
X = dataset.drop(columns=['customerID', 'Churn'])
y = dataset['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set: {X_train.shape[0]} samples; Testing set: {X_test.shape[0]} samples.")

# Generate a profile report
profile = ProfileReport(dataset, title="Telco Customer Churn - Pandas Profiling Report", explorative=True)
profile.to_file("dataset_profile_report.html")
print("Profile report generated and saved as 'dataset_profile_report.html'.")

Dataset loaded successfully with 7043 rows and 21 columns.
Missing values per column:
Series([], dtype: int64)
tenure: 0 outliers detected
MonthlyCharges: 0 outliers detected
TotalCharges: 90 outliers detected
Training set: 5634 samples; Testing set: 1409 samples.


  dataset[binary_columns] = dataset[binary_columns].replace({'Yes': 1, 'No': 0, 'Male': 0, 'Female': 1})


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profile report generated and saved as 'dataset_profile_report.html'.
