In [1]:
!pip install pandas numpy -q

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Step 2: Load Dataset
#
# Earlier approach, kept for reference: read the CSV straight from a URL.
# url = "https://raw.githubusercontent.com/blastchar/telco-customer-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
# df = pd.read_csv(url)

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file to load from the Kaggle dataset
file_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load the latest version of the file through the pandas adapter.
# NOTE(review): recent kagglehub releases appear to expose this call as
# `dataset_load` and deprecate `load_dataset` — confirm against the installed version.
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "blastchar/telco-customer-churn",
  file_path,
  # Optional extras such as sql_query or pandas_kwargs can be passed here.
  # See the documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Print the caption on its own line so the preview's column headers stay
# aligned with the values beneath them.
print("First 5 records:", df.head(), sep="\n")

First 5 records:    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies    

In [6]:
def clean_data(df):
    """Return a cleaned copy of the raw frame; the input is not modified.

    Steps:
      * ``TotalCharges`` is coerced to numeric; entries that cannot be parsed
        become NaN and are then replaced with the median of the column.
      * ``customerID`` is dropped when present (absence is not an error).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a ``TotalCharges`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with numeric ``TotalCharges`` and no ``customerID`` column.
    """
    # Parse once, then impute gaps with the median of the parsed values.
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    imputed = total_charges.fillna(total_charges.median())

    # assign() builds a new frame, so the caller's data stays untouched.
    cleaned = df.assign(TotalCharges=imputed)
    return cleaned.drop(columns=['customerID'], errors='ignore')


In [7]:
def engineer_features(df):
    """Return a copy of ``df`` with derived features added.

    Adds:
      * ``tenure_group`` - ``tenure`` bucketed into 12-month bands
        ('0-12', '13-24', ..., '60+').
      * ``avg_charges_per_month`` - ``TotalCharges / tenure``; rows with
        ``tenure == 0`` get 0 instead of a division by zero.
    ``Contract`` and ``PaymentMethod`` are one-hot encoded with the first
    level of each dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``tenure``, ``TotalCharges``, ``Contract`` and
        ``PaymentMethod`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns; the input is not modified.
    """
    df = df.copy()

    # Create tenure_group: every 12 months = 1 year
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = ['0-12','13-24','25-36','37-48','49-60','60+']
    # include_lowest=True closes the first interval on the left, so
    # tenure == 0 lands in '0-12' rather than becoming NaN.
    df['tenure_group'] = pd.cut(
        df['tenure'], bins=bins, labels=labels, right=True, include_lowest=True
    )

    # Compute average monthly charges; zero tenure is masked to NaN first so
    # the division cannot produce inf, then those rows are set to 0.
    df['avg_charges_per_month'] = df['TotalCharges'] / df['tenure'].replace(0, np.nan)
    df['avg_charges_per_month'] = df['avg_charges_per_month'].fillna(0)

    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=['Contract', 'PaymentMethod'], drop_first=True)

    return df


In [8]:
def validate_data(df):
    """Run sanity checks on the engineered frame and collect any problems.

    Checks performed:
      * ``avg_charges_per_month`` contains no infinite values.
      * ``tenure_group`` has no null entries.
      * every expected ``tenure_group`` label occurs at least once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``avg_charges_per_month`` and ``tenure_group`` columns.

    Returns
    -------
    list of str
        One message per failed check; an empty list means all checks passed.
    """
    errors = []

    # Check for infinite values
    if np.isinf(df['avg_charges_per_month']).any():
        errors.append("Infinite values found in avg_charges_per_month.")

    # Null groups must be checked explicitly: casting to str would turn NaN
    # into the string 'nan' and let such rows slip through unnoticed.
    null_count = int(df['tenure_group'].isna().sum())
    if null_count:
        errors.append(f"Null tenure_group values found in {null_count} row(s).")

    # Check all tenure_group labels are represented
    expected_groups = {'0-12','13-24','25-36','37-48','49-60','60+'}
    actual_groups = set(df['tenure_group'].dropna().astype(str).unique())
    missing = expected_groups - actual_groups
    if missing:
        errors.append(f"Missing tenure_group labels: {missing}")

    return errors


In [9]:
# Run the pipeline stage by stage: clean -> engineer features -> validate.
# Each stage returns a new frame, so the raw `df` stays available.
df_clean = clean_data(df)
df_feat = engineer_features(df_clean)
validation_errors = validate_data(df_feat)

# Show the collected messages, or the literal "None" when the list is empty.
print("Validation Errors:", validation_errors if validation_errors else "None")


Validation Errors: None


In [10]:
# Persist the engineered frame next to the notebook, without the index column.
df_feat.to_csv(path_or_buf="telco_churn_prepared.csv", index=False)
