### Telco Customer Churn Prediction Project

### Stage 1: Data Collection & Cleaning

In [10]:
import pandas as pd
import numpy as np

### 1. Data Collection

In [11]:
# Load the raw Telco churn export. If the CSV is absent, report it and carry on
# with a three-row placeholder frame so the remaining cells can still be
# demonstrated.
# NOTE(review): when the placeholder is used, every later cell operates on these
# three rows rather than on real data.
csv_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: '{csv_path}' not found. Please ensure the file is in the correct directory.")
    # Placeholder rows with the same column names the cleaning cells use.
    # 'TotalCharges' is supplied as strings here.
    placeholder_rows = {
        'customerID': [7590, 5575, 4190],
        'gender': ['Female', 'Male', 'Female'],
        'SeniorCitizen': [0, 0, 0],
        'Partner': ['Yes', 'No', 'No'],
        'Dependents': ['No', 'No', 'No'],
        'tenure': [1, 34, 2],
        'PhoneService': ['No', 'Yes', 'Yes'],
        'MultipleLines': ['No phone service', 'No', 'No'],
        'InternetService': ['DSL', 'DSL', 'DSL'],
        'OnlineSecurity': ['No', 'Yes', 'Yes'],
        'OnlineBackup': ['Yes', 'No', 'Yes'],
        'DeviceProtection': ['No', 'Yes', 'No'],
        'TechSupport': ['No', 'No', 'No'],
        'StreamingTV': ['No', 'No', 'No'],
        'StreamingMovies': ['No', 'No', 'No'],
        'Contract': ['Month-to-month', 'One year', 'Month-to-month'],
        'PaperlessBilling': ['Yes', 'No', 'Yes'],
        'PaymentMethod': ['Electronic check', 'Mailed check', 'Mailed check'],
        'MonthlyCharges': [29.85, 56.95, 53.85],
        'TotalCharges': ['29.85', '1889.5', '108.15'],
        'Churn': ['No', 'No', 'Yes'],
    }
    df = pd.DataFrame(placeholder_rows)
else:
    # Only reached when the read succeeded.
    n_rows, n_cols = df.shape
    print(f"Successfully loaded dataset with {n_rows} rows and {n_cols} columns.\n")

Successfully loaded dataset with 7043 rows and 21 columns.



In [12]:
# Print the schema of the freshly loaded frame (columns, non-null counts and
# dtypes) before any cleaning is applied.
section_title = "--- Initial Data Structure (df.info()) ---\n"
print(section_title)
df.info()
print("\n")  # spacer after the report

--- Initial Data Structure (df.info()) ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16

### 2. Data Cleaning: Identifying and Resolving Issues

In [13]:
# 2.1. Handling Inconsistent Data Types (TotalCharges)
print("--- 2.1. Handling 'TotalCharges' Data Type Issue ---\n")

# 'TotalCharges' is parsed to a numeric dtype here. errors='coerce' turns any
# entry that cannot be parsed as a number into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

print("After attempted conversion, checking for newly introduced missing values")
# Stored at notebook level so later cells can report on it.
missing_values_count = df['TotalCharges'].isna().sum()
print(f"Total NaN values found in 'TotalCharges': {missing_values_count}\n")

--- 2.1. Handling 'TotalCharges' Data Type Issue ---

After attempted conversion, checking for newly introduced missing values
Total NaN values found in 'TotalCharges': 11



In [14]:
# Analysis of missing 'TotalCharges':
# The rows that became NaN during the numeric conversion are expected to be
# customers with a 'tenure' of 0 (new customers) who have not yet accumulated
# any total charges. That expectation is checked below rather than assumed.


# 2.2. Handling Missing Values
print("\n--- 2.2. Handling Missing Values in 'TotalCharges' ---\n")

# Recompute the count from the current frame instead of reusing the value left
# behind by an earlier cell: re-running this cell after the rows are gone then
# reports "no rows dropped" rather than repeating a stale "Dropping N rows".
missing_mask = df['TotalCharges'].isna()
missing_values_count = missing_mask.sum()

if missing_values_count > 0:
    # The drop is justified by these rows being brand-new customers; flag it
    # if the data does not actually bear that out.
    if not (df.loc[missing_mask, 'tenure'] == 0).all():
        print("Warning: some rows with missing 'TotalCharges' have tenure > 0; review before relying on this drop.")

    # Decision: the affected rows are a very small share of the data, so they
    # are dropped rather than imputed (imputing 0 could bias the feature).
    print(f"Dropping {missing_values_count} rows where 'TotalCharges' is NaN (these are new customers with tenure 0).")
    df.dropna(subset=['TotalCharges'], inplace=True)
    print(f"Dataset size after dropping NaN rows: {df.shape[0]} rows.")
else:
    print("No missing values detected in 'TotalCharges' after conversion. No rows dropped.")

# Verify that TotalCharges is now numeric
print("\nVerifying 'TotalCharges' dtype after cleanup:\n")
df.info()


--- 2.2. Handling Missing Values in 'TotalCharges' ---

Dropping 11 rows where 'TotalCharges' is NaN (these are new customers with tenure 0).
Dataset size after dropping NaN rows: 7032 rows.

Verifying 'TotalCharges' dtype after cleanup:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   obj

In [15]:
print("\n--- 2.3. Handling Irrelevant Columns (customerID) ---\n")

# 'customerID' is an identifier column, not a feature, so it is removed before
# modelling. errors='ignore' keeps this cell safe to re-run: without it, a
# second run raises KeyError because the column is already gone.
df.drop(columns=['customerID'], errors='ignore', inplace=True)
print("Dropped 'customerID' column. Columns remaining:", df.shape[1])


--- 2.3. Handling Irrelevant Columns (customerID) ---

Dropped 'customerID' column. Columns remaining: 20


In [16]:
print("\n--- 2.4. Correcting Target Variable Encoding ('Churn') ---\n")

# The target variable 'Churn' should be numeric (0 or 1) for modeling.
# An explicit map + astype is used instead of Series.replace: replace relies on
# implicit downcasting of the object column to reach an integer dtype, and that
# downcasting is deprecated in recent pandas releases.
churn_codes = {'Yes': 1, 'No': 0}

# Only encode while the column still holds labels, so re-running this cell
# leaves an already-encoded column untouched.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    encoded_churn = df['Churn'].map(churn_codes)
    # map() yields NaN for anything outside the mapping; stop instead of
    # silently passing an incomplete target on to modelling.
    if encoded_churn.isna().any():
        raise ValueError("Unexpected value(s) in 'Churn'; expected only 'Yes' or 'No'.")
    df['Churn'] = encoded_churn.astype('int64')

print("Target variable 'Churn' converted from {'Yes', 'No'} to {1, 0}.\n")
print(f"Churn value counts:\n{df['Churn'].value_counts()}")


--- 2.4. Correcting Target Variable Encoding ('Churn') ---

Target variable 'Churn' converted from {'Yes', 'No'} to {1, 0}.

Churn value counts:
0    5163
1    1869
Name: Churn, dtype: int64


In [17]:
# 2.5. Final Data Structure
print("\n--- Final Data Structure Check (Post-Cleaning) ---\n")

# Plain-text preview of the leading rows.
print("First 5 rows of the cleaned DataFrame:\n")
first_rows = df.head()
print(first_rows)

# Column dtypes and non-null counts after the cleaning steps above.
print("\nFinal column types:\n")
df.info()


--- Final Data Structure Check (Post-Cleaning) ---

First 5 rows of the cleaned DataFrame:

   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               N

In [18]:
# Save the cleaned DataFrame
# index=False leaves the row index out of the written file.
output_path = 'cleaned_telco_churn_data.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned data saved to '{output_path}'")



Cleaned data saved to 'cleaned_telco_churn_data.csv'
