In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
# Read the churn dataset and discard the customerID column in one step
df = pd.read_csv('Telco-Customer-Churn.csv').drop(columns=['customerID'])

In [4]:
# Clean the loaded frame: encode two-valued text columns as 0/1, coerce the
# numeric columns, drop incomplete rows, then split off the target.

# Two-valued text columns and the integer code assigned to each label
binary_columns = {
    'gender': {'Male': 1, 'Female': 0},
    'Partner': {'Yes': 1, 'No': 0},
    'Dependents': {'Yes': 1, 'No': 0},
    'PhoneService': {'Yes': 1, 'No': 0},
    'PaperlessBilling': {'Yes': 1, 'No': 0},
    'Churn': {'Yes': 1, 'No': 0}
}

# Columns that must hold real numbers
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Encode with Series.map instead of DataFrame.replace: replace() on object
# columns depends on an implicit object -> int downcast that pandas has
# deprecated (it emits a FutureWarning). Labels absent from a mapping become
# NaN and their rows are removed by the dropna() below.
# Columns that are already numeric are skipped so re-running this cell is a no-op.
for column, mapping in binary_columns.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(mapping)

# Convert numeric columns to proper data types; any non-numeric text
# (such as a blank string) becomes NaN instead of raising
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Treat blank strings in the remaining columns as missing, then drop every
# row that has at least one missing value
df = df.replace(' ', pd.NA).dropna()

# With incomplete rows gone the encoded columns hold only 0/1; store them as
# integers regardless of whether map() had to introduce NaN along the way
df = df.astype({column: 'int64' for column in binary_columns})

# Handle target column
y = df['Churn']
X = df.drop(columns=['Churn'])

# Feature groups handled by dedicated transformers
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# One transformer per feature group
numeric_transformer = StandardScaler()
# drop='first' leaves out one indicator per feature to avoid multicollinearity
categorical_transformer = OneHotEncoder(drop='first')

# Route each feature group to its transformer; columns not named in either
# group (the already-encoded binary columns) are passed through unchanged
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# Preprocessing followed by a decision tree with a fixed seed
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])


  df.replace(binary_columns, inplace=True)


In [5]:
# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing + classifier pipeline on the training portion
pipeline.fit(X_train, y_train)

# Score the held-out portion and report the fraction of correct predictions
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.7242359630419332


In [7]:
# Pick one held-out row to illustrate the model's input and output
sample_index = 2  # any position within X_test's range can be used here
row_slice = slice(sample_index, sample_index + 1)
sample_input = X_test.iloc[row_slice]  # slicing keeps a one-row DataFrame

# Model prediction and ground-truth label for the chosen row
sample_prediction = pipeline.predict(sample_input)
actual_outcome = y_test.iloc[sample_index]

# Flatten the one-row frame to a plain dict so it prints readably
sample_input_dict = sample_input.to_dict(orient='records')[0]

# Report the input, the prediction, and the actual outcome for comparison
print("Sample input:")
print(sample_input_dict)
print("\nPredicted Churn (1 = Yes, 0 = No):", sample_prediction[0])
print("\nActual Churn:", actual_outcome)

Sample input:
{'gender': 1, 'SeniorCitizen': 0, 'Partner': 1, 'Dependents': 0, 'tenure': 13, 'PhoneService': 1, 'MultipleLines': 'Yes', 'InternetService': 'Fiber optic', 'OnlineSecurity': 'No', 'OnlineBackup': 'No', 'DeviceProtection': 'Yes', 'TechSupport': 'No', 'StreamingTV': 'Yes', 'StreamingMovies': 'Yes', 'Contract': 'Month-to-month', 'PaperlessBilling': 1, 'PaymentMethod': 'Credit card (automatic)', 'MonthlyCharges': 102.25, 'TotalCharges': 1359.0}

Predicted Churn (1 = Yes, 0 = No): 0

Actual Churn: 1
