In [6]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming the file is in the current working directory or provide the full path
file_path = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'

try:
    df = pd.read_csv(file_path)

    # Data preprocessing
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df.fillna(0, inplace=True)

    # Convert categorical features to numerical using Label Encoding
    categorical_cols = df.select_dtypes(include=['object']).columns
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # Store the encoder for later use

    # Define features (X) and target (y)
    X = df.drop('Churn', axis=1)
    y = df['Churn']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Model training (Random Forest Classifier)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Model prediction
    y_pred = model.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")


              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.66      0.50      0.57       373

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Accuracy: 0.7991483321504613


In [17]:
# data aggregation (call duration, complaints, recharge frequency)

import pandas as pd

# Assuming 'df' is your DataFrame from the previous code
# and it contains columns like 'TotalCharges', 'MonthlyCharges', etc.

# Aggregate data by customer ID (or some identifier)
# Replace 'customerID' with your actual customer ID column name if different
if 'customerID' in df.columns:
    aggregated_data = df.groupby('customerID').agg({
        'TotalCharges': 'sum',  # Total call duration or charges
        'MonthlyCharges': 'mean', # Average monthly charges
        # Add other relevant columns for aggregation
        # For example, to count complaints:
        # 'complaints': 'sum'  # Assuming 'complaints' column represents the number of complaints
        # 'tenure' : 'count' # Recharge frequency (assuming each tenure represents a recharge)

    })
    print(aggregated_data.head())
else:
    print("Error: 'customerID' column not found in the DataFrame.")


            TotalCharges  MonthlyCharges
customerID                              
0                 593.30            65.6
1                 542.40            59.9
2                 280.85            73.9
3                1237.85            98.0
4                 267.40            83.9


In [10]:

!pip install eli5 # install the eli5 package

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import eli5 # Now this import should work
from eli5.sklearn import PermutationImportance

# ... (Your existing code for data loading and preprocessing) ...

# Model training (Random Forest Classifier)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model prediction
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


# Use Permutation Importance for feature importance
perm = PermutationImportance(model, random_state=42).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = list(df.drop('Churn', axis=1).columns))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.66      0.50      0.57       373

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Accuracy: 0.7991483321504613


Weight,Feature
0.0246  ± 0.0097,Contract
0.0143  ± 0.0071,MonthlyCharges
0.0118  ± 0.0119,tenure
0.0070  ± 0.0067,PaperlessBilling
0.0050  ± 0.0058,gender
0.0035  ± 0.0073,TotalCharges
0.0027  ± 0.0046,SeniorCitizen
0.0023  ± 0.0058,TechSupport
0.0017  ± 0.0041,DeviceProtection
0.0017  ± 0.0028,Dependents


In [12]:
#  Create customer segments: At Risk, Loyal, Dormant

import pandas as pd
# Assuming 'df' is your DataFrame and 'y_pred' contains your model predictions

# Assuming y_pred is a NumPy array or a Pandas Series
# Create a new DataFrame with the predictions, indexed to align with the original DataFrame
predictions_df = pd.DataFrame({'Predicted_Churn': y_pred}, index=y_test.index)

# Join the predictions back into the original DataFrame
# to keep all original rows and align predictions
df = df.merge(predictions_df, how='left', left_index=True, right_index=True)

# Calculate spending
# Assuming 'TotalCharges' is the relevant column in the dataframe
df['Spending'] = df['TotalCharges']


def categorize_customer(row):
    if row['Predicted_Churn'] == 1 and row['Spending'] < df['Spending'].quantile(0.25) :
        return 'At Risk'
    elif row['Predicted_Churn'] == 0 and row['Spending'] >= df['Spending'].quantile(0.75):
        return 'Loyal'
    elif row['Predicted_Churn'] == 1 and row['Spending'] >= df['Spending'].quantile(0.75):
      return 'At Risk' #High spending, high churn risk
    elif row['Predicted_Churn'] == 0 and row['Spending'] < df['Spending'].quantile(0.25):
      return 'Dormant' #Low spending, low churn risk
    elif row['Predicted_Churn'] == 0:
        return 'Loyal'
    else:
        return 'At Risk'

df['Customer_Segment'] = df.apply(categorize_customer, axis=1)

print(df[['customerID', 'Predicted_Churn', 'Spending', 'Customer_Segment']].head(20))

    customerID  Predicted_Churn  Spending Customer_Segment
0         5375              NaN     29.85          At Risk
1         3962              NaN   1889.50          At Risk
2         2564              NaN    108.15          At Risk
3         5535              NaN   1840.75          At Risk
4         6511              NaN    151.65          At Risk
5         6551              NaN    820.50          At Risk
6         1002              NaN   1949.40          At Risk
7         4770              NaN    301.90          At Risk
8         5604              1.0   3046.05          At Risk
9         4534              NaN   3487.95          At Risk
10        6871              NaN    587.45          At Risk
11        5288              NaN    326.80          At Risk
12        5751              NaN   5681.10          At Risk
13         174              NaN   5036.30          At Risk
14        3615              0.0   2686.05            Loyal
15        2556              0.0   7895.15            Loy