In [68]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hvplot.pandas  # Import hvplot extension for pandas
import matplotlib.pyplot as plt

df = pd.read_csv('C:\\Users\\benny\\Github\\Attrition-forecasting\\data\\customer_churn.csv')

# create new column for tenure in years and round to no decimal places
df['tenure_years'] = round(df['tenure'] / 12, 0)

df.head()

Unnamed: 0,customer_ID,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,tech_support,streaming_TV,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn,tenure_years
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0.0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,3.0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0.0
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,4.0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0.0


In [69]:
# convert these columns to 1 if yes and 0 if anything else:
# 'phone_service', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_TV', 'streaming_movies
df['phone_service'] = df['phone_service'].apply(lambda x: 1 if x == 'Yes' else 0)
df['internet_service'] = df['internet_service'].apply(lambda x: 1 if x == 'Fiber optic' else 0)
df['online_security'] = df['online_security'].apply(lambda x: 1 if x == 'Yes' else 0)
df['online_backup'] = df['online_backup'].apply(lambda x: 1 if x == 'Yes' else 0)
df['device_protection'] = df['device_protection'].apply(lambda x: 1 if x == 'Yes' else 0)
df['tech_support'] = df['tech_support'].apply(lambda x: 1 if x == 'Yes' else 0)
df['streaming_TV'] = df['streaming_TV'].apply(lambda x: 1 if x == 'Yes' else 0)
df['streaming_movies'] = df['streaming_movies'].apply(lambda x: 1 if x == 'Yes' else 0)

In [70]:
# create a column called service_count that counts the number of yes's in the following columns:
# 'phone_service', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_TV', 'streaming_movies
df['service_count'] = df[['phone_service', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_TV', 'streaming_movies']].sum(axis=1)



In [71]:
df.head()

Unnamed: 0,customer_ID,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,streaming_TV,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn,tenure_years,service_count
0,7590-VHVEG,Female,0,Yes,No,1,0,No phone service,0,0,...,0,0,Month-to-month,Yes,Electronic check,29.85,29.85,No,0.0,1
1,5575-GNVDE,Male,0,No,No,34,1,No,0,1,...,0,0,One year,No,Mailed check,56.95,1889.5,No,3.0,3
2,3668-QPYBK,Male,0,No,No,2,1,No,0,1,...,0,0,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0.0,3
3,7795-CFOCW,Male,0,No,No,45,0,No phone service,0,1,...,0,0,One year,No,Bank transfer (automatic),42.3,1840.75,No,4.0,3
4,9237-HQITU,Female,0,No,No,2,1,No,1,0,...,0,0,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0.0,2


In [73]:
selected_features = df[['phone_service', 'internet_service',
       'online_security', 'online_backup', 'device_protection', 'tech_support',
       'streaming_TV', 'streaming_movies']]

selected_features = pd.get_dummies(selected_features)

selected_features = selected_features.dropna()

# Standardize numerical features
scaler = StandardScaler()
selected_features_scaled = scaler.fit_transform(selected_features)

# Use KMeans for clustering
k = 4  # Adjust this based on the elbow method result
kmeans = KMeans(n_clusters=k, random_state=42)
df['cluster'] = kmeans.fit_predict(selected_features_scaled)

# Plot the clustered data using hvplot.scatter, make the x-axis increment by 0.5
plot = df.hvplot.scatter(y='tenure', x='tenure_years', by='cluster', hover_cols=['customer_id'], alpha=0.5, height=400, xticks=3)

# Show the plot
plot

