### Load Data

In [1]:
import pandas as pd
data = pd.read_csv("telco_customer.csv")

### information about the data

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6938 entries, 0 to 6937
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   customerID                     6938 non-null   object 
 1   gender                         6938 non-null   object 
 2   SeniorCitizen                  6937 non-null   float64
 3   Partner                        6937 non-null   object 
 4   Dependents                     6937 non-null   object 
 5   tenure                         6937 non-null   float64
 6   PhoneService                   6937 non-null   object 
 7   MultipleLines                  6937 non-null   object 
 8   InternetService                6937 non-null   object 
 9   OnlineSecurity                 6937 non-null   object 
 10  OnlineBackup                   6937 non-null   object 
 11  DeviceProtection               6937 non-null   object 
 12  TechSupport                    6937 non-null   o

In [3]:
for col in ['PhoneService', 'MultipleLines', 'InternetService']:
    print(f"\nValue counts for{col}:\n")
    print(data[col].value_counts())


Value counts forPhoneService:

PhoneService
Yes    6273
No      664
Name: count, dtype: int64

Value counts forMultipleLines:

MultipleLines
No                  3348
Yes                 2925
No phone service     664
Name: count, dtype: int64

Value counts forInternetService:

InternetService
Fiber optic    3046
DSL            2379
No             1512
Name: count, dtype: int64


In [4]:
for col in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']:
    print(f"\nValue counts for{col}:\n")
    print(data[col].value_counts())


Value counts forOnlineSecurity:

OnlineSecurity
No                     3435
Yes                    1990
No internet service    1512
Name: count, dtype: int64

Value counts forOnlineBackup:

OnlineBackup
No                     3046
Yes                    2379
No internet service    1512
Name: count, dtype: int64

Value counts forDeviceProtection:

DeviceProtection
No                     3041
Yes                    2384
No internet service    1512
Name: count, dtype: int64

Value counts forTechSupport:

TechSupport
No                     3416
Yes                    2009
No internet service    1512
Name: count, dtype: int64


In [5]:
for col in ['StreamingTV', 'StreamingMovies']:
    print(f"\nValue counts for{col}:\n")
    print(data[col].value_counts())


Value counts forStreamingTV:

StreamingTV
No                     2766
Yes                    2659
No internet service    1512
Name: count, dtype: int64

Value counts forStreamingMovies:

StreamingMovies
No                     2733
Yes                    2692
No internet service    1512
Name: count, dtype: int64


In [6]:
print(data['Churn'].value_counts())


Churn
No     5098
Yes    1839
Name: count, dtype: int64


In [7]:
print(data['Contract'].value_counts())

Contract
Month-to-month    3814
Two year          1675
One year          1448
Name: count, dtype: int64


In [8]:
print(data['PaperlessBilling'].value_counts())

PaperlessBilling
Yes    4091
No     2846
Name: count, dtype: int64


In [9]:
print(data['tenure'].mean())

32.349863053193026


In [10]:
print(data['MonthlyCharges'].mean())

64.71835087213493


In [11]:
print(data['customer_sentiment'].value_counts())

customer_sentiment
negative    4069
neutral     1607
positive    1261
Name: count, dtype: int64


In [12]:
print(data['customer_sentiment_confidence'].mean())

0.8929146605160733


### Transform data

In [13]:
data['Dependents'] = data['Dependents'].map({'Yes': 1, 'No': 0})

In [14]:
data['InternetService'] = data['InternetService'].map({'DSL': 1, 'Fiber optic': 2, 'No': 0})
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0,'NaN': 0})


In [15]:
data['Contract'] = data['Contract'].apply(lambda x:1 if x == 'Month-to-month' else  0)

In [16]:
data['PaperlessBilling'] = data['PaperlessBilling'].map({'Yes': 1, 'No': 0})

In [17]:
data['customer_sentiment'] = data['customer_sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})

In [18]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,customer_sentiment,customer_sentiment_confidence
0,7590-VHVEG,Female,0.0,Yes,0.0,1.0,No,No phone service,1.0,No,...,No,No,1,1.0,Electronic check,29.85,29.85,0.0,0.0,1.0
1,5575-GNVDE,Male,0.0,No,0.0,34.0,Yes,No,1.0,Yes,...,No,No,0,0.0,Mailed check,56.95,1889.5,0.0,1.0,0.3486
2,3668-QPYBK,Male,0.0,No,0.0,2.0,Yes,No,1.0,Yes,...,No,No,1,1.0,Mailed check,53.85,108.15,1.0,0.0,0.6837
3,7795-CFOCW,Male,0.0,No,0.0,45.0,No,No phone service,1.0,Yes,...,No,No,0,0.0,Bank transfer (automatic),42.3,1840.75,0.0,-1.0,1.0
4,9237-HQITU,Female,0.0,No,0.0,2.0,Yes,No,2.0,No,...,No,No,1,1.0,Electronic check,70.7,151.65,1.0,-1.0,1.0


### Show the relationship between the columns

In [19]:
print(data[['InternetService', 'Churn']].corr())

                 InternetService     Churn
InternetService         1.000000  0.317169
Churn                   0.317169  1.000000


In [20]:
print(data[['InternetService', 'Churn']].corr())

                 InternetService     Churn
InternetService         1.000000  0.317169
Churn                   0.317169  1.000000


In [21]:
print(data[['Contract', 'Churn']].corr())

          Contract     Churn
Contract  1.000000  0.405628
Churn     0.405628  1.000000


In [22]:
print(data[['PaperlessBilling', 'Churn']].corr())

                  PaperlessBilling     Churn
PaperlessBilling          1.000000  0.192867
Churn                     0.192867  1.000000


In [23]:
print(data[['customer_sentiment', 'Churn']].corr())

                    customer_sentiment     Churn
customer_sentiment            1.000000 -0.008233
Churn                        -0.008233  1.000000


### predict data 

### using Random Forest Classifier

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Features and target
data = data.dropna(subset=['Churn'])
x= data[['SeniorCitizen','InternetService','Contract', 'customer_sentiment_confidence']]
y = data['Churn']  
#Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(x,y ,test_size=3, random_state= 40)

#Train the Random Forest model
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 40)
model.fit(X_train, y_train)

### Evaluate the model

In [26]:
y_pred= model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         2
         1.0       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



### using XGB Classifier

In [27]:
import xgboost 
from xgboost import XGBClassifier

#Train the Random Forest model
model_2= XGBClassifier(n_estimators =100, max_depth= 5, learning_rate=0.1, random_state= 40)
model_2.fit(X_train, y_train)
y_pred_2= model_2.predict(X_test)

### Evaluate the model

In [28]:
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         2
         1.0       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



### Evaluate model performance using AUC-ROC for interpretability

In [29]:
from sklearn.metrics import roc_auc_score,roc_curve

y_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_proba)
print(f"AUC-ROC Score: {auc_score:.4f}")

AUC-ROC Score: 0.7500


### Custering data 

### Using K-means

In [30]:
from sklearn.cluster import KMeans

data['churn_prob'] = model.predict_proba(x)[:, 1]
Cl= data[['churn_prob']]
m_cluster= KMeans(n_clusters=3, random_state=40)
m_cluster.fit(Cl)


### Customer Segmentation 

In [31]:
#Assign labels to customer segments
segment_names = {0: 'Loyal Promoters', 1: 'At-risk Detractors', 2: 'Passive Neutrals'}

# Add the cluster labels to the original dataframe
data.loc[Cl.index, 'cluster'] = m_cluster.labels_
data['Customer_segment'] = data['cluster'].map(segment_names)

# View segmentation counts
print(data['Customer_segment'].value_counts())

Customer_segment
Loyal Promoters       2471
At-risk Detractors    2253
Passive Neutrals      2213
Name: count, dtype: int64


In [32]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,customer_sentiment,customer_sentiment_confidence,churn_prob,cluster,Customer_segment
0,7590-VHVEG,Female,0.0,Yes,0.0,1.0,No,No phone service,1.0,No,...,1.0,Electronic check,29.85,29.85,0.0,0.0,1.0,0.297934,2.0,Passive Neutrals
1,5575-GNVDE,Male,0.0,No,0.0,34.0,Yes,No,1.0,Yes,...,0.0,Mailed check,56.95,1889.5,0.0,1.0,0.3486,0.083599,0.0,Loyal Promoters
2,3668-QPYBK,Male,0.0,No,0.0,2.0,Yes,No,1.0,Yes,...,1.0,Mailed check,53.85,108.15,1.0,0.0,0.6837,0.312972,2.0,Passive Neutrals
3,7795-CFOCW,Male,0.0,No,0.0,45.0,No,No phone service,1.0,Yes,...,0.0,Bank transfer (automatic),42.3,1840.75,0.0,-1.0,1.0,0.042136,0.0,Loyal Promoters
4,9237-HQITU,Female,0.0,No,0.0,2.0,Yes,No,2.0,No,...,1.0,Electronic check,70.7,151.65,1.0,-1.0,1.0,0.529907,1.0,At-risk Detractors


### Save data as CSV

In [33]:
data.to_csv("segmented_customers.csv", index=False)
