In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw churn dataset. TotalCharges comes in as text (object dtype)
# because a few rows hold a blank string; later cells clean it and cast it to float.
telco_data = pd.read_csv('Telco-Customer-Churn.csv')

In [3]:
telco_data.shape

(5634, 21)

In [4]:
telco_data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,5634.0,5634.0,5634.0
mean,0.160809,32.373092,64.864253
std,0.367388,24.424539,30.089324
min,0.0,0.0,18.25
25%,0.0,9.0,35.75
50%,0.0,29.0,70.525
75%,0.0,55.0,89.9375
max,1.0,72.0,118.6


In [5]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Churn             5634 non-null   object 
 1   customerID        5634 non-null   object 
 2   gender            5634 non-null   object 
 3   SeniorCitizen     5634 non-null   int64  
 4   Partner           5634 non-null   object 
 5   Dependents        5634 non-null   object 
 6   tenure            5634 non-null   int64  
 7   PhoneService      5634 non-null   object 
 8   MultipleLines     5634 non-null   object 
 9   InternetService   5634 non-null   object 
 10  OnlineSecurity    5634 non-null   object 
 11  OnlineBackup      5634 non-null   object 
 12  DeviceProtection  5634 non-null   object 
 13  TechSupport       5634 non-null   object 
 14  StreamingTV       5634 non-null   object 
 15  StreamingMovies   5634 non-null   object 
 16  Contract          5634 non-null   object 


### Alcuni valori categorici delle colonne che non hanno solo Yes/No.

Le colonne OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies hanno tutte valori Yes/No/No internet service

In [6]:
# Distinct labels found in the MultipleLines column
telco_data['MultipleLines'].unique()

array(['No', 'Yes', 'No phone service'], dtype=object)

In [7]:
# Distinct labels found in the InternetService column
telco_data['InternetService'].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [8]:
# Distinct labels found in the Contract column
telco_data['Contract'].unique()

array(['One year', 'Two year', 'Month-to-month'], dtype=object)

In [9]:
# Distinct labels found in the PaymentMethod column
telco_data['PaymentMethod'].unique()

array(['Mailed check', 'Bank transfer (automatic)', 'Electronic check',
       'Credit card (automatic)'], dtype=object)

La colonna TotalCharges ha dei valori vuoti (solo 10: conviene eliminare le righe corrispondenti piuttosto che l'intera colonna)

In [10]:
# Peek at the first five raw TotalCharges entries (still stored as text)
telco_data['TotalCharges'].head(n=5)

0     1336.8
1    5129.45
2      23.45
3     237.95
4           
Name: TotalCharges, dtype: object

In [11]:
# Number of rows whose TotalCharges is a single blank character
telco_data['TotalCharges'].eq(' ').sum()

10

Rimpiazzo ' ' con NaN ed elimino le righe corrispondenti, convertendo poi la colonna TotalCharges in float

In [12]:
# Turn the blank placeholders into NaN so the rows can be dropped with dropna().
# The result is assigned back to the column instead of calling replace(...,
# inplace=True) on the selected Series: an in-place call on `df[col]` mutates a
# temporary object, which pandas warns about and which leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
telco_data['TotalCharges'] = telco_data['TotalCharges'].replace(' ', np.nan)

In [13]:
# Discard every row that contains at least one missing value
telco_data = telco_data.dropna(axis=0, how='any')

In [14]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5624 entries, 0 to 5633
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Churn             5624 non-null   object 
 1   customerID        5624 non-null   object 
 2   gender            5624 non-null   object 
 3   SeniorCitizen     5624 non-null   int64  
 4   Partner           5624 non-null   object 
 5   Dependents        5624 non-null   object 
 6   tenure            5624 non-null   int64  
 7   PhoneService      5624 non-null   object 
 8   MultipleLines     5624 non-null   object 
 9   InternetService   5624 non-null   object 
 10  OnlineSecurity    5624 non-null   object 
 11  OnlineBackup      5624 non-null   object 
 12  DeviceProtection  5624 non-null   object 
 13  TechSupport       5624 non-null   object 
 14  StreamingTV       5624 non-null   object 
 15  StreamingMovies   5624 non-null   object 
 16  Contract          5624 non-null   object 


In [15]:
# Cast TotalCharges from text to float; every other column keeps its dtype
telco_data = telco_data.astype({'TotalCharges': float})

In [16]:
telco_data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,5624.0,5624.0,5624.0,5624.0
mean,0.161095,32.430654,64.91162,2291.154605
std,0.367652,24.408029,30.081601,2263.151534
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.8,411.5125
50%,0.0,29.0,70.55,1410.8
75%,0.0,55.0,89.95,3808.85
max,1.0,72.0,118.6,8684.8


I customerID hanno tutti valori diversi, che sembrano assegnati in modo casuale e quindi privi di informazioni utilizzabili per la classificazione.

Anche separando la stringa, che ha formato "XXXX-YYYYY" (con X cifra e Y lettera maiuscola), ciascuna delle due sottostringhe può assumere molti più valori di quante siano le righe del dataset.

In [17]:
telco_data['customerID'].nunique()

5624

In [18]:
# customerID is unique per row, so it is removed from the feature set
telco_data = telco_data.drop('customerID', axis=1)

In [22]:
telco_data['MonthlyCharges'].nunique()

1494

In [23]:
telco_data['TotalCharges'].nunique()

5299

I valori di Tenure sono spesso simili tra loro e potrebbero essere utili da analizzare

In [24]:
telco_data['tenure'].nunique()

72

Risulta utile convertire le colonne che hanno Yes/No/Other con dei label categorici interi (per compatibilità con tutti gli algoritmi che potrebbero essere usati).

Uso di un LabelEncoder di sklearn

In [25]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5624 entries, 0 to 5633
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Churn             5624 non-null   object 
 1   gender            5624 non-null   object 
 2   SeniorCitizen     5624 non-null   int64  
 3   Partner           5624 non-null   object 
 4   Dependents        5624 non-null   object 
 5   tenure            5624 non-null   int64  
 6   PhoneService      5624 non-null   object 
 7   MultipleLines     5624 non-null   object 
 8   InternetService   5624 non-null   object 
 9   OnlineSecurity    5624 non-null   object 
 10  OnlineBackup      5624 non-null   object 
 11  DeviceProtection  5624 non-null   object 
 12  TechSupport       5624 non-null   object 
 13  StreamingTV       5624 non-null   object 
 14  StreamingMovies   5624 non-null   object 
 15  Contract          5624 non-null   object 
 16  PaperlessBilling  5624 non-null   object 


In [26]:
# Boolean Series indexed by column name: True where the column dtype is object
categorical_feature_mask = telco_data.dtypes == object
# Collect the names of the flagged columns, preserving the DataFrame's column order
categorical_cols = [
    column_name
    for column_name, is_object in categorical_feature_mask.items()
    if is_object
]

In [28]:
print(categorical_cols)

['Churn', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [29]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps the distinct labels of a 1-D array to integer codes 0..n_classes-1
le = LabelEncoder()

In [30]:
# apply le on categorical feature columns
telco_data[categorical_cols] = telco_data[categorical_cols].apply(lambda col: le.fit_transform(col))
telco_data[categorical_cols].head(10)

Unnamed: 0,Churn,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,0,0,0,1,1,0,0,2,0,2,0,0,2,1,0,3
1,0,0,0,0,1,2,1,0,2,0,0,2,2,2,1,0
2,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,2
3,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,2
5,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,2
6,0,1,1,1,1,0,2,1,1,1,1,1,1,2,0,3
7,0,1,1,1,1,0,2,1,1,1,1,1,1,2,1,1
8,1,1,0,0,0,1,0,0,0,2,0,2,0,0,1,2
9,0,1,0,1,1,0,2,1,1,1,1,1,1,1,1,0
10,1,1,1,0,1,0,1,0,0,0,0,0,0,0,1,2


Churn aveva valori Yes/No ed ora ha valori 1/0

gender ha M/F mappati su 1/0

Al massimo abbiamo 4 classi sulla feature categorica PaymentMethod

In [32]:
telco_data.shape

(5624, 20)

In [33]:
from sklearn.model_selection import train_test_split

# Target is the encoded Churn column; every remaining column is a feature.
y = telco_data['Churn']
X = telco_data.drop(columns=['Churn'])

# Reproducible 70/30 hold-out split (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=1,
)

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Depths 1 .. max_loop_depth - 1 are evaluated.
max_loop_depth = 40
max_score = 0
best_depth = 0

# Choose max_depth by 5-fold cross-validation on the TRAINING data only.
# Scoring candidates on X_test and then reporting accuracy on that same X_test
# would tune the model to the hold-out set and make the reported test accuracy
# optimistically biased; the test set must stay untouched until the final check.
for depth in range(1, max_loop_depth):
    model = DecisionTreeClassifier(max_depth=depth, random_state=1)
    # Mean accuracy over the folds (classifier default scoring, stratified folds).
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    # Strict '>' keeps the shallowest tree when several depths tie.
    if score > max_score:
        max_score = score
        best_depth = depth

# Refit the selected configuration on the whole training set.
best_model = DecisionTreeClassifier(max_depth=best_depth, random_state=1)
best_model.fit(X_train, y_train)

print('Best DecisionTree depth: ', best_depth)
print('Training Accuracy: ', best_model.score(X_train, y_train) * 100, '%')
# X_test played no part in model selection, so this is an unbiased estimate.
print('Testing Accuracy: ', best_model.score(X_test, y_test) * 100, '%')

Best DecisionTree depth:  5
Training Accuracy:  80.25914634146342 %
Testing Accuracy:  77.72511848341233 %
