### IMPORT AND CLEAN THE DATA

In [1]:
# import pandas and read data file
import pandas as pd
telco_raw = pd.read_csv("telco.csv")

In [2]:
# print header of telco dataset
telco_raw.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,52.55,No
1,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,20.25,No
2,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,80.85,No


In [3]:
# Print the data types of telco_raw dataset
print(telco_raw.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [4]:
# Print the number of unique values in each telco_raw column
telco_raw.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6534
Churn                  2
dtype: int64

In [5]:
# Store customerID and Churn column names
custid = ['customerID']
target = ['Churn']

# Store categorical column names
categorical = telco_raw.nunique()[telco_raw.nunique() < 10].keys().tolist()

# Remove target from the list of categorical variables
categorical.remove(target[0])

# Store numerical column names
numerical = [x for x in telco_raw.columns if x not in custid + target + categorical]

In [6]:
# Perform one-hot encoding to categorical variables 
telco_raw = pd.get_dummies(data = telco_raw, columns = categorical, drop_first=True)

In [7]:
telco_raw.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,4472-LVYGI,0,52.55,52.55,No,0,0,1,1,0,...,0,1,0,0,0,1,1,0,0,0
1,3115-CZMZD,0,20.25,20.25,No,1,0,0,1,1,...,1,0,1,0,0,1,0,0,0,1
2,5709-LVOEQ,0,80.85,80.85,No,0,0,1,1,1,...,0,1,0,1,0,1,0,0,0,1
3,4367-NUYAO,0,25.75,25.75,No,1,0,1,1,1,...,1,0,1,0,0,1,0,0,0,1
4,1371-DWPAZ,0,56.05,56.05,No,0,0,1,1,0,...,0,1,0,0,0,1,0,1,0,0


In [8]:
import numpy as np
telco_raw['TotalCharges'] = telco_raw['TotalCharges'].astype(np.float64)

In [9]:
# usually this is good for classification
from sklearn.preprocessing import StandardScaler
# Initialize StandardScaler instance
scaler = StandardScaler()

# Fit and transform the scaler on numerical columns
scaled_numerical = scaler.fit_transform(telco_raw[numerical])

# Build a DataFrame from scaled_numerical
scaled_numerical = pd.DataFrame(scaled_numerical, columns=numerical)

In [10]:
scaled_numerical.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.318165,-0.405867,-0.982652
1,-1.318165,-1.479388,-0.996903
2,-1.318165,0.53471,-0.970166
3,-1.318165,-1.29659,-0.994476
4,-1.318165,-0.289541,-0.981108


In [11]:
# Print the unique Churn values
print(set(telco_raw['Churn']))

{'No', 'Yes'}


In [12]:
telco_raw.groupby(['Churn']).size()

Churn
No     5174
Yes    1869
dtype: int64

In [13]:
# Calculate the ratio size of each churn group
telco_raw.groupby(['Churn']).size() / telco_raw.shape[0] * 100

Churn
No     73.463013
Yes    26.536987
dtype: float64

### TRAIN TEST AND SPLIT 

In [14]:
# Import the function for splitting data to train and test
from sklearn.model_selection import train_test_split

# Split the data into train and test
train, test = train_test_split(telco_raw, test_size = .25)

In [15]:
# Store column names from `telcom` excluding target variable and customer ID
cols = [col for col in telco_raw.columns if col not in custid + target]

# Extract training features
train_X = train[cols]

# Extract training target
train_Y = train[target]

# Extract testing features
test_X = test[cols]

# Extract testing target
test_Y = test[target]

### LOGISTIC REGRESSION ON CHURN

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression()

# Fit logistic regression on training data
logreg.fit(train_X, train_Y)

# Predict churn labels on testing data
pred_test_Y = logreg.predict(test_X)

# Calculate accuracy score on testing data
test_accuracy = accuracy_score(test_Y, pred_test_Y)

# Print test accuracy score rounded to 4 decimals
print('Test accuracy:', round(test_accuracy, 4))

Test accuracy: 0.8047


  y = column_or_1d(y, warn=True)


In [17]:
# Initialize logistic regression instance 
logreg = LogisticRegression(penalty='l1', C=0.025, solver='liblinear')

# Fit the model on training data
logreg.fit(train_X, train_Y)

# Predict churn values on test data
pred_test_Y = logreg.predict(test_X)

# Print the accuracy score on test data
print('Test accuracy:', round(accuracy_score(test_Y, pred_test_Y), 4))

Test accuracy: 0.8018


  y = column_or_1d(y, warn=True)


### HYPERTUNING THE MODEL

In [18]:
C = [1, .5, .25, .1, .05, .025, .01, .005, .0025]
l1_metrics = np.zeros((len(C), 5))
l1_metrics[:,0] = C

In [19]:
from sklearn.metrics import recall_score, precision_score
# Model Tuning
# Run a for loop over the range of C list length
for index in range(0, len(C)):
    # Initialize and fit Logistic Regression with the C candidate
    logreg = LogisticRegression(penalty='l1', C=C[index], solver='liblinear')
    logreg.fit(train_X, train_Y)
    # Predict churn on the testing data
    pred_test_Y = logreg.predict(test_X)
    # Create non-zero count and recall score columns
    l1_metrics[index,1] = np.count_nonzero(logreg.coef_)
    l1_metrics[index,2] = recall_score(test_Y, pred_test_Y, pos_label='Yes')
    l1_metrics[index,3] = precision_score(test_Y, pred_test_Y, pos_label='Yes')
    l1_metrics[index,4] = accuracy_score(test_Y, pred_test_Y)

# Name the columns and print the array as pandas DataFrame
col_names = ['C','Non-Zero Coeffs','Recall','Precision','Accuracy']
print(pd.DataFrame(l1_metrics, columns=col_names))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


        C  Non-Zero Coeffs    Recall  Precision  Accuracy
0  1.0000             30.0  0.580169   0.648585  0.802385
1  0.5000             29.0  0.578059   0.649289  0.802385
2  0.2500             29.0  0.575949   0.653110  0.803521
3  0.1000             23.0  0.569620   0.666667  0.807496
4  0.0500             22.0  0.550633   0.670951  0.806360
5  0.0250             18.0  0.521097   0.669377  0.801817
6  0.0100              9.0  0.512658   0.653226  0.795571
7  0.0050              3.0  0.527426   0.598086  0.777399
8  0.0025              3.0  0.523207   0.599034  0.777399


In [20]:
pd.DataFrame(l1_metrics, columns=col_names)

Unnamed: 0,C,Non-Zero Coeffs,Recall,Precision,Accuracy
0,1.0,30.0,0.580169,0.648585,0.802385
1,0.5,29.0,0.578059,0.649289,0.802385
2,0.25,29.0,0.575949,0.65311,0.803521
3,0.1,23.0,0.56962,0.666667,0.807496
4,0.05,22.0,0.550633,0.670951,0.80636
5,0.025,18.0,0.521097,0.669377,0.801817
6,0.01,9.0,0.512658,0.653226,0.795571
7,0.005,3.0,0.527426,0.598086,0.777399
8,0.0025,3.0,0.523207,0.599034,0.777399


#### C = 0.025 YIELDS THE BEST RESULTS SINCE IT HAS HIGH ACCURACY, PRECISION AND LOW NON-ZERO COEFFS

### LOGISTIC REGRESSION INTERPRETATING THE MODEL

In [21]:
# Combine feature names and coefficients into pandas DataFrame
feature_names = pd.DataFrame(train_X.columns, columns = ['Feature'])
log_coef = pd.DataFrame(np.transpose(logreg.coef_), columns = ['Coefficient'])
coefficients = pd.concat([feature_names, log_coef], axis = 1)

# Calculate exponent of the logistic regression coefficients
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])

# Remove coefficients that are equal to zero
coefficients = coefficients[coefficients['Coefficient']!=0]

# Print the values sorted by the exponent coefficient
print(coefficients.sort_values(by=['Exp_Coefficient']))

          Feature  Coefficient  Exp_Coefficient
0          tenure    -0.103999         0.901226
2    TotalCharges     0.000636         1.000636
1  MonthlyCharges     0.007605         1.007634


Using the logistic regression model, we can use tenure along with total charges and monthly charges to predict our outcomes. We can see the longer the tenure is, the less likely our clients will churn. At the same time, Charging higher amounts on a monthly basis will contribute to customers churning.

### DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize decision tree classifier
mytree = DecisionTreeClassifier()

# Fit the decision tree on training data
mytree.fit(train_X, train_Y)

# Predict churn labels on testing data
pred_test_Y = mytree.predict(test_X)

# Calculate accuracy score on testing data
test_accuracy = accuracy_score(test_Y, pred_test_Y)

# Print test accuracy
print('Test accuracy:', round(test_accuracy, 4))

In [None]:
depth_list = list(range(2,15))
depth_tuning = np.zeros((len(depth_list), 4))
depth_tuning[:,0] = depth_list

# Run a for loop over the range of depth list length
for index in range(0, len(depth_list)):
  # Initialize and fit decision tree with the `max_depth` candidate
  mytree = DecisionTreeClassifier(max_depth=depth_list[index])
  mytree.fit(train_X, train_Y)
  # Predict churn on the testing data
  pred_test_Y = mytree.predict(test_X)
  # Calculate the recall score 
  depth_tuning[index,1] = accuracy_score(test_Y, pred_test_Y)
  depth_tuning[index,2] = precision_score(test_Y, pred_test_Y, pos_label='Yes')
  depth_tuning[index,3] = recall_score(test_Y, pred_test_Y, pos_label='Yes')


# Name the columns and print the array as pandas DataFrame
col_names = ['Max_Depth','Accuracy','Precision','Recall']

pd.DataFrame(depth_tuning, columns=col_names)

#### Max_depth of 6.0 is the best due to good accuracy, precision and high recall

In [None]:
# to make the visulization a bit easier, we only set max_depth as 2, however it should be 6 in this situation
mytree = DecisionTreeClassifier(max_depth = 2)
mytree.fit(train_X, train_Y)

In [None]:
from sklearn import tree
import graphviz
exported = tree.export_graphviz(
    decision_tree=mytree,
    out_file=None,
    feature_names=cols,
    precision=1,
    class_names=['Not churn',
                 'Churn'],
    filled = True)
graph = graphviz.Source(exported)
display(graph)

Tenure <= 17.5 with internetservice_fiber_optic <= 0.5 are likely to not churn.\
Tenure > 17.5 and InternetService Fiber Optic > 0.5 are likely to not churn.\
To conclude, in this model, customers churn depends on whether they are offered fiber optic services.\
If they are have fiber optic then they are not likely to churn.

#### ANALYTICS AND CONCLUSIONS
So based on the models we built. The best way to prevent customers from churning is to lowerly monthly charges for\
internet services, periodically give discounts to lower Total charges and introduce high speed internet through Fiber
Optic services.