# Data Preparation

**Import models**

In [1]:
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 

**Load and preview data**

In [2]:
df = pd.read_csv('telecom_churn_data.csv')
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


Observations
- each row is a customer
- target for prediciton is churn 

In [3]:
#replace the space in the column header with _
df.columns = df.columns.str.replace(' ', '_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
state                     3333 non-null object
account_length            3333 non-null int64
area_code                 3333 non-null int64
phone_number              3333 non-null object
international_plan        3333 non-null object
voice_mail_plan           3333 non-null object
number_vmail_messages     3333 non-null int64
total_day_minutes         3333 non-null float64
total_day_calls           3333 non-null int64
total_day_charge          3333 non-null float64
total_eve_minutes         3333 non-null float64
total_eve_calls           3333 non-null int64
total_eve_charge          3333 non-null float64
total_night_minutes       3333 non-null float64
total_night_calls         3333 non-null int64
total_night_charge        3333 non-null float64
total_intl_minutes        3333 non-null float64
total_intl_calls          3333 non-null int64
total_intl_charge         3333 non-null float64

Observations:
- no null values

**Create Target and Feature variables**

In [4]:
#Target variable
y = df.churn 

#df of features
x = df.drop("churn", axis=1)

In [5]:
#drop the phone number column
x = pd.get_dummies(x, drop_first = True)

In [6]:
x.head()

Unnamed: 0,account_length,area_code,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,...,phone_number_422-5874,phone_number_422-6685,phone_number_422-6690,phone_number_422-7728,phone_number_422-8268,phone_number_422-8333,phone_number_422-8344,phone_number_422-9964,international_plan_yes,voice_mail_plan_yes
0,128,415,25,265.1,110,45.07,197.4,99,16.78,244.7,...,0,0,0,0,0,0,0,0,0,1
1,107,415,26,161.6,123,27.47,195.5,103,16.62,254.4,...,0,0,0,0,0,0,0,0,0,1
2,137,415,0,243.4,114,41.38,121.2,110,10.3,162.6,...,0,0,0,0,0,0,0,0,0,0
3,84,408,0,299.4,71,50.9,61.9,88,5.26,196.9,...,0,0,0,0,0,0,0,0,1,0
4,75,415,0,166.7,113,28.34,148.3,122,12.61,186.9,...,0,0,0,0,0,0,0,0,1,0


**Splitting data into training set and testing set **

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 15)

# Modelling

**Model 1: Single Decision Tree**

In [8]:
#create instance of decision tree object with default parameters
dtc = DecisionTreeClassifier(max_depth = 5)

#fit the model to the training data
dtc.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [9]:
#calculate accuracy for for training data
print(accuracy_score(y_train, dtc.predict(x_train)))

0.9563825530212084


In [11]:
#calculate accuracy score for test data
print(accuracy_score(y_test, dtc.predict(x_test)))

0.935251798561151


In [12]:
#calculate recall score for train data
print(recall_score(y_train, dtc.predict(x_train)))

0.7416666666666667


In [15]:
#calculate recall score for test data 
print(recall_score(y_test, dtc.predict(x_test)))

0.6504065040650406


**Model 2: Random Forest**

In [16]:
#create instance of random forrest classifier object
rft = RandomForestClassifier(n_estimators = 100, max_depth = 5)

#fir the model to the training data
rft.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
#calculate accuracy score for training data

print(accuracy_score(y_train, rft.predict(x_train)))

0.8559423769507803


In [18]:
#calculate accuracy score for test data
print (accuracy_score(y_test, rft.predict(x_test)))

0.8525179856115108


In [19]:
#check recall score for train data
print(recall_score(y_train, rft.predict(x_train)))

0.0


In [21]:
#check recall score for test data
print(recall_score(y_test, rft.predict(x_test)))

0.0


**Model 3: AdaBoosting**

In [23]:
#Initialize adabost classifier object
abc = AdaBoostClassifier(random_state = 15)


In [24]:
#for the model to the training data
abc.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=15)

In [25]:
#Calculate accuracy score for training data
print(accuracy_score(y_train, abc.predict(x_train)))

0.8947579031612645


In [26]:
#Calculate accuracy score for test data
print(accuracy_score(y_test, abc.predict(x_test)))

0.8693045563549161


In [27]:
#calculate recall score for training data
print(recall_score(y_train, abc.predict(x_train)))

0.4583333333333333


In [28]:
#calculate recall score for test data
print(recall_score(y_test, abc.predict(x_test)))

0.3170731707317073


**Model 4: Gradient Boosting**

In [29]:
#initialize gradient boost classifier object
gbc = GradientBoostingClassifier(random_state = 15)

In [31]:
#fit the model to the training data
gbc.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=15, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [32]:
#calculate accuracy for training data
print(accuracy_score(y_train, gbc.predict(x_train)))

0.959983993597439


In [34]:
#calculate accuracy score for test data:
print(accuracy_score(y_test, gbc.predict(x_test)))

0.9508393285371702


In [35]:
#calculate recall score for training data
print(recall_score(y_train, gbc.predict(x_train)))

0.7416666666666667


In [37]:
#calculate recall score for test data
print(recall_score(y_test, gbc.predict(x_test)))

0.7154471544715447


# Other observations

In [38]:
y.value_counts()

False    2850
True      483
Name: churn, dtype: int64

86% of the data is false, so the models can be biased towards false 