## Contact Center (Decision Tree)

In [1]:
#Import Data
import pandas as pd
survey_data = pd.read_csv(r'Datasets/Call_center_survey.csv')

#total number of customers
print(survey_data.shape)

#Column names
print(survey_data.columns)

#Print Sample data
pd.set_option('display.max_columns', None) #This option displays all the columns 

survey_data.head()

(12330, 7)
Index(['Cust_id', 'Age', 'Account_balance', 'Personal_loan_ind',
       'Home_loan_ind', 'Prime_Customer_ind', 'Overall_Satisfaction'],
      dtype='object')


Unnamed: 0,Cust_id,Age,Account_balance,Personal_loan_ind,Home_loan_ind,Prime_Customer_ind,Overall_Satisfaction
0,CX01-001,49,23974,1,0,1,Dis Satisfied
1,CX01-002,25,72374,0,1,1,Satisfied
2,CX01-003,32,65532,0,0,1,Satisfied
3,CX01-004,70,28076,0,1,1,Dis Satisfied
4,CX01-005,23,38974,1,1,1,Satisfied


In [2]:
#Sample summary
summary=survey_data.describe()
round(summary,2)

Unnamed: 0,Age,Account_balance,Personal_loan_ind,Home_loan_ind,Prime_Customer_ind
count,12330.0,12330.0,12330.0,12330.0,12330.0
mean,44.77,41177.14,0.5,0.5,0.58
std,13.91,26432.6,0.5,0.5,0.49
min,19.0,4904.0,0.0,0.0,0.0
25%,35.0,20927.0,0.0,0.0,0.0
50%,43.0,34065.0,0.0,0.0,1.0
75%,55.0,60264.0,1.0,1.0,1.0
max,75.0,109776.0,1.0,1.0,1.0


In [3]:
survey_data['Overall_Satisfaction'].value_counts()
survey_data["Personal_loan_ind"].value_counts()
survey_data["Home_loan_ind"].value_counts()
survey_data["Prime_Customer_ind"].value_counts()

1    7113
0    5217
Name: Prime_Customer_ind, dtype: int64

In [4]:
#4.4.2
#Non numerical data need to be mapped to numerical data. 
survey_data['Overall_Satisfaction'] = survey_data['Overall_Satisfaction'].map( {'Dis Satisfied': 0, 'Satisfied': 1} ).astype(int)

#number of satisfied customers
survey_data['Overall_Satisfaction'].value_counts()

#Defining Features and lables, ignoring cust_num and target variable
features=list(survey_data.columns[1:6])
print(features)


['Age', 'Account_balance', 'Personal_loan_ind', 'Home_loan_ind', 'Prime_Customer_ind']


In [5]:
#Preparing X and Y data

X = survey_data[features]
y = survey_data['Overall_Satisfaction']

#Building Tree Model
from sklearn import tree
DT_Model = tree.DecisionTreeClassifier(max_depth=2)
DT_Model.fit(X,y)


DecisionTreeClassifier(max_depth=2)

In [11]:
##Plotting the trees - New Method

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, export_text
plt.figure(figsize=(15,7))
plot_tree(DT_Model, filled=True, 
                     rounded=True,
                     impurity=False,
                     feature_names = features)

print(export_text(DT_Model, feature_names = features))


|--- Account_balance <= 40140.50
|   |--- Age <= 30.50
|   |   |--- class: 1
|   |--- Age >  30.50
|   |   |--- class: 0
|--- Account_balance >  40140.50
|   |--- Age <= 50.50
|   |   |--- class: 1
|   |--- Age >  50.50
|   |   |--- class: 0



In [12]:
#LAB : Tree Validation
########################################
##########Tree Validation
#Tree Validation
predict1 = DT_Model.predict(X)
print(predict1)

from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(y, predict1)
print(cm)

total = sum(sum(cm))
#####from confusion matrix calculate accuracy
accuracy = (cm[0,0]+cm[1,1])/total
print(accuracy)

[0 1 1 ... 1 1 1]
[[6631   76]
 [ 834 4789]]
0.9261962692619627


## Customer Profile (Overfitting & Tree Pruning)

In [14]:
#LAB: Overfitting
#LAB: The problem of overfitting
############################################################################ 
##The problem of overfitting

import pandas as pd
overall_data = pd.read_csv(r"Datasets/Customer_profile_data.csv")

##print train.info()
print(overall_data.shape)

#First few records
print(overall_data.head())

# the data have string values we need to convert them into numerical values
overall_data['Gender'] = overall_data['Gender'].map( {'Male': 1, 'Female': 0} ).astype(int)
overall_data['Bought'] = overall_data['Bought'].map({'Yes':1, 'No':0}).astype(int)

#First few records
print(overall_data.head())

#Defining features, X and Y
features = list(overall_data.columns[1:3])
print(features)

X = overall_data[features]
y = overall_data['Bought']

print(X.shape)
print(y.shape)

(109, 4)
   Sr_no  Age  Gender Bought
0      1   45    Male    Yes
1      2   56    Male    Yes
2      3   49  Female    Yes
3      4   50  Female     No
4      5   75  Female     No
   Sr_no  Age  Gender  Bought
0      1   45       1       1
1      2   56       1       1
2      3   49       0       1
3      4   50       0       0
4      5   75       0       0
['Age', 'Gender']
(109, 2)
(109,)


In [15]:
#Dividing X and y to train and test data parts. The function train_test_split() takes care of it. Mention the train data percentage in the parameter train_size. 
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y, train_size = 0.8 , random_state=5)

print("X_train.shape", X_train.shape)
print("y_train.shape",y_train.shape)
print("X_test.shape",X_test.shape)
print("y_test.shape",y_test.shape)

X_train.shape (87, 2)
y_train.shape (87,)
X_test.shape (22, 2)
y_test.shape (22,)


In [19]:
from sklearn import tree
#training Tree Model
DT_Model1 = tree.DecisionTreeClassifier()
DT_Model1.fit(X_train,y_train)
# plotting is skipped because depedency issues (mentioned in the textbook too, GraphViz's executable not found)
# refer the textbook for the output

DecisionTreeClassifier()

In [20]:
#Accuracy on train data
from sklearn.metrics import confusion_matrix

predict1 = DT_Model1.predict(X_train)
cm1 = confusion_matrix(y_train,predict1)
total1 = sum(sum(cm1))
accuracy1 = (cm1[0,0]+cm1[1,1])/total1
print("Train accuracy", accuracy1)

#Accuracy on test data
predict2 = DT_Model1.predict(X_test)
cm2 = confusion_matrix(y_test,predict2)
total2 = sum(sum(cm2))
#####from confusion matrix calculate accuracy
accuracy2 = (cm2[0,0]+cm2[1,1])/total2
print("Test accuracy",accuracy2)

Train accuracy 0.9655172413793104
Test accuracy 0.7727272727272727


In [21]:
####LAB: Pruning
#### max_depth parameter 
DT_Model2 = tree.DecisionTreeClassifier(max_depth= 4)
DT_Model2.fit(X_train,y_train)

predict3 = DT_Model2.predict(X_train)
predict4 = DT_Model2.predict(X_test)

#Accuracy of the model on the train data
cm1 = confusion_matrix(y_train,predict3)
total1 = sum(sum(cm1))
accuracy1 = (cm1[0,0]+cm1[1,1])/total1
print("max_depth4 Train Accuracy", accuracy1)

#Accuracy of the model on the Test Data
cm2 = confusion_matrix(y_test,predict4)
total2 = sum(sum(cm2))
accuracy2 = (cm2[0,0]+cm2[1,1])/total2
print("max_depth4 Test Accuracy", accuracy2)

max_depth4 Train Accuracy 0.9425287356321839
max_depth4 Test Accuracy 0.7727272727272727


In [22]:
#### max_depth =2
DT_Model2 = tree.DecisionTreeClassifier(max_depth= 2)
DT_Model2.fit(X_train,y_train)

predict3 = DT_Model2.predict(X_train)
predict4 = DT_Model2.predict(X_test)

#Accuracy of the model on the train data
cm1 = confusion_matrix(y_train,predict3)
total1 = sum(sum(cm1))
accuracy1 = (cm1[0,0]+cm1[1,1])/total1
print("max_depth2 Train Accuracy", accuracy1)

#Accuracy of the model on the Test Data
cm2 = confusion_matrix(y_test,predict4)
total2 = sum(sum(cm2))
accuracy2 = (cm2[0,0]+cm2[1,1])/total2
print("max_depth2 Test Accuracy", accuracy2)

max_depth2 Train Accuracy 0.896551724137931
max_depth2 Test Accuracy 0.8636363636363636


In [23]:
#### The problem of underfitting
#### max_depth =1
DT_Model2 = tree.DecisionTreeClassifier(max_depth= 1)
DT_Model2.fit(X_train,y_train)

predict3 = DT_Model2.predict(X_train)
predict4 = DT_Model2.predict(X_test)

#Accuracy of the model on the train data
cm1 = confusion_matrix(y_train,predict3)
total1 = sum(sum(cm1))
accuracy1 = (cm1[0,0]+cm1[1,1])/total1
print("max_depth1 Train Accuracy", accuracy1)

#Accuracy of the model on the Test Data
cm2 = confusion_matrix(y_test,predict4)
total2 = sum(sum(cm2))
accuracy2 = (cm2[0,0]+cm2[1,1])/total2
print("max_depth1 Test Accuracy", accuracy2)

max_depth1 Train Accuracy 0.8735632183908046
max_depth1 Test Accuracy 0.8181818181818182


In [24]:
#### max_leaf_nodes =4
DT_Model3 = tree.DecisionTreeClassifier(max_leaf_nodes= 3)
DT_Model3.fit(X_train,y_train)

predict3 = DT_Model3.predict(X_train)
predict4 = DT_Model3.predict(X_test)

#Accuracy of the model on the train data
cm1 = confusion_matrix(y_train,predict3)
total1 = sum(sum(cm1))
accuracy1 = (cm1[0,0]+cm1[1,1])/total1
print(accuracy1)

#Accuracy of the model on the Test Data
cm2 = confusion_matrix(y_test,predict4)
total2 = sum(sum(cm2))
accuracy2 = (cm2[0,0]+cm2[1,1])/total2
print(accuracy2)

0.896551724137931
0.8636363636363636
