# Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/ML/9_decision_tree/Exercise/titanic.csv')

In [None]:
df.head()

In [None]:
df = df[['Survived','Pclass','Sex','Age','Fare']]

In [None]:
df.head()

# Cleaning DataSet

In [None]:
df.isnull().sum()

In [None]:
df.groupby('Sex')['Age'].mean()

In [None]:
df['Age']=df.groupby("Sex")['Age'].transform(lambda x: x.fillna(x.mean()))

# Converting the categorical column to a numerical column

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
# Task 1: Transform the categorical 'Sex' attribute to a numerical format

In [None]:
df.head()

In [None]:
X = df[['Pclass','Sex','Age','Fare']]
y= df['Survived']

# Doing train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Task 2: Split the data considering the testing corpus as 30% and random state to 101.
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible. These four names are used by every later cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Training the model


In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree using the Gini criterion, with no pruning settings;
# the fixed seed keeps the fitted tree reproducible.
clf = DecisionTreeClassifier(criterion='gini', random_state=0)
clf.fit(X=X_train, y=y_train)

# Checking accuracy of testing dataset

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predict on the held-out rows; predictions_test is reused by the
# classification report further down.
predictions_test = clf.predict(X_test)
# Task 3: Calculate the accuracy of the testing data
accuracy_score(y_test, predictions_test)

# Checking accuracy of training dataset

In [None]:
# Predict on the rows the tree was fitted on and score them, for comparison
# with the test-set accuracy.
predictions_train = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=predictions_train)

#### The accuracy on the training dataset is very high, whereas the accuracy on the test dataset is much lower. This means the model is overfitted; to reduce the overfitting we will prune the tree later on.

# Visualizing our final decision tree

In [None]:
from sklearn import tree

# Draw the fitted tree on a large canvas, with nodes colour-filled.
plt.figure(figsize=(15, 10))
tree.plot_tree(decision_tree=clf, filled=True)
plt.show()

# Evaluating our test dataset

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, predictions_test))
# Task 4: Generate the confusion matrix for evaluating the testing corpus
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, predictions_test))

# Evaluating our training dataset

In [None]:
# Per-class precision / recall / F1 on the rows used for fitting.
print(classification_report(y_train, predictions_train))
# Task 5: Generate the confusion matrix for evaluating the training corpus
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_train, predictions_train))

# Finding false positive rate and true positive rate

In [None]:
from sklearn.metrics import roc_curve,auc

In [None]:
dt_probs = clf.predict_proba(X_test)[:,1]

In [None]:
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test,dt_probs)

# Plotting *ROC* curve for our Decision Tree

In [None]:
# Area under the ROC curve computed from the (fpr, tpr) points.
auc_score_dt = auc(x=fpr_dt, y=tpr_dt)
auc_score_dt

In [None]:
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve and show its AUC in the legend.

    The curve and the AUC label are computed from the arguments, so the
    function can be reused for any classifier rather than only for the
    notebook's module-level ``fpr_dt`` / ``tpr_dt`` values.

    Parameters
    ----------
    fpr : array-like
        False positive rates (x-axis), e.g. the first value returned by
        ``roc_curve``.
    tpr : array-like
        True positive rates (y-axis), e.g. the second value returned by
        ``roc_curve``.
    """
    plt.figure(figsize=(10,8))
    plt.plot(fpr, tpr, color='orange', label='AUC = %0.2f' % auc(fpr, tpr))
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [None]:
# Draw the ROC curve for the decision tree's test-set predictions.
plot_roc_curve(fpr=fpr_dt, tpr=tpr_dt)

# Pruning of our decision tree

In [None]:
# Task 6: Obtain the corresponding total leaf impurities at each step of the pruning process
# Compute the minimal cost-complexity pruning path on the training data; the
# result exposes the effective alphas and the matching total leaf impurities.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Visualizing alpha w.r.t impurity of leaves

In [None]:
# Leave out the final (largest) alpha before plotting.
# NOTE(review): presumably that entry is the fully pruned single-node tree - confirm.
alphas_to_plot = ccp_alphas[:-1]
impurities_to_plot = impurities[:-1]

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(alphas_to_plot, impurities_to_plot, marker='o', drawstyle="steps-post")
ax.set_ylabel("total impurity of leaves")
ax.set_xlabel("effective alpha")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
# Fit one tree per candidate alpha from the pruning path. The list is built
# with a comprehension so that the notebook-level `clf` (the fitted unpruned
# model that other cells read) is not silently rebound to the last pruned
# tree. `fit` returns the estimator itself.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(X_train, y_train)
    for ccp_alpha in ccp_alphas
]
print(f"Number of nodes in the last tree is: {clfs[-1].tree_.node_count} "
      f"with ccp_alpha: {ccp_alphas[-1]}")

In [None]:
# Drop the last tree and its alpha before plotting.
# NOTE: this cell shortens both lists each time it is executed, so run it once.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Size (node count) and depth of every remaining tree.
node_counts = [model.tree_.node_count for model in clfs]
depth = [model.tree_.max_depth for model in clfs]

fig, ax = plt.subplots(2, 1, figsize=(10, 8))
nodes_ax, depth_ax = ax

# Top panel: tree size as alpha grows.
nodes_ax.plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
nodes_ax.set_xlabel("alpha")
nodes_ax.set_ylabel("number of nodes")
nodes_ax.set_title("Number of nodes vs alpha")

# Bottom panel: tree depth as alpha grows.
depth_ax.plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
depth_ax.set_xlabel("alpha")
depth_ax.set_ylabel("depth of tree")
depth_ax.set_title("Depth vs alpha")

fig.tight_layout()

In [None]:
# Mean accuracy of every candidate tree on the training and the test split.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
# One step curve per split, training first so the legend order is unchanged.
for scores, split_label in ((train_scores, "train"), (test_scores, "test")):
    ax.plot(ccp_alphas, scores, marker='o', label=split_label,
            drawstyle="steps-post")
ax.legend()
plt.grid()
plt.show()

# Accuracy after pruning

In [None]:
# Refit with a fixed pruning strength.
# NOTE(review): 0.016 was presumably read off the accuracy-vs-alpha plot - confirm.
clf = DecisionTreeClassifier(ccp_alpha=0.016, random_state=0)
clf.fit(X=X_train, y=y_train)

### Accuracy of test dataset

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the pruned tree on the held-out rows.
pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

### Accuracy of training dataset

In [None]:
# Accuracy of the pruned tree on the rows it was fitted on.
pred_1 = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=pred_1)

# Visualizing after pruning

In [None]:
from sklearn import tree

# Draw the pruned tree. plt.show() matches the earlier tree plot, so the cell
# renders the figure without also echoing plot_tree's return value.
plt.figure(figsize=(15,10))
tree.plot_tree(clf,filled=True)
plt.show()