Decision Trees

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression



In [None]:
#Load & preprocess
df = pd.read_csv('titanic.csv')
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Age'].fillna(df['Age'].median(), inplace=True)
X = df[['Pclass', 'Age', 'Sex']]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


Build decision tree classifier

In [11]:
# Initialize and train the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
# Calculate metrics , same for all
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
print(f"Decision Tree - Accuracy: {accuracy_dt:.2f},Precision: {precision_dt:.2f}, Recall: {recall_dt:.2f}")
#RANDOM FOREST
'''rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
#BAGGING CLASSIFIER
bag_model = BaggingClassifier(estimator=DecisionTreeClassifier(),
                               n_estimators=50, random_state=42)
bag_model.fit(X_train, y_train)
y_pred_bag = bag_model.predict(X_test)
#GRADIENT BOOSTING
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
estimators = [   #STACKING
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('dt', DecisionTreeClassifier())]
stack_model = StackingClassifier(estimators=estimators,
                                  final_estimator=LogisticRegression())
stack_model.fit(X_train, y_train)
y_pred_stack = stack_model.predict(X_test)'''



Decision Tree - Accuracy: 0.78,Precision: 0.76, Recall: 0.68


"rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\ny_pred_rf = rf_model.predict(X_test)\n#BAGGING CLASSIFIER\nbag_model = BaggingClassifier(estimator=DecisionTreeClassifier(),\n                               n_estimators=50, random_state=42)\nbag_model.fit(X_train, y_train)\ny_pred_bag = bag_model.predict(X_test)\n#GRADIENT BOOSTING\ngb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)\ngb_model.fit(X_train, y_train)\ny_pred_gb = gb_model.predict(X_test)\nestimators = [   #STACKING\n    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),\n    ('dt', DecisionTreeClassifier())]\nstack_model = StackingClassifier(estimators=estimators,\n                                  final_estimator=LogisticRegression())\nstack_model.fit(X_train, y_train)\ny_pred_stack = stack_model.predict(X_test)"

Random Forest Classifier

In [4]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
print(f"Random Forest - Accuracy: {accuracy_rf:.2f}, Precision: {precision_rf:.2f}, Recall: {recall_rf:.2f}")


Random Forest - Accuracy: 0.79, Precision: 0.78, Recall: 0.70


Bagging Classifier

In [5]:
# Initialize and train Bagging Classifier
bag_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=42)
bag_model.fit(X_train, y_train)
y_pred_bag = bag_model.predict(X_test)

accuracy_bag = accuracy_score(y_test, y_pred_bag)
precision_bag = precision_score(y_test, y_pred_bag)
recall_bag = recall_score(y_test, y_pred_bag)

print(f"Bagging - Accuracy: {accuracy_bag:.2f}, Precision: {precision_bag:.2f}, Recall: {recall_bag:.2f}")


Bagging - Accuracy: 0.79, Precision: 0.76, Recall: 0.70


Boosting (e.g., AdaBoost)

In [6]:
ada_model = AdaBoostClassifier(n_estimators=50, random_state=42)
ada_model.fit(X_train, y_train)
y_pred_ada = ada_model.predict(X_test)

accuracy_ada = accuracy_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)

print(f"AdaBoost - Accuracy: {accuracy_ada:.2f}, Precision: {precision_ada:.2f}, Recall: {recall_ada:.2f}")


AdaBoost - Accuracy: 0.80, Precision: 0.78, Recall: 0.73




Gradient Boosting

In [7]:
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)

print(f"Gradient Boosting - Accuracy: {accuracy_gb:.2f}, Precision: {precision_gb:.2f}, Recall: {recall_gb:.2f}")


Gradient Boosting - Accuracy: 0.81, Precision: 0.83, Recall: 0.68


Stacking Classifier

In [8]:
estimators = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('dt', DecisionTreeClassifier())
]
stack_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stack_model.fit(X_train, y_train)
y_pred_stack = stack_model.predict(X_test)

accuracy_stack = accuracy_score(y_test, y_pred_stack)
precision_stack = precision_score(y_test, y_pred_stack)
recall_stack = recall_score(y_test, y_pred_stack)

print(f"Stacking - Accuracy: {accuracy_stack:.2f}, Precision: {precision_stack:.2f}, Recall: {recall_stack:.2f}")


Stacking - Accuracy: 0.79, Precision: 0.78, Recall: 0.70


In [12]:
# Create a DataFrame with the calculated metrics for each model
comparison_table = pd.DataFrame({
    'Model': ['Decision Tree', 'Random Forest', 'Bagging', 'AdaBoost', 'Gradient Boosting', 'Stacking'],
    'Accuracy': [accuracy_dt, accuracy_rf, accuracy_bag, accuracy_ada, accuracy_gb, accuracy_stack],
    'Precision': [precision_dt, precision_rf, precision_bag, precision_ada, precision_gb, precision_stack],
    'Recall': [recall_dt, recall_rf, recall_bag, recall_ada, recall_gb, recall_stack]
})

print(comparison_table)


               Model  Accuracy  Precision    Recall
0      Decision Tree  0.776536   0.757576  0.675676
1      Random Forest  0.793296   0.776119  0.702703
2            Bagging  0.787709   0.764706  0.702703
3           AdaBoost  0.804469   0.782609  0.729730
4  Gradient Boosting  0.810056   0.833333  0.675676
5           Stacking  0.793296   0.776119  0.702703
