In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from scipy.stats import mode
import time

In [2]:
data = sns.load_dataset('titanic')

In [3]:
df = data.drop(['deck','class','who','adult_male','embark_town','alive','alone'],axis=1)

In [4]:
df['age'] = df['age'].fillna(df.groupby('sex')['age'].transform('mean'))

In [5]:
df['age'] = df['age'].fillna(df.groupby('sex')['age'].transform('mean'))

In [6]:
df = pd.get_dummies(df, columns=['sex','embarked','pclass'], dtype=int)

In [7]:
y = df['survived']
X = df.drop('survived', axis=1)

In [11]:
all_columns = df.columns.to_list()
random_columns = [np.random.choice(all_columns, 3, replace=True).tolist() for i in range (10)]

In [12]:
random_columns

[['pclass_1', 'pclass_3', 'age'],
 ['embarked_Q', 'embarked_C', 'sex_male'],
 ['age', 'age', 'pclass_2'],
 ['survived', 'pclass_3', 'pclass_3'],
 ['embarked_C', 'sex_male', 'pclass_3'],
 ['sex_female', 'pclass_3', 'pclass_1'],
 ['embarked_Q', 'sibsp', 'parch'],
 ['age', 'embarked_S', 'pclass_1'],
 ['sex_female', 'pclass_3', 'survived'],
 ['age', 'age', 'sex_male']]

In [10]:
all_columns

['survived',
 'age',
 'sibsp',
 'parch',
 'fare',
 'sex_female',
 'sex_male',
 'embarked_C',
 'embarked_Q',
 'embarked_S',
 'pclass_1',
 'pclass_2',
 'pclass_3']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
def build_decision_trees(X_train, y_train, num_models=10, num_columns=3):
    all_columns = X_train.columns.to_list()
    models = []
    for i in range(num_models):
        selected_columns = np.random.choice(all_columns, num_columns, replace=True).tolist()
        X_train_subset = X_train[selected_columns]
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train_subset, y_train)
        models.append((tree, selected_columns))
    return models

In [15]:
models = build_decision_trees(X_train, y_train, num_models=10, num_columns=3)

In [16]:
models

[(DecisionTreeClassifier(random_state=42), ['sex_female', 'fare', 'pclass_3']),
 (DecisionTreeClassifier(random_state=42), ['embarked_C', 'age', 'sex_male']),
 (DecisionTreeClassifier(random_state=42), ['pclass_3', 'embarked_C', 'age']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_C', 'pclass_3', 'embarked_C']),
 (DecisionTreeClassifier(random_state=42), ['pclass_1', 'sex_male', 'fare']),
 (DecisionTreeClassifier(random_state=42), ['sibsp', 'age', 'embarked_Q']),
 (DecisionTreeClassifier(random_state=42), ['pclass_1', 'pclass_1', 'parch']),
 (DecisionTreeClassifier(random_state=42), ['pclass_2', 'sex_female', 'age']),
 (DecisionTreeClassifier(random_state=42), ['pclass_3', 'pclass_1', 'age']),
 (DecisionTreeClassifier(random_state=42),
  ['pclass_1', 'embarked_Q', 'sex_male'])]

In [17]:
def measure_accuracy(models, X_train, y_train):
    predictions = []
    for tree, selected_columns in models:
        X_train_subset = X_train[selected_columns]
        pred = tree.predict(X_train_subset)
        predictions.append(pred)
    predictions = np.array(predictions).T
    final_predictions, i = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()
    accuracy = accuracy_score(y_train, final_predictions)
    #print(predictions)
    #print(final_predictions)
    return accuracy

In [18]:
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: survived, dtype: int64

In [19]:
train_accuracy = measure_accuracy(models, X_train, y_train)

In [20]:
train_accuracy

0.875