In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Titanic passenger table from the practice-datasets repository.
titanic = pd.read_csv('https://raw.githubusercontent.com/dhminh1024/practice_datasets/master/titanic.csv')

# Data manipulation
# Impute missing ages with the mean age. Only the Age column is filled: a
# frame-wide fillna would also write the numeric age mean into any other
# column that happens to contain missing values.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
# Encode sex as an integer (male -> 0, female -> 1). Any other value would
# become NaN; the data is assumed to contain only these two labels.
titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
# Family size = siblings/spouses + parents/children + the passenger themself.
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
# Keep only the modelling columns; drop() returns a new frame, so re-running
# this cell never operates on a half-mutated table.
titanic = titanic.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,FamilySize
0,0,3,0,22.0,2
1,1,1,1,38.0,2
2,1,3,1,26.0,1
3,1,1,1,35.0,2
4,0,3,0,35.0,1


In [3]:
# Feature matrix: passenger class, encoded sex, age and family size.
X = titanic.loc[:, ['Pclass', 'Sex', 'Age', 'FamilySize']].to_numpy()
# Target kept as a single-column 2-D array (one row per passenger).
y = titanic.loc[:, ['Survived']].to_numpy()

# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

print(f'Training set: {x_train.shape} {y_train.shape}')
print(f'Test set: {x_test.shape} {y_test.shape}')

Training set: (801, 4) (801, 1)
Test set: (90, 4) (90, 1)


In [4]:
def bootstrampping(X, y, num_of_sample= 100, num_of_element= 50, random_state=None):
    """Draw bootstrap resamples (rows sampled with replacement) from a dataset.

    Parameters
    ----------
    X : array-like
        Observations; the first axis indexes rows.
    y : array-like
        Targets with the same number of rows as ``X``.
    num_of_sample : int, default 100
        Number of resamples to draw.
    num_of_element : int, default 50
        Number of rows in each resample.
    random_state : int or None, default None
        Seed for a private random generator, giving reproducible resamples.
        With ``None`` the global ``np.random`` state is used, so the draws can
        still be controlled with ``np.random.seed``.

    Returns
    -------
    x_sample, y_sample : list of ndarray
        Two lists of length ``num_of_sample``; ``x_sample[k]`` and
        ``y_sample[k]`` hold the same randomly chosen rows of ``X`` and ``y``.

    Raises
    ------
    ValueError
        If ``X`` has no rows or ``X`` and ``y`` have different row counts.
    """
    # Coerce so that lists and DataFrames can be row-indexed with an index array.
    X = np.asarray(X)
    y = np.asarray(y)

    n_rows = X.shape[0]
    if n_rows == 0:
        raise ValueError("cannot bootstrap from an empty dataset")
    if y.shape[0] != n_rows:
        raise ValueError(
            "X and y must have the same number of rows, got %d and %d"
            % (n_rows, y.shape[0])
        )

    # The np.random module and a RandomState instance both expose randint().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    x_sample = []
    y_sample = []
    for _ in range(num_of_sample):
        # Row indices drawn uniformly from [0, n_rows), with replacement.
        picked = rng.randint(0, n_rows, num_of_element)
        x_sample.append(X[picked])
        y_sample.append(y[picked])
    return x_sample, y_sample

In [5]:
def voting_quantity(lis):
    """Return the element that occurs most often in ``lis`` (majority vote).

    Elements must be hashable. When several elements share the highest count,
    the one met first while iterating over the set of distinct elements wins.
    An empty ``lis`` raises ``ValueError`` (from ``max``).
    """
    candidates = set(lis)
    occurrences = list(lis).count
    return max(candidates, key=occurrences)

In [6]:
def MetaBaggingClass(model, X, y, test, num_of_sample= 100, num_of_element= 50):
    """Bagging classifier: fit ``model`` on bootstrap resamples and majority-vote.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same object is
        refit on every resample, so after the call it holds the fit from the
        last resample.
    X, y : array-like
        Training observations and their labels. ``y`` may be 1-D or a
        single-column 2-D array.
    test : array-like
        Observations to predict.
    num_of_sample : int, default 100
        Number of bootstrap resamples, i.e. number of votes per test row.
    num_of_element : int, default 50
        Number of rows in each resample.

    Returns
    -------
    list
        One predicted label per row of ``test``: the label predicted most
        often across the resample fits.

    Raises
    ------
    ValueError
        If ``num_of_sample`` is smaller than 1 (there would be no votes).
    """
    if num_of_sample < 1:
        raise ValueError("num_of_sample must be at least 1")

    print("Training")
    print("<+++======================+++>")
    # Bootstrapping data
    x_sample, y_sample = bootstrampping(X, y, num_of_sample= num_of_sample, num_of_element= num_of_element)

    test_pred = []
    for features, labels in zip(x_sample, y_sample):
        labels = np.asarray(labels)
        # Estimators expect 1-D labels; flatten an (n, 1) column vector rather
        # than relying on the estimator to convert it (and warn about it).
        if labels.ndim == 2 and labels.shape[1] == 1:
            labels = labels.ravel()
        model.fit(features, labels)
        # Predict right away: the next fit overwrites this model state.
        test_pred.append(model.predict(test))

    # Voting result: zip(*...) regroups the per-model prediction arrays into
    # one tuple of votes per test row.
    rel = [voting_quantity(list(votes)) for votes in zip(*test_pred)]

    print("Training model done!!!")

    return rel

In [7]:
# A few hundred resamples (num_of_sample) generally works better; num_of_element
# is usually set to the number of rows in the training data.
# The bootstrap indices are drawn from NumPy's global generator, so seed it to
# make the ensemble (and the scores reported below) reproducible on re-run.
np.random.seed(42)
rel = MetaBaggingClass(LogisticRegression(), x_train, y_train, x_test)

Training
Training model done!!!


In [8]:
# Build the prediction array once and score it against the held-out labels.
predictions = np.array(rel)

accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
recall = recall_score(y_test, predictions)

print("Accuracy score: ", accuracy)
print("F1-score: ", f1)
print("Recall-score: ", recall)

Accuracy score:  0.8888888888888888
F1-score:  0.8571428571428571
Recall-score:  0.8333333333333334
