In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
titanic = pd.read_csv('https://raw.githubusercontent.com/dhminh1024/practice_datasets/master/titanic.csv')

# Data manipulation
titanic.fillna(titanic['Age'].mean(), inplace=True)
titanic.replace({'Sex':{'male':0, 'female':1}}, inplace=True)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], inplace=True)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,FamilySize
0,0,3,0,22.0,2
1,1,1,1,38.0,2
2,1,3,1,26.0,1
3,1,1,1,35.0,2
4,0,3,0,35.0,1


In [3]:
X = titanic[['Pclass', 'Sex', 'Age', 'FamilySize']].values
y = titanic[['Survived']].values

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('Training set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)

Training set: (801, 4) (801, 1)
Test set: (90, 4) (90, 1)


In [5]:
def BlendingClassifier(base_classifiers, X, y, test):
    x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    val_pred = []
    test_pred = []
    
    for model in base_classifiers:
        model[1].fit(x_train, y_train)
        val_pred.append(model[1].predict(x_val))
        test_pred.append(model[1].predict(test))
        print("Training ", model[1], "done!!!")
        print("<+++======================+++>")
        
    x_val_pred = [[] for i in range(np.array(val_pred).shape[1])]
    for i in range(np.array(val_pred).shape[1]):
        for j in range(len(val_pred)):
            x_val_pred[i].append(val_pred[j][i])
            
    x_test_pred = [[] for i in range(np.array(test_pred).shape[1])]
    for i in range(np.array(test_pred).shape[1]):
        for j in range(len(test_pred)):
            x_test_pred[i].append(test_pred[j][i])
            
    return x_val_pred, x_test_pred, y_val

In [6]:
base_classifiers = [('Decision Tree', DecisionTreeClassifier()),
                    ('KNN', KNeighborsClassifier()),
                    ('Naive Bayes', GaussianNB()),
                    ('Logistic Regression', LogisticRegression()),
                    ('Random Forest', RandomForestClassifier()),
                    ('AdaBoost Classifier', AdaBoostClassifier()),
                    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
                    ('Extra Trees Classifier', ExtraTreesClassifier())]
# ('SVM', SVC(kernel='linear', C=1e3)),

In [7]:
x_val_pred, x_test_pred, y_val = BlendingClassifier(base_classifiers, x_train, y_train, x_test)

model = ExtraTreesClassifier()
model.fit(x_val_pred, y_val)
rel = model.predict(x_test_pred)

Training  DecisionTreeClassifier() done!!!
Training  KNeighborsClassifier() done!!!
Training  GaussianNB() done!!!
Training  LogisticRegression() done!!!
Training  RandomForestClassifier() done!!!
Training  AdaBoostClassifier() done!!!
Training  GradientBoostingClassifier() done!!!
Training  ExtraTreesClassifier() done!!!


In [8]:
print("Accuracy score: ", accuracy_score(y_test, rel))
print("F1-score: ", f1_score(y_test, rel))
print("Recall-score: ", recall_score(y_test,rel))

Accuracy score:  0.8666666666666667
F1-score:  0.8378378378378377
Recall-score:  0.8611111111111112
