In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [29]:
titanic = pd.read_csv('train.csv')
# Data manipulation
titanic.fillna(titanic['Age'].mean(), inplace=True)
titanic.replace({'Sex':{'male':0, 'female':1}}, inplace=True)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], inplace=True)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,FamilySize
0,0,3,0,22.0,2
1,1,1,1,38.0,2
2,1,3,1,26.0,1
3,1,1,1,35.0,2
4,0,3,0,35.0,1


In [30]:
X = titanic[['Pclass', 'Sex', 'Age', 'FamilySize']].values
y = titanic[['Survived']].values

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('Training set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)

Training set: (801, 4) (801, 1)
Test set: (90, 4) (90, 1)


In [31]:
x_test_ex = pd.read_csv('test.csv')
x_test_ex.fillna(x_test_ex['Age'].mean(), inplace=True)
x_test_ex.replace({'Sex':{'male':0, 'female':1}}, inplace=True)
x_test_ex['FamilySize'] = x_test_ex['SibSp'] + x_test_ex['Parch'] + 1
x_test_ex.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], inplace=True)
x_test_ex.head()

Unnamed: 0,Pclass,Sex,Age,FamilySize
0,3,0,34.5,1
1,3,1,47.0,2
2,2,0,62.0,1
3,3,0,27.0,1
4,3,1,22.0,3


In [11]:
ha = AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth= 5))
ha.fit(x_train, y_train)
accuracy_score(ha.predict(x_test))

0.8333333333333334

In [18]:
import xgboost as xgb
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model.fit(x_train, y_train)
model.score(x_test,y_test)



0.8222222222222222

In [19]:
he = RandomForestClassifier(n_estimators= 200, min_samples_leaf= 3, max_features= 0.5, n_jobs= -1)
he.fit(x_train, y_train)
model.score(x_test,y_test)

0.8222222222222222

In [23]:
base_classifiers = [('Decision Tree', DecisionTreeClassifier()),
                    ('XGBoost', xgb.XGBClassifier(random_state=1,learning_rate=0.01)),
                    ('Logistic Regression', LogisticRegression()),
                    ('Random Forest', RandomForestClassifier(n_estimators= 200, min_samples_leaf= 3, max_features= 0.5, n_jobs= -1)),
                    
                    ('Gradient Boosting Classifier', GradientBoostingClassifier())]


In [20]:
def bootstrampping(X, y, num_of_sample= 100, num_of_element= 50):
    x_sample = []
    y_sample = []
    for i in range(num_of_sample):
        ha = np.random.randint(0, X.shape[0], num_of_element)
        x_sample.append(X[ha])
        y_sample.append(y[ha])
    return x_sample, y_sample

In [21]:
def voting_quantity(lis):
    return max(set(lis), key=list(lis).count)

In [22]:
def training(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold, random_state=42, shuffle= True)
    train_pred= [0 for i in range(train.shape[0])]
    test_pred = []
    
    for i_train, i_val in folds.split(train,y):
        x_train, y_train, y_train, y_val = [], [], [], []
    
        x_train, x_val = train[i_train], train[i_val]
        y_train, y_val = y[i_train], y[i_val]

        model.fit(X=x_train, y=y_train)
        
        # Add to train_pred
        pre = model.predict(x_val)
        for i,local in enumerate(i_val):
            train_pred[local] = pre[i]
            
        # Add to test_pred
        test_pred.append(model.predict(test))
    
    # Voting result
    lis = [[] for i in range(test.shape[0])]
    for i in range(test.shape[0]):
        for j in range(len(test_pred)):
            lis[i].append(test_pred[j][i])
            
    rel = []
    for i in range(test.shape[0]):
        rel.append(voting_quantity(lis[i]))
    return rel, train_pred

In [8]:
base_classifiers = [('Decision Tree', DecisionTreeClassifier()),
                    ('KNN', KNeighborsClassifier()),
                    ('Naive Bayes', GaussianNB()),
                    ('Logistic Regression', LogisticRegression()),
                    ('Random Forest', RandomForestClassifier()),
                    ('AdaBoost Classifier', AdaBoostClassifier()),
                    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
                    ('Extra Trees Classifier', ExtraTreesClassifier())]

#                     ('SVM', SVC(kernel='linear', C=1e3)),

In [24]:
def Stacking(base_classifiers, n_fold, x_train, x_test,y_train):
    test_pred ,train_pred = [0 for i in range(len(base_classifiers))], [0 for i in range(len(base_classifiers))]

    for i, model in enumerate(base_classifiers):
        test_pred[i], train_pred[i] = training(model= model[1],n_fold=n_fold, train=x_train,test=x_test,y=y_train)
        print("Training ", model[1], "done!!!")
        print("<+++======================+++>")
        
    x_test_pred = [[] for i in range(np.array(test_pred).shape[1])]
    for i in range(np.array(test_pred).shape[1]):
        for j in range(len(test_pred)):
            x_test_pred[i].append(test_pred[j][i])
            
    x_train_pred = [[] for i in range(np.array(train_pred).shape[1])]
    for i in range(np.array(train_pred).shape[1]):
        for j in range(len(train_pred)):
            x_train_pred[i].append(train_pred[j][i])
            
    return x_test_pred, x_train_pred

In [32]:
x_test_pred, x_train_pred = Stacking(base_classifiers,10,X, x_test_ex,y)



Training  DecisionTreeClassifier() done!!!


ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3'] ['Pclass', 'Sex', 'Age', 'FamilySize']
expected f3, f1, f2, f0 in input data
training data did not have the following fields: Sex, Pclass, Age, FamilySize

In [27]:
model = AdaBoostClassifier()
model.fit(x_train_pred, y_train)
model.score(x_test_pred, y_test)

0.8333333333333334

(418,)

In [70]:
def MetaBaggingClass(model, X, y, test, num_of_sample= 128, num_of_element= 50):
    test_pred = []
    print("Training")
    print("<+++======================+++>")
    # Bootsrampping data
    x_sample, y_sample = bootstrampping(X, y, num_of_sample= num_of_sample, num_of_element= num_of_element)
    
    for i in range(len(x_sample)):
        model.fit(x_sample[i], y_sample[i])
        test_pred.append(model.predict(test))
        
    # Voting result
    lis = [[] for i in range(test.shape[0])]
    for i in range(test.shape[0]):
        for j in range(len(test_pred)):
            lis[i].append(test_pred[j][i])
           
    rel = []
    for i in range(test.shape[0]):
        rel.append(voting_quantity(lis[i]))
        
    print("Training model done!!!")
    
    return rel

In [76]:
# num_of_sample vài trăm càng tốt, num_of_element thường = bộ số lượng data train
rel = MetaBaggingClass(SVC(kernel='linear'), x_train, y_train, x_test)
print("Accuracy score: ", accuracy_score(y_test, np.array(rel)))
print("F1-score: ", f1_score(y_test, np.array(rel)))
print("Recall-score: ", recall_score(y_test,np.array(rel)))

Training
Training model done!!!
Accuracy score:  0.8111111111111111
F1-score:  0.7671232876712328
Recall-score:  0.7777777777777778


In [77]:
rely = MetaBaggingClass(SVC(kernel='linear'), X, y, x_test_ex)

Training
Training model done!!!


In [17]:
len(rely)

NameError: name 'rely' is not defined

In [79]:
cv = pd.read_csv('test.csv')

In [80]:
cv['PassengerId']

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [81]:
xx = pd.DataFrame({'PassengerId': cv['PassengerId'],'Survived': rely})

In [82]:
compression_opts = dict(method='zip',
                        archive_name='out123.csv')  
xx.to_csv('he.zip', index=False,
          compression=compression_opts)  