In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import skew

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from mlxtend.classifier import StackingClassifier

In [2]:
# Read the training and test CSV files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The target column is taken from the training file.
y = train['Survived']

# Stack the columns from 'Pclass' onwards of both files (training rows first)
# so the preprocessing below is applied to them together. The original row
# labels are kept, so label values repeat between the two parts.
feature_frames = [frame.loc[:, 'Pclass':] for frame in (train, test)]
all_data = pd.concat(feature_frames)

In [3]:
# Number of missing entries in each column of the combined frame.
all_data.isnull().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [4]:
# Pclass (although numeric) -> dummies
# Name -> *convert to titles*, then dummies
# Age -> fill with the mean per group
# Sex -> dummies
# SibSp and Parch -> combine into family size
# Ticket -> keep only the letters
# Fare -> fill nulls with the mean
# Cabin -> convert to deck, fill with the mode (U), then dummies
# Embarked -> fill nulls with the mode, then dummies

In [5]:
# Broader title groups and the raw honorifics that belong to each of them.
title_groups = {
    'Officer': ('Capt', 'Col', 'Major', 'Dr', 'Rev'),
    'Royalty': ('Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'),
    'Mrs':     ('Mme', 'Ms', 'Mrs'),
    'Miss':    ('Mlle', 'Miss'),
    'Mr':      ('Mr',),
    'Master':  ('Master',),
}
# Invert the grouping into a flat honorific -> group lookup.
title_map = {
    honorific: group
    for group, honorifics in title_groups.items()
    for honorific in honorifics
}


def _extract_title(name):
    """Return the stripped text after the first comma of ``name``, cut at the first period."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Honorifics that are not keys of title_map become NaN in the second map.
raw_titles = all_data['Name'].map(_extract_title)
all_data['Title'] = raw_titles.map(title_map)

# One-hot encode the grouped title (first level dropped), append the new
# columns and discard 'Name'; the 'Title' column itself stays in the frame.
titles_dummies = pd.get_dummies(all_data['Title'], prefix='Title', drop_first=True)
all_data = pd.concat([all_data, titles_dummies], axis=1).drop(columns='Name')

In [6]:
# Replacement ages keyed by (Sex, Pclass, Title).
_AGE_BY_GROUP = {
    ('female', 1, 'Miss'): 30,
    ('female', 1, 'Mrs'): 45,
    ('female', 1, 'Officer'): 49,
    ('female', 1, 'Royalty'): 39,
    ('female', 2, 'Miss'): 20,
    ('female', 2, 'Mrs'): 30,
    ('female', 3, 'Miss'): 18,
    ('female', 3, 'Mrs'): 31,
    ('male', 1, 'Master'): 6,
    ('male', 1, 'Mr'): 41.5,
    ('male', 1, 'Officer'): 52,
    ('male', 1, 'Royalty'): 40,
    ('male', 2, 'Master'): 2,
    ('male', 2, 'Mr'): 30,
    ('male', 2, 'Officer'): 41.5,
    ('male', 3, 'Master'): 6,
    ('male', 3, 'Mr'): 26,
}


def fillAges(row):
    """Look up a replacement age for one passenger record.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Sex']``, ``row['Pclass']`` and ``row['Title']``.

    Returns
    -------
    int, float or None
        The age listed for the (Sex, Pclass, Title) combination, or ``None``
        when the combination is not listed.
    """
    group = (row['Sex'], row['Pclass'], row['Title'])
    return _AGE_BY_GROUP.get(group)
# Median of the recorded ages; used only when fillAges has no entry for a row.
# Without it, an unlisted (Sex, Pclass, Title) combination would leave the age
# missing and the gap would reach the scaling and model-fitting steps.
age_fallback = all_data['Age'].median()


def _age_or_estimate(passenger):
    """Return the recorded age, else the fillAges value, else the overall median."""
    recorded = passenger['Age']
    if not np.isnan(recorded):
        return recorded
    estimate = fillAges(passenger)
    return age_fallback if estimate is None else estimate


all_data['Age'] = all_data.apply(_age_or_estimate, axis=1)
# 'Title' is not needed after the age lookup, so the column is removed.
all_data = all_data.drop(columns='Title')

In [7]:
# Binary-encode 'Sex': 1 where the value is 'male', 0 for anything else.
all_data['Sex'] = all_data['Sex'].eq('male').astype('int64')

In [8]:
# Family-size features: the size is 'Parch' + 'SibSp' + 1, and three 0/1 flags
# mark which size bucket each row falls into. The two source columns are then
# removed.
family_size = all_data['Parch'] + all_data['SibSp'] + 1
all_data = all_data.assign(
    FamilySize=family_size,
    Singleton=family_size.eq(1).astype('int64'),            # size == 1
    SmallFamily=family_size.between(2, 4).astype('int64'),  # 2 <= size <= 4
    LargeFamily=family_size.ge(5).astype('int64'),          # size >= 5
).drop(columns=['SibSp', 'Parch'])

In [9]:
def _ticket_letters(ticket):
    """Keep only the alphabetic characters of ``ticket``; return 'XXX' if none remain."""
    letters = ''.join(ch for ch in ticket if ch.isalpha())
    return letters if letters else 'XXX'


# Reduce every ticket to its letters, then one-hot encode (first level dropped).
all_data['Ticket'] = all_data['Ticket'].map(_ticket_letters)
tickets_dummies = pd.get_dummies(all_data['Ticket'], prefix='Ticket', drop_first=True)
all_data = pd.concat([all_data, tickets_dummies], axis=1)

# Hand-written lists; both names are reassigned from the data in the next cell.
train_unique_tickets = [
    'CASOTON', 'SP', 'Fa', 'SCOW', 'SC', 'SOP', 'AS', 'SCAHBasle', 'FC', 'SCA',
]
test_unique_tickets = ['LP', 'SC', 'WEP', 'SOTONO', 'STONOQ', 'PP', 'SCParis']

# Display the ten least frequent ticket codes.
ticket_counts = all_data['Ticket'].value_counts()
ticket_counts.sort_values(ascending=True).head(10)

Ticket
LP           1
SOP          1
SCOW         1
AS           1
SCAHBasle    1
Fa           1
CASOTON      1
STONOQ       1
SP           1
SC           2
Name: count, dtype: int64

In [10]:
# all_data was built with the training rows first, followed by the test rows,
# so the first train.shape[0] positions are the training part.
n_train = train.shape[0]
train_tickets_count = all_data.iloc[:n_train]['Ticket'].value_counts()
test_tickets_count = all_data.iloc[n_train:]['Ticket'].value_counts()

# Ticket codes that occur exactly once within their own part.
train_unique_tickets = list(train_tickets_count[train_tickets_count == 1].keys())
test_unique_tickets = list(test_tickets_count[test_tickets_count == 1].keys())

# Dummy columns of codes that are singletons in either part. The dummies were
# created with drop_first=True, so one level has no column; keep only the names
# that actually exist to avoid a KeyError in drop().
unique_tickets = [
    f'Ticket_{ticket}'
    for ticket in set(train_unique_tickets + test_unique_tickets)
    if f'Ticket_{ticket}' in all_data.columns
]

# Remove the raw 'Ticket' column together with the rare dummy columns.
all_data = all_data.drop(columns=['Ticket'] + unique_tickets)

In [11]:
# Replace missing fares with the mean of the recorded fares.
fare_mean = all_data['Fare'].mean()
fare = all_data['Fare']
all_data['Fare'] = fare.where(fare.notna(), fare_mean)

In [12]:
# consider dropping Cabin column
# Missing cabins become 'U'; every cabin is then reduced to its first
# character, one-hot encoded (first level dropped), and the raw column removed.
all_data['Cabin'] = all_data['Cabin'].fillna('U').str[0]
cabin_dummies = pd.get_dummies(all_data['Cabin'], prefix='Cabin', drop_first=True)
all_data = pd.concat([all_data, cabin_dummies], axis=1).drop(columns='Cabin')

In [13]:
# Series.mode() returns a Series (several values can tie), so take its first
# entry to obtain a scalar. Passing the Series itself to fillna aligns on index
# labels, which leaves the missing 'Embarked' values unfilled; they would then
# be encoded as all-zero dummies, i.e. as the dropped first level.
embarked_mode = all_data['Embarked'].mode().iloc[0]
all_data['Embarked'] = all_data['Embarked'].fillna(embarked_mode)

# One-hot encode (first level dropped) and remove the raw column.
embarked_dummies = pd.get_dummies(all_data['Embarked'], prefix='Embarked', drop_first=True)
all_data = pd.concat([all_data, embarked_dummies], axis=1).drop(columns='Embarked')

In [14]:
def _column_skew(column):
    """Skewness of ``column`` computed after removing missing values."""
    return skew(column.dropna())


# Every column whose dtype is not 'object' is treated as numeric.
is_non_object = all_data.dtypes != 'object'
numeric_feats = is_non_object[is_non_object].index

# Keep only the columns whose skewness exceeds 0.75 ...
feature_skew = all_data[numeric_feats].apply(_column_skew)
skewed_feats = feature_skew.index[(feature_skew > 0.75).to_numpy()]

# ... and replace them with log(1 + x).
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [15]:
# Fit the min-max scaler on the combined frame and transform it in one step,
# then rebuild a DataFrame with the same column names (the row index is reset).
scaler = MinMaxScaler()
X = scaler.fit_transform(all_data)
all_data = pd.DataFrame(X, columns=all_data.columns)

In [16]:
# Split the combined frame back by position: the first train.shape[0] rows are
# the training features, the remainder replaces `test` with the test features.
n_train = train.shape[0]
X = all_data.iloc[:n_train]
test = all_data.iloc[n_train:]

# Hold out 25% of the training rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
# model_instances = [
#     (RandomForestClassifier(), 'RandomForestClassifier'),
#     (ExtraTreesClassifier(), 'ExtraTreesClassifier'),
#     (GradientBoostingClassifier(), 'GradientBoostingClassifier'),
#     (LogisticRegression(), 'LogisticRegression'),
#     (DecisionTreeClassifier(), 'DecisionTreeClassifier'),
#     (KNeighborsClassifier(), 'KNeighborsClassifier'),
#     (GaussianNB(), 'GaussianNB'),
#     (Perceptron(), 'Perceptron'),
#     (SGDClassifier(), 'SGDClassifier'),
#     (SVC(), 'SVC'),
#     (LinearSVC(), 'LinearSVC'),
#     (LGBMClassifier(verbose=0), 'LGBMClassifier'),
#     (XGBClassifier(), 'XGBClassifier'),
#     (CatBoostClassifier(verbose=False), 'CatBoostClassifier'), 
# ] 

In [18]:
# results = {
#     'Model':[],
#     'ACC':[]
# }

In [19]:
# for model, model_name in model_instances:
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     results['Model'].append(model_name)
#     results['ACC'].append(accuracy_score(y_test, y_pred))

In [20]:
# results = pd.DataFrame(results)
# results

In [21]:
# results_temp = results.sort_values('ACC', ascending=False)
# results_temp.iloc[:5]['Model']

In [22]:
# Candidate models with fixed hyperparameters, each paired with a display name.
model_instances = [
    (
        LogisticRegression(penalty='l2', solver='liblinear', C=0.2),
        'LogisticRegression',
    ),
    (
        SVC(C=5, degree=2, gamma=0.1, kernel='poly'),
        'SVC',
    ),
    (
        # 'lambda' is a Python keyword, so these parameters are passed as a dict.
        XGBClassifier(**{'eta': 0.1, 'gamma': 0, 'max_depth': 6, 'lambda': 0.1, 'alpha': 10}),
        'XGBClassifier',
    ),
    (
        CatBoostClassifier(rsm=0.1, learning_rate=0.005, iterations=500, l2_leaf_reg=5, verbose=False),
        'CatBoostClassifier',
    ),
]

# Column-oriented record of each model's accuracy on the held-out split.
results = {'Model': [], 'ACC': []}

for model, model_name in model_instances:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    holdout_accuracy = accuracy_score(y_test, y_pred)
    results['Model'].append(model_name)
    results['ACC'].append(holdout_accuracy)

In [23]:
# Turn the collected scores into a table (columns 'Model' and 'ACC') and show it.
results = pd.DataFrame(results, columns=['Model', 'ACC'])
results

Unnamed: 0,Model,ACC
0,LogisticRegression,0.811659
1,SVC,0.834081
2,XGBClassifier,0.825112
3,CatBoostClassifier,0.829596


In [24]:
# First-level models for the stack.
svc = SVC(C=5, degree=2, gamma=0.1, kernel='poly')
# 'lambda' is a Python keyword, so these parameters are passed as a dict.
xgb = XGBClassifier(**{'eta': 0.1, 'gamma': 0, 'max_depth': 6, 'lambda': 0.1, 'alpha': 10})
cat = CatBoostClassifier(rsm=0.1, learning_rate=0.005, iterations=500, l2_leaf_reg=5, verbose=False)
lor = LogisticRegression(penalty='l2', solver='liblinear', C=0.2)

# Second-level model: a separate logistic regression with the same settings as `lor`.
meta_learner = LogisticRegression(penalty='l2', solver='liblinear', C=0.2)

# NOTE(review): use_features_in_secondary=True should, per the mlxtend docs,
# feed the original features to the meta-classifier alongside the first-level
# predictions -- confirm against the installed version.
stack_gen = StackingClassifier(
    classifiers=(svc, xgb, cat, lor),
    meta_classifier=meta_learner,
    use_features_in_secondary=True,
)

# Fit on all training rows (plain arrays) and predict the test rows.
stack_gen.fit(X.values, y.values)
prediction = stack_gen.predict(test.values)

In [25]:
# # Choosing the best parameters
# randomForest = RandomForestClassifier()
# cross_validation = StratifiedKFold(n_splits=5) # n_folds must be chosen carefully
# parameter_grid = {
#                  'max_depth' : [18,19],
#                  'n_estimators': [1000,2000,3000],
#                  'criterion': ['gini','entropy',]
#                  }
# grid_search = GridSearchCV(randomForest,param_grid=parameter_grid,cv=cross_validation)
# grid_search.fit(X_train, y_train)
# print('Best score: {}'.format(grid_search.best_score_))
# print('Best parameters: {}'.format(grid_search.best_params_))

In [26]:
# random_forest = RandomForestClassifier(
#     n_estimators=10000, # tuned
#     criterion='entropy', # tuned
#     max_depth=15, # tuned
#     min_samples_split=27, # tuned
#     max_features=8)

# random_forest.fit(X, y)
# y_pred = random_forest.predict(test)

In [27]:
# Load the submission template (first column as index), replace its 'Survived'
# column with the stacked model's predictions, and write the result to disk.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
sample_submission = sample_submission.assign(Survived=prediction)
sample_submission.to_csv('stack_tunado.csv')