In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import skew

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from mlxtend.regressor import StackingCVRegressor

In [2]:
# Read the competition files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack the predictor columns (everything from 'Pclass' onwards) of both sets
# so every preprocessing step is applied to train and test alike.
# Row labels are not reset here, so labels from `train` and `test` overlap.
feature_frames = [frame.loc[:, 'Pclass':] for frame in (train, test)]
all_data = pd.concat(feature_frames)

# Target column; only present in the training file.
y = train['Survived']

In [3]:
# Summarise the combined feature frame: column dtypes and non-null counts.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1308 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [4]:
# Descriptive statistics for the numeric columns of the combined frame.
all_data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1046.0,1309.0,1309.0,1308.0
mean,2.294882,29.881138,0.498854,0.385027,33.295479
std,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.17,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,39.0,1.0,0.0,31.275
max,3.0,80.0,8.0,9.0,512.3292


In [5]:
# Preprocessing plan
# Pclass (although numeric) -> dummies
# Name -> *convert to titles*, then dummies
# Age -> fill with the mean per group
# Sex -> dummies
# SibSp and Parch -> combine into family size
# Ticket -> keep only the letters
# Fare -> fill nulls with the mean
# Cabin -> convert to deck, fill with the mode (U), then dummies
# Embarked -> fill nulls with the mode, then dummies

In [6]:
def _extract_title(full_name):
    """Return the honorific that sits between the comma and the first period of a name."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


# Raw honorific -> coarse title category.
title_map = {
    # plain civilian titles
    'Mr':           'Mr',
    'Mrs':          'Mrs',
    'Mme':          'Mrs',
    'Ms':           'Mrs',
    'Miss':         'Miss',
    'Mlle':         'Miss',
    'Master':       'Master',
    # military, clergy and doctors
    'Capt':         'Officer',
    'Col':          'Officer',
    'Major':        'Officer',
    'Dr':           'Officer',
    'Rev':          'Officer',
    # nobility
    'Jonkheer':     'Royalty',
    'Don':          'Royalty',
    'Dona':         'Royalty',
    'Sir':          'Royalty',
    'Lady':         'Royalty',
    'the Countess': 'Royalty',
}

# Honorifics that are not in `title_map` become NaN.
all_data['Title'] = all_data['Name'].map(_extract_title).map(title_map)

In [7]:
def fillAges(row):
    """Return a default age for a passenger record.

    The value is chosen from the record's ``Sex``, ``Pclass`` and ``Title``
    entries. ``row`` only needs to support ``row[key]`` lookups for those
    three keys.

    Returns:
        The age associated with the (sex, class, title) combination, or
        ``None`` when the combination is not listed in the table below.
    """
    default_ages = (
        ('female', 1, (('Miss', 30), ('Mrs', 45), ('Officer', 49), ('Royalty', 39))),
        ('female', 2, (('Miss', 20), ('Mrs', 30))),
        ('female', 3, (('Miss', 18), ('Mrs', 31))),
        ('male', 1, (('Master', 6), ('Mr', 41.5), ('Officer', 52), ('Royalty', 40))),
        ('male', 2, (('Master', 2), ('Mr', 30), ('Officer', 41.5))),
        ('male', 3, (('Master', 6), ('Mr', 26))),
    )

    for sex, pclass, ages_by_title in default_ages:
        if row['Sex'] == sex and row['Pclass'] == pclass:
            for title, age in ages_by_title:
                if row['Title'] == title:
                    return age
            # Known sex/class group but no entry for this title.
            return None
    return None
def _age_or_default(passenger):
    """Keep a recorded age; fall back to `fillAges` when the age is NaN."""
    recorded_age = passenger['Age']
    return fillAges(passenger) if np.isnan(recorded_age) else recorded_age


all_data['Age'] = all_data.apply(_age_or_default, axis=1)

In [8]:
# One-hot encode the title category, then discard the text columns it came from.
titles_dummies = pd.get_dummies(all_data['Title'], prefix='Title')
all_data = (
    pd.concat([all_data.drop(columns='Name'), titles_dummies], axis=1)
    .drop(columns='Title')
)

In [9]:
# Binary encoding: 1 for 'male', 0 for every other value.
all_data['Sex'] = all_data['Sex'].eq('male').astype('int64')

In [10]:
# Family size = parents/children + siblings/spouses + the passenger themself.
family_size = all_data['Parch'] + all_data['SibSp'] + 1
all_data['FamilySize'] = family_size

# 0/1 indicator features derived from the family size.
all_data['Singleton'] = family_size.eq(1).astype('int64')
all_data['SmallFamily'] = family_size.between(2, 4).astype('int64')
all_data['LargeFamily'] = family_size.ge(5).astype('int64')

# The two source columns are now summarised by the features above.
all_data = all_data.drop(columns=['SibSp', 'Parch'])

In [11]:
def _ticket_prefix(raw_ticket):
    """Keep only the alphabetic characters of a ticket; 'XXX' when there are none."""
    letters = ''.join(ch for ch in raw_ticket if ch.isalpha())
    return letters or 'XXX'


all_data['Ticket'] = all_data['Ticket'].map(_ticket_prefix)
tickets_dummies = pd.get_dummies(all_data['Ticket'], prefix='Ticket')
all_data = pd.concat([all_data, tickets_dummies], axis=1).drop(columns='Ticket')

In [12]:
# Replace missing fares with the mean fare of the combined frame.
fare = all_data['Fare']
fare_mean = fare.mean()
all_data['Fare'] = fare.where(fare.notna(), fare_mean)

In [13]:
def _deck_letter(cabin_code):
    """Return the first character of a cabin code."""
    return cabin_code[0]


# Missing cabins are labelled 'U' before the first character is taken.
all_data['Cabin'] = all_data['Cabin'].fillna('U').map(_deck_letter)
cabin_dummies = pd.get_dummies(all_data['Cabin'], prefix='Cabin')
all_data = pd.concat([all_data, cabin_dummies], axis=1).drop(columns='Cabin')

In [14]:
# Fill missing embarkation ports with the most frequent port, then one-hot encode.
#
# `Series.mode()` returns a Series (there may be ties), not a scalar. Passing
# that Series to `fillna` aligns it on index labels, so only rows whose label
# matches the mode Series' own index (0, 1, ...) could be filled and the
# other missing ports would stay NaN, giving those rows all-zero dummies.
# Taking the first entry gives the scalar that is actually wanted.
embarked_mode = all_data['Embarked'].mode().iloc[0]
all_data['Embarked'] = all_data['Embarked'].fillna(embarked_mode)

embarked_dummies = pd.get_dummies(all_data['Embarked'], prefix='Embarked')
all_data = pd.concat([all_data, embarked_dummies], axis=1)
all_data = all_data.drop(columns='Embarked')

In [15]:
# Apply log(x + 1) to skewed features to bring their frequency distribution
# closer to normal.

# Every column whose dtype is not 'object' is treated as numeric.
column_dtypes = all_data.dtypes
numeric_feats = column_dtypes[column_dtypes != 'object'].index

# Skewness (asymmetry) per column, ignoring missing values.
skewness = all_data[numeric_feats].apply(lambda column: skew(column.dropna()))

# Keep the columns with skewness above 0.75 (values near zero are close to normal).
skewed_feats = skewness[skewness > 0.75].index

# Transform those columns with log(x + 1).
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [16]:
# Fit the min-max scaler on the combined frame and transform it in one step.
feature_names = all_data.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(all_data)

# Rebuild the DataFrame from the scaled values, keeping the column names.
all_data = pd.DataFrame(X, columns=feature_names)

In [17]:
# The first len(train) rows of the combined frame are the training rows;
# the remainder belong to the test file.
n_train = train.shape[0]
X, test = all_data.iloc[:n_train], all_data.iloc[n_train:]

# Hold out 25% of the training rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [18]:
# Candidate classifiers, built with default settings except for quieter
# logging on LightGBM and CatBoost.
_candidate_estimators = (
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    Perceptron(),
    SGDClassifier(),
    SVC(),
    LinearSVC(),
    LGBMClassifier(verbose=0),
    XGBClassifier(),
    CatBoostClassifier(verbose=False),
)

# (estimator, label) pairs; each label is the estimator's class name.
model_instances = [
    (estimator, type(estimator).__name__) for estimator in _candidate_estimators
]

In [19]:
# Column-oriented accumulator: one model label and one accuracy per evaluated model.
results = {column: [] for column in ('Model', 'ACC')}

In [20]:
# Fit every candidate on the training split and record its hold-out accuracy.
for estimator, label in model_instances:
    estimator.fit(X_train, y_train)
    holdout_pred = estimator.predict(X_test)
    results['Model'].append(label)
    results['ACC'].append(accuracy_score(y_test, holdout_pred))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [21]:
# Turn the accumulated lists into a table (one row per model) and display it.
results = pd.DataFrame.from_dict(results)
results

Unnamed: 0,Model,ACC
0,RandomForestClassifier,0.825112
1,ExtraTreesClassifier,0.825112
2,GradientBoostingClassifier,0.793722
3,LogisticRegression,0.825112
4,DecisionTreeClassifier,0.816143
5,KNeighborsClassifier,0.820628
6,GaussianNB,0.461883
7,Perceptron,0.798206
8,SGDClassifier,0.820628
9,SVC,0.829596


In [22]:
# Rank the models by hold-out accuracy, best first, and show the top five names.
results_temp = results.sort_values(by='ACC', ascending=False)
results_temp['Model'].head(5)

13        CatBoostClassifier
9                        SVC
0     RandomForestClassifier
1       ExtraTreesClassifier
3         LogisticRegression
Name: Model, dtype: object

In [23]:
# Check how the hold-out accuracy of a random forest changes with the number of trees.
for n in (100, 200, 300, 500, 1000):
    forest = RandomForestClassifier(n_estimators=n)
    forest.fit(X_train, y_train)
    print(n, accuracy_score(y_test, forest.predict(X_test)))

100 0.8295964125560538
200 0.8340807174887892
300 0.8295964125560538
500 0.8340807174887892
1000 0.8295964125560538


In [24]:
# Search for the best random-forest hyper-parameters.
randomForest = RandomForestClassifier()

# The number of folds should be chosen carefully.
cross_validation = StratifiedKFold(n_splits=5)

parameter_grid = dict(
    max_depth=[10, 20, 30],
    n_estimators=[100, 300, 500],
    criterion=['gini', 'entropy'],
)

grid_search = GridSearchCV(randomForest, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(X_train, y_train)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.8398720682302772
Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 500}


In [25]:
# Tuned settings for the final model.
tuned_params = {
    'n_estimators': 500,
    'criterion': 'entropy',
    'max_depth': 10,
}
random_forest = RandomForestClassifier(**tuned_params)

# Train on every labelled row, then predict the test rows.
random_forest.fit(X, y)
y_pred = random_forest.predict(test)

In [26]:
# Reuse the sample file's layout (its first column becomes the index),
# overwrite the 'Survived' column with the predictions and save the result.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
sample_submission = sample_submission.assign(Survived=y_pred)
sample_submission.to_csv('random_forest_tunado.csv')