In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Stack the train rows on top of the test rows in a single frame. The original
# row labels are kept, so index values repeat across the two parts.
train_test = pd.concat((train, test), axis='index')

In [4]:
# Columns excluded from the benchmark; listed once so train and test stay in sync.
unused_cols = ['Name', 'Cabin', 'Ticket']

# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
train = train.drop(columns=unused_cols, errors='ignore')
test = test.drop(columns=unused_cols, errors='ignore')

In [5]:
#quickly fill na values
def fill_na_benchmark(df_train, df_test):
    """Impute missing values and ordinal-encode categoricals for a quick baseline.

    Every statistic (most frequent category, category-to-code mapping, numeric
    mean) is learned from ``df_train`` only and then applied to both frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame. A ``'Survived'`` column, if present, is left out of the
        numeric columns that get mean-imputed.
    df_test : pandas.DataFrame
        Test frame. It must contain the object-dtype columns found in
        ``df_train``; numeric columns missing from it are simply skipped.

    Returns
    -------
    tuple
        ``(train_filled, test_filled, to_fill)``. The two frames are processed
        copies of the inputs; ``to_fill`` maps each numeric column name to the
        training mean used to replace its NaNs.

    Notes
    -----
    The input frames are not modified, so the calling cell can be re-run.
    The detected categorical and numeric column indexes are printed.
    """
    # Work on copies: mutating the caller's frames made a second call see
    # already-encoded data (no object columns left) and fail.
    df_train = df_train.copy()
    df_test = df_test.copy()

    numeric_cols = (
        df_train.select_dtypes(exclude=['object'])
        .drop(columns=['Survived'], errors='ignore')  # target may be absent
        .columns
    )
    cat_cols = df_train.select_dtypes(include=['object']).columns
    print(cat_cols)
    print(numeric_cols)

    # The sklearn transformers cannot be fitted on zero columns, so skip the
    # categorical steps entirely when there is nothing to encode.
    if len(cat_cols) > 0:
        # Fill missing categorical values with the most frequent training value.
        imputer = SimpleImputer(strategy='most_frequent')
        imputer.fit(df_train[cat_cols])
        df_train[cat_cols] = imputer.transform(df_train[cat_cols])
        df_test[cat_cols] = imputer.transform(df_test[cat_cols])

        # Ordinal encoder: map each categorical label to a numeric code using
        # the categories seen in the training frame.
        ode = OrdinalEncoder()
        ode.fit(df_train[cat_cols])
        df_train[cat_cols] = ode.transform(df_train[cat_cols])
        df_test[cat_cols] = ode.transform(df_test[cat_cols])

    # Fill missing numeric values with the training mean of each column.
    to_fill = {col: df_train[col].mean() for col in numeric_cols}
    df_train = df_train.fillna(value=to_fill)
    df_test = df_test.fillna(value=to_fill)

    return df_train, df_test, to_fill

In [6]:
# Impute and encode both frames with statistics learned from `train`;
# `to_fill` keeps the per-column training means used for the numeric NaNs.
train, test, to_fill = fill_na_benchmark(train, test)

Index(['Sex', 'Embarked'], dtype='object')
Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


### LightGBM Benchmark

In [7]:
# Prepare the training features and target.
# NOTE(review): X_train still contains 'PassengerId', an identifier rather than
# a real feature. Excluding it would also require excluding it from the frame
# passed to predict(), so it is left unchanged here.
X_train, y_train = train.drop(columns=['Survived']), train['Survived']

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'n_estimators': 200,
    'num_leaves': 100,
    'learning_rate': 0.01,
    'max_bin': 512,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 5,
    'reg_lambda': 10,
    'metric': 'binary_logloss',
}

lgbm = lgb.LGBMClassifier(**params)

# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of fit() was removed in LightGBM 4.0 and
# raises a TypeError there, while the callback works on old and new versions.
# NOTE(review): the evaluation set is the training data itself, so the stopping
# criterion watches training loss only; a held-out split would make it useful.
lgbm.fit(
    X_train,
    y_train,
    categorical_feature=['Sex', 'Embarked'],
    eval_set=[(X_train, y_train)],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)
y_pred = lgbm.predict(test)

[1]	training's binary_logloss: 0.662757
[2]	training's binary_logloss: 0.659737
[3]	training's binary_logloss: 0.656931
[4]	training's binary_logloss: 0.653793
[5]	training's binary_logloss: 0.65068
[6]	training's binary_logloss: 0.647854
[7]	training's binary_logloss: 0.64484
[8]	training's binary_logloss: 0.643536
[9]	training's binary_logloss: 0.640815
[10]	training's binary_logloss: 0.637913
[11]	training's binary_logloss: 0.63506
[12]	training's binary_logloss: 0.632468
[13]	training's binary_logloss: 0.629708
[14]	training's binary_logloss: 0.627161
[15]	training's binary_logloss: 0.624515
[16]	training's binary_logloss: 0.622218
[17]	training's binary_logloss: 0.619625
[18]	training's binary_logloss: 0.617075
[19]	training's binary_logloss: 0.614885
[20]	training's binary_logloss: 0.612406
[21]	training's binary_logloss: 0.611348
[22]	training's binary_logloss: 0.609143
[23]	training's binary_logloss: 0.60675
[24]	training's binary_logloss: 0.604395
[25]	training's binary_loglos



In [8]:
# Build the submission table: passenger ids next to the predicted labels,
# written without the row index.
sub = test['PassengerId'].to_frame().assign(Survived=y_pred)
sub.to_csv('Submission_benchmark.csv', index=False)

### Score: 0.77033

### Try Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed so both the forest and the sampled hyper-parameter candidates are
# reproducible after a kernel restart; without it every run yields a
# different submission.
RF_SEED = 42

rf = RandomForestClassifier(random_state=RF_SEED)

# Candidate values sampled by the randomized search.
param_grid_rf = {
    'n_estimators': np.arange(50, 500, 50),
    'max_depth': np.arange(4, 14, 2),
    'min_samples_split': np.arange(2, 20),
    'max_features': [0.6, 0.7, 0.8, 0.9],
}

rf_rand = RandomizedSearchCV(
    rf,
    param_distributions=param_grid_rf,
    n_jobs=3,
    cv=5,
    random_state=RF_SEED,
)
rf_rand.fit(X_train, y_train)
# With the default refit=True, predict() uses the best candidate refitted on
# the full training data.
y_pred = rf_rand.predict(test)

# Build the submission table: passenger ids next to the predicted labels.
sub = pd.DataFrame(test['PassengerId'])
sub['Survived'] = y_pred
sub.to_csv('Submission_benchmark_rf.csv', index=False)