In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pandas_profiling as pp


warnings.filterwarnings('ignore')
%matplotlib inline


print(os.curdir)
print(os.listdir("input"))


.
['test.csv', 'train.csv', 'gender_submission.csv']


In [2]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input//test.csv')
data = pd.concat([train, test], sort=False)
data


# pclass： 旅客クラス（1＝1等、2＝2等、3＝3等）。裕福さの目安となる
# name： 乗客の名前
# sex： 性別（male＝男性、female＝女性）
# age： 年齢。一部の乳児は小数値
# sibsp： タイタニック号に同乗している兄弟（Siblings）や配偶者（Spouses）の数
# parch： タイタニック号に同乗している親（Parents）や子供（Children）の数
# ticket： チケット番号
# fare： 旅客運賃
# cabin： 客室番号
# embarked： 出港地（C＝Cherbourg：シェルブール、Q＝Queenstown：クイーンズタウン、S＝Southampton：サウサンプトン）
# boat： 救命ボート番号
# body： 遺体収容時の識別番号
# home.dest： 自宅または目的地
# survived：生存状況（0＝死亡、1＝生存）。通常はこの数値が目的変数として使われる


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [5]:
data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna('S', inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",0,22.0,0,0,A.5. 3236,8.0500,,0
414,1306,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,1
415,1307,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0
416,1308,,3,"Ware, Mr. Frederick",0,22.0,0,0,359309,8.0500,,0


In [6]:
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

train
# test

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0.0,3,0,22.0,7.2500,0
1,1.0,1,1,38.0,71.2833,1
2,1.0,3,1,26.0,7.9250,0
3,1.0,1,1,35.0,53.1000,0
4,0.0,3,0,35.0,8.0500,0
...,...,...,...,...,...,...
886,0.0,2,0,27.0,13.0000,0
887,1.0,1,1,19.0,30.0000,0
888,0.0,3,1,22.0,23.4500,0
889,1.0,1,0,26.0,30.0000,1


In [8]:
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop(['Survived'], axis=1)


print(X_train.head())
print(y_train.head())
print(X_test.head())

print('---------------------------')
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)
print(X_train.head())
print(X_valid.head())
print(y_train.head())
print(y_valid.head())


   Pclass  Sex   Age     Fare  Embarked
0       3    0  22.0   7.2500         0
1       1    1  38.0  71.2833         1
2       3    1  26.0   7.9250         0
3       1    1  35.0  53.1000         0
4       3    0  35.0   8.0500         0
0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64
   Pclass  Sex   Age     Fare  Embarked
0       3    0  34.5   7.8292         2
1       3    1  47.0   7.0000         0
2       2    0  62.0   9.6875         2
3       3    0  27.0   8.6625         0
4       3    1  22.0  12.2875         0
---------------------------
     Pclass  Sex   Age     Fare  Embarked
231       3    0  29.0   7.7750         0
836       3    0  21.0   8.6625         0
639       3    0  22.0  16.1000         0
389       2    1  17.0  12.0000         1
597       3    0  49.0   0.0000         0
     Pclass  Sex   Age     Fare  Embarked
421       3    0  21.0   7.7333         2
618       2    1   4.0  39.0000         0
116       3    0  70.5   7.7500       

In [11]:
import lightgbm as lgb

categorical_features = ['Embarked', 'Pclass', 'Sex']
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)



params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40,
}

model = lgb.train(params, lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  verbose_eval=10,
                  num_boost_round=1000,
                  early_stopping_rounds=10)
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_rounded = (y_pred > 0.5).astype(int)
y_pred

[LightGBM] [Info] Number of positive: 239, number of negative: 384
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383628 -> initscore=-0.474179
[LightGBM] [Info] Start training from score -0.474179
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.503703	valid_1's binary_logloss: 0.530039
[20]	training's binary_logloss: 0.427898	valid_1's binary_logloss: 0.47452
[30]	training's binary_logloss: 0.382678	valid_1's binary_logloss: 0.452398
[40]	training's binary_logloss: 0.354986	valid_1's binary_logloss: 0.445539
[50]	training's binary_logloss: 0.333408	valid_1's binary_logloss: 0.441553
[60]	training's binary_logloss: 0.310698	valid_1's binary_logloss: 0.436485
[70]	training's binary_logloss: 0.2912

array([0.01688388, 0.44277112, 0.10040569, 0.05633506, 0.4161583 ,
       0.24634888, 0.48421271, 0.08042872, 0.74166364, 0.02012128,
       0.03513836, 0.20114795, 0.96874954, 0.02438937, 0.9703036 ,
       0.88553326, 0.09104074, 0.18067628, 0.53195625, 0.44052293,
       0.36642599, 0.35326115, 0.95830961, 0.48677702, 0.94836621,
       0.0924927 , 0.95907887, 0.18067628, 0.42790454, 0.086052  ,
       0.03184588, 0.07485369, 0.39175753, 0.19498479, 0.70933476,
       0.19226748, 0.26064341, 0.26731961, 0.04670679, 0.43817114,
       0.21273243, 0.63036957, 0.01474056, 0.94297833, 0.97301901,
       0.12702057, 0.38274055, 0.10131681, 0.96743888, 0.59352725,
       0.44358916, 0.22534736, 0.89337762, 0.95898073, 0.35439635,
       0.08638861, 0.01811675, 0.11231344, 0.05961495, 0.96401357,
       0.06591023, 0.19847839, 0.08393832, 0.75054241, 0.59952595,
       0.89142485, 0.7146242 , 0.11049222, 0.5958794 , 0.94873707,
       0.77999614, 0.03513836, 0.43704481, 0.52728969, 0.96570

In [15]:
# output raw
sub = pd.read_csv('input/gender_submission.csv')
sub['Survived'] = y_pred
sub.to_csv('output/submission/submission_32-lightgbm-change-hyper-param_raw.csv', index=False)

# output rounded
sub = pd.read_csv('input/gender_submission.csv')
sub['Survived'] = list(map(int, y_pred_rounded))
sub.to_csv('output/submission/submission_32-lightgbm-change-hyper-param.csv', index=False)