In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import gc

# 分布確認
#import pandas_profiling as pdp

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder

# モデリング
from sklearn.model_selection import train_test_split,KFold,StratifiedGroupKFold
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

# matplotlib で日本語表示したい場合
!pip install japanize-matplotlib
import japanize_matplotlib
%matplotlib inline



DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [2]:
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")
display(df_train.head())
display(df_test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
display(df_train.shape)
print("レコード数： ",len(df_train))
print("カラム数： ",len(df_train.columns))

(891, 12)

レコード数：  891
カラム数：  12


In [4]:
print(df_train.info())
print(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [5]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## データセットの作成

In [6]:
x_train,y_train,id_train = df_train[["Pclass","Fare"]],df_train[["Survived"]],df_train[["PassengerId"]]
display(x_train.shape,y_train.shape,id_train.shape)
display(x_train.head())
display(y_train.head())
display(id_train.head())


(891, 2)

(891, 1)

(891, 1)

Unnamed: 0,Pclass,Fare
0,3,7.25
1,1,71.2833
2,3,7.925
3,1,53.1
4,3,8.05


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


Unnamed: 0,PassengerId
0,1
1,2
2,3
3,4
4,5


## ホールドアウト検証の実行

In [9]:
x_tr,x_va,y_tr,y_va = train_test_split(x_train,y_train,test_size=0.2,shuffle=True,stratify=y_train,random_state=123)

print(x_tr.shape,y_tr.shape)
print(x_va.shape,y_va.shape)
print("y_train:{:.3f},y_tr:{:.3f},y_va:{:.3f}".format(y_train["Survived"].mean(),y_tr["Survived"].mean(),y_va["Survived"].mean()))

(712, 2) (712, 1)
(179, 2) (179, 1)
y_train:0.384,y_tr:0.383,y_va:0.385


## クロスバリデーションの実行

In [10]:
n_split = 5
cv = list(StratifiedGroupKFold(n_splits=n_split,shuffle=True,random_state=123).split(x_train,y_train))
#print(cv)
for nfold in np.arange(n_split):
    print("-"*20,nfold,"-"*20)
    idx_tr,idx_va = cv[nfold][0],cv[nfold][1]
    x_tr,y_tr = x_train.loc[idx_tr,:],y_train.loc[idx_tr,:]
    x_va,y_va = x_train.loc[idx_va,:],y_train.loc[idx_va,:]
    print(x_tr.shape,y_tr.shape)
    print(x_va.shape,y_va.shape)
    print("y_train:{:.3f},y_tr:{:.3f},y_va:{:.3f}".format(y_train["Survived"].mean(),y_tr["Survived"].mean(),y_va["Survived"].mean(),))

-------------------- 0 --------------------
(891, 2) (891, 1)
(0, 2) (0, 1)
y_train:0.384,y_tr:0.384,y_va:nan
-------------------- 1 --------------------
(891, 2) (891, 1)
(0, 2) (0, 1)
y_train:0.384,y_tr:0.384,y_va:nan
-------------------- 2 --------------------
(890, 2) (890, 1)
(1, 2) (1, 1)
y_train:0.384,y_tr:0.384,y_va:0.000
-------------------- 3 --------------------
(891, 2) (891, 1)
(0, 2) (0, 1)
y_train:0.384,y_tr:0.384,y_va:nan
-------------------- 4 --------------------
(891, 2) (891, 1)
(0, 2) (0, 1)
y_train:0.384,y_tr:0.384,y_va:nan


## モデル学習

In [None]:
# ハイパーパラメータ
