In [None]:
# Install the japanize_matplotlib package (-q keeps pip's output quiet).
!pip install -q japanize_matplotlib

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import japanize_matplotlib

import warnings
# NOTE(review): this silences every warning, including the pandas
# deprecation / SettingWithCopy warnings that later cells would trigger.
warnings.filterwarnings('ignore')

# Draw figures inline and use the 'bmh' style sheet.
%matplotlib inline
plt.style.use('bmh')

# Default figure size (width, height).
plt.rcParams['figure.figsize'] = 10, 10


In [None]:
# Load the raw train / test CSV files.
df_train = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

# Tagged copies: the raw frames stay untouched. The train_or_test flag lets
# the combined frame be split apart again later, and the test rows receive a
# placeholder Survived value of 9.
train_mid = df_train.assign(train_or_test='train')
test_mid = df_test.assign(train_or_test='test', Survived=9)

# Stack train on top of test and renumber the rows 0..n-1.
df_all = pd.concat([train_mid, test_mid], axis=0, sort=False, ignore_index=True)

In [None]:
# Missing-value count per column in the training data.
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Summary statistics for every training column (numeric and non-numeric).
df_train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [None]:
# Missing-value count per column in the test data.
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# Check missing values in the combined train + test frame
df_all.isnull().sum()

PassengerId         0
Survived            0
Pclass              0
Name                0
Sex                 0
Age               263
SibSp               0
Parch               0
Ticket              0
Fare                1
Cabin            1014
Embarked            2
train_or_test       0
dtype: int64

In [None]:
# Impute the two sparsely-missing columns in one pass:
#   Embarked -> most frequent value, Fare -> median.
fill_values = {
    'Embarked': df_all['Embarked'].mode()[0],
    'Fare': df_all['Fare'].median(),
}
df_all = df_all.fillna(fill_values)
# Confirm that both columns are now complete.
df_all.isnull().sum()

PassengerId         0
Survived            0
Pclass              0
Name                0
Sex                 0
Age               263
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin            1014
Embarked            0
train_or_test       0
dtype: int64

In [None]:
# Experiment (disabled): fill missing Age with 15 for passengers titled "Miss"
# df_all['honorific'] = df_all['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])
# condition = (df_all['honorific'] == 'Miss') & (df_all['Age'].isna())
# df_all.loc[condition, 'Age'] = 15
# print(df_all.isnull().sum())
# Rejected because the choice is arbitrary

In [None]:
# Summary statistics of the combined frame (Survived includes the placeholder 9 for test rows).
df_all.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train_or_test
count,1309.0,1309.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1309.0,295,1309,1309
unique,,,,1307,2,,,,929,,186,3,2
top,,,,"Connolly, Miss. Kate",male,,,,CA. 2343,,C23 C25 C27,S,train
freq,,,,2,843,,,,11,,6,916,891
mean,655.0,3.135218,2.294882,,,29.881138,0.498854,0.385027,,33.281086,,,
std,378.020061,4.038525,0.837836,,,14.413493,1.041658,0.86556,,51.7415,,,
min,1.0,0.0,1.0,,,0.17,0.0,0.0,,0.0,,,
25%,328.0,0.0,2.0,,,21.0,0.0,0.0,,7.8958,,,
50%,655.0,1.0,3.0,,,28.0,0.0,0.0,,14.4542,,,
75%,982.0,9.0,3.0,,,39.0,1.0,0.0,,31.275,,,


In [None]:
# The surname is the part of Name that precedes the first ', '.
df_all['family_name'] = df_all['Name'].str.split(', ').str[0]
df_all.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train_or_test,family_name
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,Allen


In [None]:
# Could family_name serve as a proxy for social class?
df_all['family_name'].value_counts()

family_name
Andersson    11
Sage         11
Goodwin       8
Asplund       8
Davies        7
             ..
Milling       1
Maisner       1
Goncalves     1
Campbell      1
Saether       1
Name: count, Length: 875, dtype: int64

In [None]:
# FamilySize = Parch + SibSp + 1.
df_all['FamilySize'] = df_all['SibSp'] + df_all['Parch'] + 1

# Discretise FamilySize into four groups; anything not matched by one of the
# three ranges below (i.e. sizes above 7) falls through to 'big'.
family_size = df_all['FamilySize']
df_all['FamilySize_bin'] = np.select(
    [family_size == 1, family_size.between(2, 4), family_size.between(5, 7)],
    ['alone', 'small', 'mediam'],
    default='big',
)

In [None]:
# Extract the honorific (title): the text between ', ' and '. ' in Name.
# Rare titles are merged into 'Rare'; 'Mlle' and 'Ms' are folded into 'Miss'.
title_aliases = {
    'Col': 'Rare', 'Dr': 'Rare', 'Rev': 'Rare',
    'Mlle': 'Miss', 'Ms': 'Miss',
}
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on a column selection operates on an intermediate
# object and does not reliably update df_all (it is a no-op under pandas
# Copy-on-Write).
df_all['honorific'] = (
    df_all['Name']
    .map(lambda x: x.split(', ')[1].split('. ')[0])
    .replace(title_aliases)
)

In [None]:
# First character of Cabin. A missing Cabin is NaN, and str(NaN)[0] is 'n',
# so 'n' acts as the "unknown cabin" category.
# The rare initials 'G' and 'T' are merged into 'Rare'. The result is assigned
# back instead of using replace(inplace=True) on a column selection, which
# does not reliably update df_all (no-op under pandas Copy-on-Write).
df_all['Cabin_ini'] = (
    df_all['Cabin']
    .map(lambda x: str(x)[0])
    .replace(['G', 'T'], 'Rare')
)

In [None]:
# Ticket frequency: how many rows (train + test) share each ticket number.
ticket_counts = df_all['Ticket'].value_counts()
df_all['TicketFreq'] = df_all['Ticket'].map(ticket_counts)

In [None]:
# Split Fare into three intervals using the edges -1, 15, 60, 600
# (pd.cut's default makes each interval right-closed, e.g. (-1, 15]).
bins = [-1, 15, 60, 600]
df_all['Fare_bin'] = pd.cut(df_all['Fare'], bins=bins)

In [None]:
# First character of Cabin (this cell repeats the Cabin_ini derivation that
# already appears earlier in the notebook; re-running it is harmless).
# A missing Cabin is NaN, and str(NaN)[0] is 'n', so 'n' marks "unknown".
# The rare initials 'G' and 'T' are merged into 'Rare'. The result is assigned
# back instead of using replace(inplace=True) on a column selection, which
# does not reliably update df_all (no-op under pandas Copy-on-Write).
df_all['Cabin_ini'] = (
    df_all['Cabin']
    .map(lambda x: str(x)[0])
    .replace(['G', 'T'], 'Rare')
)

In [None]:
# Split Age into three intervals using the edges 0, 15, 60, 80.
# Rows with a missing Age get a missing Age_bins as well.
bins = [0, 15, 60, 80]
df_all['Age_bins'] = pd.cut(df_all['Age'], bins=bins)

In [None]:
# Remove the columns that are no longer needed.
obsolete_columns = [
    'PassengerId', 'Name', 'Fare', 'Age', 'Cabin',
    'family_name', 'SibSp', 'Parch', 'Ticket', 'FamilySize',
]
df_all = df_all.drop(columns=obsolete_columns)

In [None]:
# Inspect the engineered feature frame.
df_all

Unnamed: 0,Survived,Pclass,Sex,Embarked,train_or_test,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins
0,0,3,male,S,train,small,Mr,n,1,"(-1, 15]","(15.0, 60.0]"
1,1,1,female,C,train,small,Mrs,C,2,"(60, 600]","(15.0, 60.0]"
2,1,3,female,S,train,alone,Miss,n,1,"(-1, 15]","(15.0, 60.0]"
3,1,1,female,S,train,small,Mrs,C,2,"(15, 60]","(15.0, 60.0]"
4,0,3,male,S,train,alone,Mr,n,1,"(-1, 15]","(15.0, 60.0]"
...,...,...,...,...,...,...,...,...,...,...,...
1304,9,3,male,S,test,alone,Mr,n,1,"(-1, 15]",
1305,9,1,female,C,test,alone,Dona,C,3,"(60, 600]","(15.0, 60.0]"
1306,9,3,male,S,test,alone,Mr,n,1,"(-1, 15]","(15.0, 60.0]"
1307,9,3,male,S,test,alone,Mr,n,1,"(-1, 15]",


In [None]:
# Remaining missing values after feature engineering.
df_all.isnull().sum()

Survived            0
Pclass              0
Sex                 0
Embarked            0
train_or_test       0
FamilySize_bin      0
honorific           0
Cabin_ini           0
TicketFreq          0
Fare_bin            0
Age_bins          263
dtype: int64

In [None]:
# Age_binの予測モデル作成
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Encode the categorical variables to prepare the data set for age prediction
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Columns to label-encode: every non-numeric column (object and category).
# select_dtypes is used instead of comparing each dtype with the string 'int',
# because that alias maps to a platform-dependent integer width and can
# misclassify int64 columns as non-integer.
cols = df_all.select_dtypes(exclude='number').columns.tolist()
# train_or_test intentionally stays in the list (removal is disabled)
# cols.remove('train_or_test')

In [None]:
# Columns that will be label-encoded.
cols

['Sex',
 'Embarked',
 'train_or_test',
 'FamilySize_bin',
 'honorific',
 'Cabin_ini',
 'Fare_bin',
 'Age_bins']

In [None]:
# Replace each selected column with integer codes. The single encoder `le` is
# re-fitted per column, so afterwards it only remembers the last column's
# classes.
# NOTE(review): missing Age_bins values are encoded as an ordinary class here;
# later cells rely on that class receiving code 3.
for col in cols:
  df_all[col] = le.fit_transform(df_all[col])

In [None]:
# Inspect the label-encoded frame.
df_all

Unnamed: 0,Survived,Pclass,Sex,Embarked,train_or_test,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins
0,0,3,1,2,1,3,9,7,1,0,1
1,1,1,0,0,1,3,10,2,2,2,1
2,1,3,0,2,1,0,7,7,1,0,1
3,1,1,0,2,1,3,10,2,2,1,1
4,0,3,1,2,1,0,9,7,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1304,9,3,1,2,0,0,9,7,1,0,3
1305,9,1,0,0,0,0,2,2,3,2,1
1306,9,3,1,2,0,0,9,7,1,0,1
1307,9,3,1,2,0,0,9,7,1,0,3


In [None]:
# Separate the rows again using the flag set earlier. After label encoding
# the flag is numeric: 1 marks train rows and 0 marks test rows.
is_train = df_all['train_or_test'] == 1
is_test = df_all['train_or_test'] == 0
age_train = df_all[is_train].drop(columns='train_or_test')
age_test = df_all[is_test].drop(columns='train_or_test')
age_train.describe()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,1.536476,1.107744,8.590348,5.943883,2.121212,0.622896,1.328844
std,0.486592,0.836071,0.47799,0.791503,1.399702,1.253972,2.061197,1.79666,0.713665,0.897342
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,2.0,0.0,1.0,0.0,7.0,7.0,1.0,0.0,1.0
50%,0.0,3.0,1.0,2.0,0.0,9.0,7.0,1.0,0.0,1.0
75%,1.0,3.0,1.0,2.0,3.0,9.0,7.0,3.0,1.0,1.0
max,1.0,3.0,1.0,2.0,3.0,13.0,7.0,11.0,2.0,3.0


In [None]:
# Missing values have been assigned the code 3
age_train['Age_bins'].value_counts()

Age_bins
1    609
3    177
0     83
2     22
Name: count, dtype: int64

In [None]:
# Separate rows whose Age_bins is missing from rows where it is known.
# The label encoder assigned code 3 to the missing value.
AGE_BIN_MISSING = 3
# .copy() yields independent frames, so filling Age_bins on the *_isnull
# frames later does not write into a slice of age_train / age_test.
age_train_isnull = age_train[age_train['Age_bins'] == AGE_BIN_MISSING].copy()
age_train_full = age_train[age_train['Age_bins'] != AGE_BIN_MISSING].copy()
age_test_isnull = age_test[age_test['Age_bins'] == AGE_BIN_MISSING].copy()
age_test_full = age_test[age_test['Age_bins'] != AGE_BIN_MISSING].copy()

# Data preparation
# Labelled data for the age-bin model. age_pred_x still contains the
# Age_bins column at this point; age_pred_t is the target on its own.
age_pred_x = age_train_full.drop(['Survived'], axis=1)
age_pred_t = age_train_full['Age_bins']
# Feature-only frames for the rows whose age bin has to be predicted
# (both drops take a list, keeping the two lines consistent).
age_train_isnull_d = age_train_isnull.drop(['Survived', 'Age_bins'], axis=1)
age_test_isnull_d = age_test_isnull.drop(['Survived', 'Age_bins'], axis=1)

In [None]:
# Preview the labelled age-prediction data (features plus the Age_bins target).
age_pred_x.head(10)

Unnamed: 0,Pclass,Sex,Embarked,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins
0,3,1,2,3,9,7,1,0,1
1,1,0,0,3,10,2,2,2,1
2,3,0,2,0,7,7,1,0,1
3,1,0,2,3,10,2,2,1,1
4,3,1,2,0,9,7,1,0,1
6,1,1,2,0,9,4,2,1,1
7,3,1,2,2,6,7,5,1,0
8,3,0,2,3,10,7,3,0,1
9,2,0,0,3,10,7,2,1,0
10,3,0,2,3,7,6,3,1,0


In [None]:
# Pin pandas to 2.0.3.
# NOTE(review): pandas was already imported by earlier cells, so the pinned
# version presumably only takes effect after a runtime restart — confirm.
!pip install pandas==2.0.3




In [None]:
# Install (unpinned) numpy, matplotlib, seaborn and altair.
!pip install numpy matplotlib seaborn altair



In [None]:
!pip install scikit-learn==1.0.2

Collecting scikit-learn==1.0.2
  Using cached scikit_learn-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.2
    Uninstalling scikit-learn-1.4.2:
      Successfully uninstalled scikit-learn-1.4.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.8.0 requires scikit-learn>=1.2.2, but you have scikit-learn 1.0.2 which is incompatible.
pycaret 3.3.2 requires scikit-learn>1.4.0, but you have scikit-learn 1.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.0.2


In [None]:
# Install pycaret (unpinned).
!pip install pycaret



In [None]:
#!apt-get update
#!apt-get install -y build-essential

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,392 kB]
Fetched 1,652 kB in 4s (436 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree...

In [None]:
# Import only the pycaret functions this notebook calls, rather than
# `import *`, so that pycaret cannot silently shadow other names.
from pycaret.classification import (
    compare_models,
    create_model,
    predict_model,
    setup,
    tune_model,
)

In [None]:
# Data preprocessing: pycaret experiment for predicting Age_bins
# (80% train / 20% hold-out, fixed session id). age_pred_x must still contain
# the Age_bins column when this cell runs.
exp = setup(
    data = age_pred_x, target='Age_bins', train_size=0.8, session_id=0
)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,Age_bins
2,Target type,Multiclass
3,Original data shape,"(714, 9)"
4,Transformed data shape,"(714, 9)"
5,Transformed train set shape,"(571, 9)"
6,Transformed test set shape,"(143, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Compare pycaret's candidate classifiers and keep the top-ranked one.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9334,0.8758,0.9334,0.9087,0.9195,0.7156,0.7267,0.077
et,Extra Trees Classifier,0.9265,0.8348,0.9265,0.9037,0.9141,0.6933,0.7011,0.282
rf,Random Forest Classifier,0.9247,0.8722,0.9247,0.9039,0.913,0.6841,0.6939,0.232
gbc,Gradient Boosting Classifier,0.9247,0.0,0.9247,0.9007,0.9114,0.6734,0.6839,0.459
xgboost,Extreme Gradient Boosting,0.923,0.8782,0.923,0.9025,0.9119,0.6766,0.6841,0.105
lightgbm,Light Gradient Boosting Machine,0.923,0.8876,0.923,0.905,0.9127,0.6797,0.688,1.534
dt,Decision Tree Classifier,0.9212,0.8366,0.9212,0.9014,0.9103,0.6786,0.6847,0.036
lr,Logistic Regression,0.9176,0.0,0.9176,0.8876,0.8994,0.6027,0.6245,0.868
lda,Linear Discriminant Analysis,0.9158,0.0,0.9158,0.89,0.9014,0.637,0.6503,0.063
ridge,Ridge Classifier,0.9054,0.0,0.9054,0.8692,0.8809,0.5012,0.5373,0.036


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Build the best model with 5-fold cross-validation.
clf_pycaret = create_model(best_model, fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9565,0.9187,0.9565,0.9239,0.9395,0.8174,0.8326
1,0.9211,0.9101,0.9211,0.903,0.9111,0.6778,0.6797
2,0.9298,0.8387,0.9298,0.9041,0.9165,0.6793,0.6888
3,0.9035,0.8106,0.9035,0.9022,0.8939,0.5756,0.5868
4,0.9386,0.8607,0.9386,0.9088,0.9226,0.7569,0.7605
Mean,0.9299,0.8678,0.9299,0.9084,0.9167,0.7014,0.7097
Std,0.0177,0.0414,0.0177,0.0081,0.0149,0.0817,0.0826


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Tune its hyperparameters with 5-fold cross-validation.
tuned_model = tune_model(clf_pycaret, fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9565,0.9286,0.9565,0.9239,0.9395,0.8174,0.8326
1,0.9474,0.9458,0.9474,0.922,0.9342,0.7595,0.7701
2,0.9386,0.8656,0.9386,0.9132,0.9247,0.7108,0.7266
3,0.886,0.8003,0.886,0.8489,0.8653,0.4665,0.4872
4,0.9474,0.8939,0.9474,0.914,0.9304,0.7743,0.7849
Mean,0.9352,0.8868,0.9352,0.9044,0.9188,0.7057,0.7203
Std,0.0252,0.0514,0.0252,0.0281,0.0272,0.1243,0.1214


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Score the tuned model (with no data argument, presumably on pycaret's hold-out split — confirm).
predict_model(tuned_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.9441,0,0.9441,0.9174,0.9299,0.7501,0.7641


Unnamed: 0,Pclass,Sex,Embarked,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins,prediction_label,prediction_score
316,2,0,2,3,10,7,2,1,1,1,1.0000
108,3,1,2,0,9,7,1,0,1,1,1.0000
665,2,1,2,3,9,7,7,2,1,1,0.6216
865,2,0,2,0,10,7,1,0,1,1,0.8919
835,1,0,0,3,7,4,3,2,1,1,0.8108
...,...,...,...,...,...,...,...,...,...,...,...
149,2,1,2,0,11,7,1,0,1,1,0.8919
577,1,0,2,3,10,4,2,1,1,1,1.0000
754,2,0,2,3,10,7,5,2,1,1,0.9730
225,3,1,2,0,9,7,1,0,1,1,1.0000


In [None]:
# Keep the tuned model's hyperparameters so that it can be rebuilt with
# scikit-learn directly. NOTE: the name `params` is reused for a different
# model later in the notebook.
params = tuned_model.get_params()
params

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 37,
 'p': 2,
 'weights': 'uniform'}

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# RFECV cannot perform feature selection with KNeighborsClassifier,
# so the runner-up, ExtraTreesClassifier, is used instead
clf_pycaret_2 = create_model('et', fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9304,0.8836,0.9304,0.9304,0.9304,0.7447,0.7447
1,0.9211,0.8424,0.9211,0.903,0.9111,0.6778,0.6797
2,0.9298,0.8245,0.9298,0.9041,0.9165,0.6793,0.6888
3,0.9123,0.7728,0.9123,0.8781,0.8939,0.6017,0.6202
4,0.9211,0.8877,0.9211,0.8978,0.9068,0.7026,0.7069
Mean,0.9229,0.8422,0.9229,0.9027,0.9117,0.6812,0.6881
Std,0.0067,0.0422,0.0067,0.0167,0.012,0.0465,0.0406


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Hyperparameter tuning (5-fold cross-validation)
tuned_model_2 = tune_model(clf_pycaret_2, fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9565,0.9275,0.9565,0.9239,0.9395,0.8174,0.8326
1,0.9561,0.9447,0.9561,0.932,0.9428,0.7934,0.8111
2,0.9298,0.8781,0.9298,0.9044,0.9148,0.6589,0.6815
3,0.8947,0.8349,0.8947,0.8611,0.8692,0.4584,0.513
4,0.9386,0.8737,0.9386,0.9061,0.922,0.7437,0.7504
Mean,0.9352,0.8918,0.9352,0.9055,0.9177,0.6944,0.7177
Std,0.0227,0.0396,0.0227,0.0245,0.0264,0.1298,0.1151


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Save the tuned ExtraTrees hyperparameters
params_2 = tuned_model_2.get_params()
params_2

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': {},
 'criterion': 'entropy',
 'max_depth': 11,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0002,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 20,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [None]:
from sklearn.feature_selection import RFECV
# Remove the target from the feature frame. errors='ignore' makes the cell
# safe to re-run: once Age_bins is gone, a second run is a no-op instead of
# raising KeyError.
age_pred_x = age_pred_x.drop(columns=['Age_bins'], errors='ignore')

In [None]:
# Hold out 40% of the labelled rows to evaluate the age-bin classifier.
# NOTE: x_test / t_test are overwritten later by the Survived split.
age_train_x_val, x_test, age_train_t_val, t_test = train_test_split(age_pred_x, age_pred_t, test_size=0.4, random_state=0)

In [None]:
# Check the training target's size and dtype.
age_train_t_val.info()

<class 'pandas.core.series.Series'>
Index: 428 entries, 165 to 856
Series name: Age_bins
Non-Null Count  Dtype
--------------  -----
428 non-null    int64
dtypes: int64(1)
memory usage: 6.7 KB


In [None]:
# Check the training features' columns and dtypes.
# (The former `age_train_x_val = age_train_x_val` self-assignment did nothing
# and has been removed.)
age_train_x_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 428 entries, 165 to 856
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Pclass          428 non-null    int64
 1   Sex             428 non-null    int64
 2   Embarked        428 non-null    int64
 3   FamilySize_bin  428 non-null    int64
 4   honorific       428 non-null    int64
 5   Cabin_ini       428 non-null    int64
 6   TicketFreq      428 non-null    int64
 7   Fare_bin        428 non-null    int64
dtypes: int64(8)
memory usage: 30.1 KB


In [None]:
# Feature selection with ExtraTreesClassifier, the second most accurate model:
# recursive feature elimination (one feature per step) scored by 5-fold CV accuracy.
estimator = ExtraTreesClassifier(**params_2)
rfecv = RFECV(estimator, step=1, cv=5, scoring='accuracy')
rfecv.fit(age_train_x_val, age_train_t_val)

In [None]:
# Names of the features RFECV decided to keep.
selected_features = age_train_x_val.columns[rfecv.support_]

# Restrict the training split and the hold-out split to those features.
x_agepred_train_selected = age_train_x_val.loc[:, selected_features]
x_agepred_test_selected = x_test.loc[:, selected_features]
print(selected_features)

Index(['Pclass', 'FamilySize_bin', 'honorific', 'TicketFreq', 'Fare_bin'], dtype='object')


In [None]:
# RFECV cannot be applied to KNeighborsClassifier, so the features chosen
# with ExtraTrees are reused here.
# Train the KNeighborsClassifier with the tuned hyperparameters
knn_model = KNeighborsClassifier(**params)
knn_model.fit(x_agepred_train_selected, age_train_t_val)

In [None]:
# Predict on the hold-out split and measure accuracy
y_pred = knn_model.predict(x_agepred_test_selected)
accuracy = accuracy_score(t_test, y_pred)
accuracy

0.9230769230769231

In [None]:
# Impute the missing age bins in train and test with the KNN predictions.
# predict() already returns one encoded Age_bins label per row.
df_age_pred_train = knn_model.predict(age_train_isnull[selected_features])
df_age_pred_test = knn_model.predict(age_test_isnull[selected_features])

# Write the per-row predictions back. np.argmax over the prediction vector
# would collapse it to a single position index and stamp that one number on
# every row, which is not an age bin at all. assign() returns new frames, so
# no slice of age_train / age_test is modified in place.
age_train_isnull = age_train_isnull.assign(Age_bins=df_age_pred_train)
age_test_isnull = age_test_isnull.assign(Age_bins=df_age_pred_test)

# Recombine the known and the imputed rows, restoring the original row order.
df_train_comp = pd.concat([age_train_full, age_train_isnull], axis=0).sort_index()
df_test_comp = pd.concat([age_test_full, age_test_isnull], axis=0).sort_index()

df_train_comp.isnull().sum()

Survived          0
Pclass            0
Sex               0
Embarked          0
FamilySize_bin    0
honorific         0
Cabin_ini         0
TicketFreq        0
Fare_bin          0
Age_bins          0
dtype: int64

In [None]:
# Oversampling of the Survived classes with SMOTE
# NOTE(review): SMOTE runs on the whole training frame before any
# train/validation split (pycaret's setup and train_test_split are both given
# df_train_oversampled), so synthetic rows can end up in the validation data
# and inflate the reported scores. Consider splitting first.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0, k_neighbors=10)

x = df_train_comp.drop('Survived', axis=1)
t = df_train_comp['Survived']
x_oversampled, t_oversampled = smote.fit_resample(x, t)

# Rebuild a single frame from the resampled features and target
df_train_oversampled = pd.DataFrame(x_oversampled, columns=x.columns)
df_train_oversampled['Survived'] = t_oversampled
df_train_oversampled

Unnamed: 0,Pclass,Sex,Embarked,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins,Survived
0,3,1,2,3,9,7,1,0,1,0
1,1,0,0,3,10,2,2,2,1,1
2,3,0,2,0,7,7,1,0,1,1
3,1,0,2,3,10,2,2,1,1,1
4,3,1,2,0,9,7,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1093,2,0,2,3,10,6,3,1,1,1
1094,3,0,2,0,9,7,7,1,1,1
1095,3,0,2,3,9,7,1,0,1,1
1096,1,0,2,3,10,3,2,1,0,1


In [None]:
# Data preprocessing: pycaret experiment for predicting Survived
# (80% train / 20% hold-out, fixed session id). This replaces the earlier
# Age_bins experiment stored in `exp`.
exp = setup(
    data = df_train_oversampled, target='Survived', train_size=0.8, session_id=0
)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(1098, 10)"
4,Transformed data shape,"(1098, 10)"
5,Transformed train set shape,"(878, 10)"
6,Transformed test set shape,"(220, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Compare pycaret's candidate classifiers for Survived and keep the top-ranked one.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8314,0.877,0.8063,0.8493,0.8267,0.6628,0.6643,0.2
rf,Random Forest Classifier,0.8303,0.8889,0.8109,0.8442,0.8266,0.6605,0.6621,0.239
xgboost,Extreme Gradient Boosting,0.8302,0.8817,0.8017,0.8506,0.8248,0.6604,0.6622,0.093
dt,Decision Tree Classifier,0.8212,0.8478,0.7973,0.8381,0.8167,0.6424,0.6438,0.055
lightgbm,Light Gradient Boosting Machine,0.8211,0.8883,0.7925,0.8405,0.8154,0.6421,0.6437,0.734
gbc,Gradient Boosting Classifier,0.8189,0.8935,0.8016,0.8307,0.815,0.6377,0.6394,0.155
ada,Ada Boost Classifier,0.812,0.8759,0.8132,0.8118,0.8116,0.624,0.6253,0.134
qda,Quadratic Discriminant Analysis,0.8098,0.8503,0.8246,0.8026,0.8124,0.6197,0.6218,0.035
knn,K Neighbors Classifier,0.7904,0.8492,0.7928,0.7909,0.7904,0.5808,0.5828,0.088
lr,Logistic Regression,0.7813,0.8537,0.779,0.7829,0.78,0.5626,0.564,0.043


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Build the model (10-fold cross-validation)
clf_pycaret = create_model(best_model, fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8636,0.8936,0.8182,0.9,0.8571,0.7273,0.7303
1,0.8636,0.9013,0.8409,0.881,0.8605,0.7273,0.728
2,0.7727,0.8432,0.7045,0.8158,0.7561,0.5455,0.5506
3,0.8068,0.8161,0.7955,0.814,0.8046,0.6136,0.6138
4,0.8068,0.8497,0.7727,0.8293,0.8,0.6136,0.6151
5,0.8636,0.9223,0.8636,0.8636,0.8636,0.7273,0.7273
6,0.8523,0.9032,0.7955,0.8974,0.8434,0.7045,0.7091
7,0.8523,0.9132,0.8409,0.8605,0.8506,0.7045,0.7047
8,0.8276,0.885,0.8409,0.8222,0.8315,0.655,0.6552
9,0.8046,0.8422,0.7907,0.8095,0.8,0.609,0.6092


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Hyperparameter tuning (10-fold cross-validation)
tuned_model = tune_model(clf_pycaret, fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8295,0.9197,0.75,0.8919,0.8148,0.6591,0.6676
1,0.8068,0.8802,0.75,0.8462,0.7952,0.6136,0.6176
2,0.7727,0.7929,0.7273,0.8,0.7619,0.5455,0.5477
3,0.8295,0.8435,0.7727,0.8718,0.8193,0.6591,0.6634
4,0.7955,0.8306,0.7273,0.8421,0.7805,0.5909,0.5965
5,0.8523,0.9243,0.8182,0.878,0.8471,0.7045,0.7062
6,0.8295,0.8866,0.7727,0.8718,0.8193,0.6591,0.6634
7,0.8636,0.9166,0.8636,0.8636,0.8636,0.7273,0.7273
8,0.8276,0.8866,0.7955,0.8537,0.8235,0.6554,0.657
9,0.8276,0.8977,0.7442,0.8889,0.8101,0.6545,0.6632


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
# Apply the model to the hold-out data from the 8:2 split
predict_model(tuned_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8045,0.8402,0.7727,0.8252,0.7981,0.6091,0.6103


Unnamed: 0,Pclass,Sex,Embarked,FamilySize_bin,honorific,Cabin_ini,TicketFreq,Fare_bin,Age_bins,Survived,prediction_label,prediction_score
336,1,1,2,3,9,2,2,2,1,0,0,1.0000
340,2,1,2,3,6,5,3,1,0,1,1,1.0000
525,3,1,1,0,9,7,1,0,1,0,0,1.0000
867,1,1,2,0,9,0,1,1,1,0,0,0.7367
589,3,1,2,0,9,7,1,0,0,0,0,0.9032
...,...,...,...,...,...,...,...,...,...,...,...,...
372,3,1,2,0,9,7,1,0,1,0,0,0.8226
284,1,1,2,0,9,0,1,1,0,0,0,1.0000
1057,1,0,2,0,7,1,3,2,1,1,1,1.0000
256,1,0,0,0,10,7,2,2,0,1,1,0.9800


In [None]:
# Hyperparameters of the Survived model. This overwrites the KNN parameters
# that were stored under the same name earlier.
params = tuned_model.get_params()
params

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Prepare features x and target t, then hold out 30% for evaluation.
# NOTE: this overwrites x_test / t_test from the age-bin split.
x = df_train_oversampled.drop('Survived', axis = 1)
t = df_train_oversampled['Survived']
x_train_val, x_test, t_train_val, t_test = train_test_split(x, t, test_size = 0.3, random_state = 0)

In [None]:
# Feature selection with RFECV (one feature eliminated per step, 5-fold CV
# accuracy). Despite its name, `rfc` holds an ExtraTreesClassifier built from
# the tuned parameters.
rfc = ExtraTreesClassifier(**params)
rfecv = RFECV(estimator=rfc, step=1, cv=5, scoring='accuracy').fit(x_train_val, t_train_val)

# Keep only the columns RFECV selected, for both splits.
selected_features = x_train_val.columns[rfecv.support_]
x_train_selected = x_train_val.loc[:, selected_features]
x_test_selected = x_test.loc[:, selected_features]

# Fit the ExtraTrees model on the selected features and predict the hold-out rows.
y_pred = rfc.fit(x_train_selected, t_train_val).predict(x_test_selected)

In [None]:
# Evaluate the predictions (accuracy on the 30% hold-out split)
accuracy = accuracy_score(t_test, y_pred)
accuracy

0.8151515151515152

In [None]:
# Show the selected feature columns
selected_features

Index(['Pclass', 'Sex', 'honorific', 'TicketFreq'], dtype='object')

In [None]:
# Number of hold-out predictions.
length = len(y_pred)
print(length)

330


In [None]:
# Predict Survived for the test set (input for the Kaggle submission file)
result = rfc.predict(df_test_comp[selected_features])
result

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
# Submission frame: PassengerId from the raw test file plus the predictions,
# written without the index column.
submit = pd.read_csv('/content/test.csv', usecols=['PassengerId']).assign(Survived=result)
submit.to_csv('submission.csv', index=False)

In [None]:
# Inspect the submission frame.
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
from google.colab import files
files.upload()  # upload the kaggle.json file
# Copy the uploaded credentials into ~/.kaggle and restrict them to the owner.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

KeyboardInterrupt: 

In [None]:
# Submit submission.csv to the 'titanic' competition. This needs kaggle.json
# to be present in ~/.kaggle (the recorded run failed because it was missing).
!kaggle competitions submit -c titanic -f submission.csv -m "Message"

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 403, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [None]:
# NOTE(review): the recorded run shows that /content/titanic.py does not
# exist, so this cell fails as written.
!python3 titanic.py

python3: can't open file '/content/titanic.py': [Errno 2] No such file or directory
