In [1]:
# 各種インポート
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
# Load the training and test sets from CSV files in the working directory.
# NOTE(review): the columns look like the Kaggle Titanic data set — confirm the source.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Preview the first rows of train.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Column dtypes and non-null counts for train.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Preview the first rows of test.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Column dtypes and non-null counts for test.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
# Drop the columns that will not be used as features.
# PassengerId is kept in test (presumably for the submission file — confirm).
train = train.drop(columns=['PassengerId', 'Name', 'Cabin'])
test = test.drop(columns=['Name', 'Cabin'])

In [8]:
# Confirm that the columns were dropped from train.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S


In [9]:
# Confirm that Name and Cabin were dropped from test.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,Q
1,893,3,female,47.0,1,0,363272,7.0,S
2,894,2,male,62.0,0,0,240276,9.6875,Q
3,895,3,male,27.0,0,0,315154,8.6625,S
4,896,3,female,22.0,1,1,3101298,12.2875,S


In [10]:
# Count the missing values per column in train.
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      2
dtype: int64

In [11]:
# Count the missing values per column in test.
test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64

In [12]:
# Inspect how the Embarked values are distributed in train.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [13]:
# Fill the missing Embarked values with 'S', the most frequent port in train.
# Rebinding avoids inplace=True and the throwaway `_`, which would shadow
# IPython's "last output" variable.
train = train.fillna({'Embarked': 'S'})

In [14]:
# Fill the missing Fare in test with the mean fare of the test set.
# Rebinding avoids inplace=True and the throwaway `_`, which would shadow
# IPython's "last output" variable.
test_fare_mean = test['Fare'].mean()
test = test.fillna({'Fare': test_fare_mean})

In [15]:
# Embarked, Sex and Ticket are strings, so replace them with integer codes.
# The encoder is fitted on the union of the train and test values for each
# column: fitting it separately on each frame would number the categories
# independently, so the same string could get different codes in train and test.
LE = LabelEncoder()
labels = ['Embarked', 'Sex', 'Ticket']
for label in labels:
    LE.fit(pd.concat([train[label], test[label]]))
    train[label] = LE.transform(train[label])
    test[label] = LE.transform(test[label])

In [16]:
# Re-inspect train after label encoding.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,1,22.0,1,0,A/5 21171,7.25,2
1,1,1,0,38.0,1,0,PC 17599,71.2833,0
2,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,2
3,1,1,0,35.0,1,0,113803,53.1,2
4,0,3,1,35.0,0,0,373450,8.05,2


In [17]:
# Re-count the missing values in train; only Age should still have gaps.
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      0
dtype: int64

In [18]:
# Re-inspect test after label encoding.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,1,34.5,0,0,330911,7.8292,1
1,893,3,0,47.0,1,0,363272,7.0,2
2,894,2,1,62.0,0,0,240276,9.6875,1
3,895,3,1,27.0,0,0,315154,8.6625,2
4,896,3,0,22.0,1,1,3101298,12.2875,2


In [19]:
# Re-count the missing values in test; only Age should still have gaps.
test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
dtype: int64

In [20]:
# Fill in missing Age values with a model-based estimate.
def fill_missing_age(df, n_estimators=2000, random_state=42):
    """Impute missing ``Age`` values in place with a random-forest regressor.

    Rows with a known ``Age`` are used to fit a ``RandomForestRegressor`` on
    the Embarked, Fare, Parch, SibSp and Pclass columns; the fitted model then
    predicts ``Age`` for the rows where it is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing Age, Embarked, Fare, Parch, SibSp and Pclass. The
        feature columns are handed to the regressor as-is, so they are
        expected to be numeric and free of missing values.
    n_estimators : int, default 2000
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed for the forest, so that re-running yields the same imputed ages.
        Pass ``None`` for an unseeded model.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the missing ages filled in.

    Raises
    ------
    ValueError
        If ``Age`` is missing in every row, so there is nothing to learn from.
    """
    feature_cols = ['Embarked', 'Fare', 'Parch', 'SibSp', 'Pclass']
    # Compute the mask once; it selects the rows to predict and, negated,
    # the rows to train on.
    missing = df['Age'].isnull()

    # Nothing to impute (also covers an empty frame): predicting on zero rows
    # would raise inside scikit-learn.
    if not missing.any():
        return df
    if missing.all():
        raise ValueError("cannot impute Age: every Age value is missing")

    known = ~missing
    # Fit on the rows with a known age.
    rtr = RandomForestRegressor(
        n_estimators=n_estimators, n_jobs=-1, random_state=random_state
    )
    rtr.fit(df.loc[known, feature_cols].values, df.loc[known, 'Age'].values)

    # Write the predicted ages back into the original frame.
    df.loc[missing, 'Age'] = rtr.predict(df.loc[missing, feature_cols].values)

    return df

In [21]:
# Impute Age in both data sets; each frame gets a model fitted on its own rows.
train, test = fill_missing_age(train), fill_missing_age(test)

In [22]:
# Age and Fare span a much wider range than the other features, so they would
# otherwise carry a disproportionately large weight in the results.
# Standardise them with StandardScaler so they are weighted like the others.
# The scaler is fitted on train only and reused for test, so both frames are
# transformed with the same mean and standard deviation.
std_scale = StandardScaler().fit(train[['Age', 'Fare']])
train[['Age', 'Fare']] = std_scale.transform(train[['Age', 'Fare']])
test[['Age', 'Fare']] = std_scale.transform(test[['Age', 'Fare']])

In [23]:
# Separate the target from the features and build the matching test matrix.
y_train = train['Survived']
X_train = train.drop(columns='Survived')
X_test = test.drop(columns='PassengerId').copy()
X_train.shape, y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

In [24]:
# Candidate classifiers to compare, as (name, estimator) pairs with default settings.
models = [
    ("KNC", KNeighborsClassifier()),
    ("DTC", DecisionTreeClassifier()),
    ("SVM", SVC()),
    ("AdaBoost", AdaBoostClassifier()),
    ("GradientBoosting", GradientBoostingClassifier()),
]

In [25]:
# Score every candidate classifier with 10-fold cross-validated accuracy.
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set while shuffle is False.
# One seeded splitter is shared so every model is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
results = []
names = []
for name, model in models:
    result = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(result)

ValueError: could not convert string to float: 'W./C. 6607'

In [27]:
# Check the column dtypes after preprocessing.
# NOTE(review): the saved output shows Ticket still as object, which matches the
# "could not convert string to float" error raised by cross-validation —
# presumably the label-encoding cell was not re-run after 'Ticket' was added to
# it; verify with Restart Kernel -> Run All.
train.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Embarked      int64
dtype: object