In [229]:
import pandas as pd
import numpy as  np
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

titanic_train = pd.read_csv('titanic_train.csv')
titanic_test = pd.read_csv('titanic_test.csv')

In [211]:
print(titanic_train[:10])

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [212]:
print(titanic_test[:10])

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   
5          897       3                    Svensson, Mr. Johan Cervin    male   
6          898       3                          Connolly, Miss. Kate  female   
7          899       2                  Caldwell, Mr. Albert Francis    male   
8          900       3     Abrahim, Mrs. Joseph (Sophie Halaut Easu)  female   
9          901       3                       Davies, Mr. John Samuel    male   

    Age  SibSp  Parch     Ticket     Fare Cabin Embarked  
0  34.5      0      0     330911   7.8292   NaN        Q  
1

In [197]:
# titanic_testのデータには乗客の生死のデータが含まれていないので、titanic_trainのデータだけを用いモデルを
# 作り、titanic_testのデータをそのモデルに入りその乗客の生死を推測して見る

In [213]:
titanic_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [214]:
#回帰が出来ない項目を除きとりあえずの線形回帰を行なった。

X = titanic_train[['PassengerId', 
                   'Pclass', 
                   #'Name', 
                   #'Sex', 
                   #'Age', 
                   'SibSp', 
                   'Parch', 
                   #'Ticket', 
                   'Fare', 
                   #'Cabin', 
                   #'Embarked'
                  ]]
y = titanic_train[['Survived']]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

def adjusted(score, n_sample, n_features):
    adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
    return adjusted_score

print('adjusted R^2')
print('train : %3f' % adjusted(lr.score(X_train, y_train), len(y_train),2))
print('test : %3f' % adjusted(lr.score(X_test, y_test), len(y_test),2))

adjusted R^2
train : 0.121226
test : 0.133189


In [215]:
# label incoding
# maleを1にfemaleを0に取り換えて新しい項目Sex_encodedを作成する
from sklearn import preprocessing
input_labels = ['male', 'female']

encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)

encoded_values = encoder.transform(titanic_train['Sex'])

titanic_train['Sex_encoded'] = encoded_values
print(titanic_train[:10][['Sex' ,'Sex_encoded']])

      Sex  Sex_encoded
0    male            1
1  female            0
2  female            0
3  female            0
4    male            1
5    male            1
6    male            1
7    male            1
8  female            0
9  female            0


In [221]:
# Sexの代わりにSex_encodedを用いモデルを作成し線形回帰を行うと次のようになる

X = titanic_train[['PassengerId', 
                   'Pclass', 
                   #'Name', 
                   'Sex_encoded', 
                   #'Age', 
                   'SibSp', 
                   'Parch', 
                   #'Ticket', 
                   'Fare', 
                   #'Cabin', 
                   #'Embarked'
                  ]]
y = titanic_train[['Survived']]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

def adjusted(score, n_sample, n_features):
    adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
    return adjusted_score

print('adjusted R^2')
print('train : %3f' % adjusted(lr.score(X_train, y_train), len(y_train),2))
print('test : %3f' % adjusted(lr.score(X_test, y_test), len(y_test),2))

adjusted R^2
train : 0.370425
test : 0.371552


In [224]:
#欠損値を処理
titanic_train.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [230]:
#Embarkedで欠損値を持っているのは２つのみなので、その２つを削除する。
Embarked_mapping = {'C':1, 'Q':2, 'S':3}
titanic_train['Embarked'] = titanic_train['Embarked'].map(Embarked_mapping)
titanic_train['Embarked']

0      3.0
1      1.0
2      3.0
3      3.0
4      3.0
5      2.0
6      3.0
7      3.0
8      3.0
9      1.0
10     3.0
11     3.0
12     3.0
13     3.0
14     3.0
15     3.0
16     2.0
17     3.0
18     3.0
19     1.0
20     3.0
21     3.0
22     2.0
23     3.0
24     3.0
25     3.0
26     1.0
27     3.0
28     2.0
29     3.0
      ... 
861    3.0
862    3.0
863    3.0
864    3.0
865    3.0
866    1.0
867    3.0
868    3.0
869    3.0
870    3.0
871    3.0
872    3.0
873    3.0
874    1.0
875    1.0
876    3.0
877    3.0
878    3.0
879    1.0
880    3.0
881    3.0
882    3.0
883    3.0
884    3.0
885    2.0
886    3.0
887    3.0
888    3.0
889    1.0
890    2.0
Name: Embarked, Length: 891, dtype: float64