In [1]:
import numpy as np
import pandas as pd

train = pd.read_csv('train.csv', dtype={'Age': np.float64}, )
test = pd.read_csv('test.csv', dtype={'Age': np.float64}, )

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# 相関を調べる
train_corr = train.corr()
train_corr

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [3]:
# データ前処理
# def correct_data(titanic_data):
#     titanic_data.Age = titanic_data.Age.fillna(titanic_data.Age.median())
#     titanic_data.Sex = titanic_data.Sex.replace(['male', 'female'], [0, 1])
#     titanic_data.Embarked = titanic_data.Embarked.fillna('S')
#     titanic_data.Embarked = titanic_data.Embarked.replace(['C', 'S', 'Q'], [0, 1, 2])
#     titanic_data.Fare = titanic_data.Fare.fillna(titanic_data.Fare.median())
#     return titanic_data

# train_data = correct_data(train)
# test_data = correct_data(test)

# train_corr = train.corr()
# train_corr

def correct_data(train_data, test_data):
    train_data.Age = train_data.Age.fillna(test_data.Age.median())
    train_data.Fare = train_data.Fare.fillna(test_data.Fare.median())

    test_data.Age = test_data.Age.fillna(test_data.Age.median())
    test_data.Fare = test_data.Fare.fillna(test_data.Fare.median())
    
    train_data = correct_data_common(train_data)
    test_data = correct_data_common(test_data)
    
    return train_data, test_data
    
def correct_data_common(titanic_data):
    titanic_data.Sex = titanic_data.Sex.replace(['male', 'female'], [0, 1])
    titanic_data.Embarked = titanic_data.Embarked.fillna('S')
    titanic_data.Embarked = titanic_data.Embarked.replace(['C', 'S', 'Q'], [0, 1, 2])
    
    return titanic_data

train_data, test_data = correct_data(train, test)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.034212,-0.057527,-0.001652,0.012658,-0.017443
Survived,-0.005007,1.0,-0.338481,0.543351,-0.06491,-0.035322,0.081629,0.257307,-0.125953
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.339898,0.083081,0.018443,-0.5495,0.305762
Sex,-0.042939,0.543351,-0.1319,1.0,-0.081163,0.114631,0.245489,0.182333,-0.022521
Age,0.034212,-0.06491,-0.339898,-0.081163,1.0,-0.233296,-0.172482,0.096688,-0.040166
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.233296,1.0,0.414838,0.159651,0.030874
Parch,-0.001652,0.081629,0.018443,0.245489,-0.172482,0.414838,1.0,0.216225,-0.035957
Fare,0.012658,0.257307,-0.5495,0.182333,0.096688,0.159651,0.216225,1.0,-0.268865
Embarked,-0.017443,-0.125953,0.305762,-0.022521,-0.040166,0.030874,-0.035957,-0.268865,1.0


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

models = []

models.append(("LogisticRegression",LogisticRegression()))
models.append(("SVC",SVC()))
models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', random_state=0)))

In [6]:
# 交差検証
results = []
names = []
for name, model in models:
    result = cross_val_score(model, train_data[predictors], train_data['Survived'], cv=3)
    names.append(name)
    results.append(result)

In [8]:
for i in range(len(names)):
    print(names[i], results[i].mean())

LogisticRegression 0.785634118967
SVC 0.687991021324
LinearSVC 0.700336700337
KNeighbors 0.701459034792
DecisionTree 0.765432098765
RandomForest 0.787878787879
MLPClassifier 0.773288439955


In [10]:
alg = RandomForestClassifier()
alg.fit(train_data[predictors], train_data['Survived'])

predictions = alg.predict(test_data[predictors])

submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})

submission.to_csv('submission.csv', index=False)