In [21]:
import pandas as pd
import numpy as  np
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

titanic_train = pd.read_csv('titanic_train.csv')
titanic_test = pd.read_csv('titanic_test.csv')


In [18]:
# データーの前処理

# label incodingでmaleを1にfemaleを0に取り換えて新しい項目Sex_encodedを作成する
from sklearn import preprocessing
input_labels = ['male', 'female']
encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)
encoded_values = encoder.transform(titanic_train['Sex'])
titanic_train['Sex_encoded'] = encoded_values

#EmbarkedのC, Q, Sをそれぞれ1,2,3にマッピングする
Embarked_mapping = {'C':1, 'Q':2, 'S':3}
titanic_train['Embarked'] = titanic_train['Embarked'].map(Embarked_mapping)

#Embarkedで欠損値を持っているのは２つのみなので、その２つを削除する。
titanic_train = titanic_train.dropna(subset=['Embarked'])

#Ageの欠損値を処理,回帰補完を行う
titanic_train = titanic_train.interpolate(method='linear')

#前処理したデータの内、データーが足りないCabinと乗客の生死を相関関係がほぼないName
#とTicketを除きデーターセットを準備する
X = titanic_train[['PassengerId','Pclass','Sex_encoded','Age', 
                   'SibSp','Parch','Fare','Embarked']]
y = titanic_train[['Survived']]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [19]:
#前処理が終わったデーターセットで線形回帰を行う
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

def adjusted(score, n_sample, n_features):
    adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
    return adjusted_score

print('adjusted R^2')
print('train : %3f' % adjusted(lr.score(X_train, y_train), len(y_train),2))
print('test : %3f' % adjusted(lr.score(X_test, y_test), len(y_test),2))

adjusted R^2
train : 0.416215
test : 0.328656


In [39]:
#Ridge回帰を行う
from sklearn.linear_model import Ridge
model_ridge = Ridge(alpha=10) 
model_ridge.fit(X_train, y_train)

def adjusted(score, n_sample, n_features):
    adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
    return adjusted_score

print('adjusted R^2')
print('train : %3f' % adjusted(model_ridge.score(X_train, y_train), len(y_train),2))
print('test : %3f' % adjusted(model_ridge.score(X_test, y_test), len(y_test),2))

adjusted R^2
train : 0.414858
test : 0.331733


In [36]:
#Elastic Netモデルを使う
from sklearn.linear_model import ElasticNet
model_en= ElasticNet(alpha=0.001, l1_ratio=0.9) 
model_en.fit(X_train, y_train)

def adjusted(score, n_sample, n_features):
    adjusted_score = 1 - (1 - score) * ((n_sample - 1) / (n_sample - n_features - 1))
    return adjusted_score

print('adjusted R^2')
print('train : %3f' % adjusted(model_en.score(X_train, y_train), len(y_train),2))
print('test : %3f' % adjusted(model_en.score(X_test, y_test), len(y_test),2))

adjusted R^2
train : 0.416177
test : 0.329647
