In [1]:
import numpy as np
import pandas as pd
import pylab as P
import matplotlib.pyplot as plt
import csv as csv

In [2]:
# Pandas DataFrame
df_train = pd.read_csv('../Data Files/train.csv')
df_test = pd.read_csv('../Data Files/test.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Check dataframe info: some missed values in "Age", and lots of missed values in "Cabin"
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB


In [4]:
# Turn "Sex" column categories to numbers
from sklearn.preprocessing import LabelEncoder

Sex_le = LabelEncoder()
df_train['Sex'] = Sex_le.fit_transform(df_train['Sex'])
df_test['Sex'] = Sex_le.fit_transform(df_test['Sex'])
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [5]:
# Turn "Embarked" column categories to dummy features (one-hot encoding)

df_train = pd.concat([df_train, pd.get_dummies(df_train[['Embarked']])], axis=1)
df_test = pd.concat([df_test, pd.get_dummies(df_test[['Embarked']])], axis=1)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,1.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,0.0,0.0,1.0


In [6]:
# Fill missing value of Age in both train and test dataset
median_ages = np.zeros((2, 3))

for i in range(0, 2):
    for j in range(0, 3):
        median_ages[i, j] = df_train[(df_train['Sex'] == i) &
                                     (df_train['Pclass'] == j + 1)]['Age'].dropna().median()

df_train['AgeFill'] = df_train['Age']
df_test['AgeFill'] = df_test['Age']

for i in range(0, 2):
    for j in range(0, 3):
        df_train.loc[(df_train.Age.isnull()) & (df_train.Sex == i) & (
            df_train.Pclass == j + 1), 'AgeFill'] = median_ages[i, j]
        df_test.loc[(df_test.Age.isnull()) & (df_test.Sex == i) & (
            df_test.Pclass == j + 1), 'AgeFill'] = median_ages[i, j]

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int32
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Embarked_C     891 non-null float64
Embarked_Q     891 non-null float64
Embarked_S     891 non-null float64
AgeFill        891 non-null float64
dtypes: float64(6), int32(1), int64(5), object(4)
memory usage: 94.0+ KB


In [7]:
# Fill missing value of Fare in test dataset
df_test.Fare[df_test.Fare.isnull()] = df_train.Fare.median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [8]:
# Create "Age*Class" column
df_train['Age*Class'] = df_train.AgeFill * df_train.Pclass
df_test['Age*Class'] = df_test.AgeFill * df_test.Pclass

In [9]:
# Collect the test data's PassengerIds before dropping it
test_ids = df_test['PassengerId'].values

In [10]:
# Drop object data
df_train = df_train.drop(
    ['Name', 'Ticket', 'Cabin', 'Embarked', 'Age', 'PassengerId'], axis=1)
df_test = df_test.drop(['Name', 'Ticket', 'Cabin',
                        'Embarked', 'Age', 'PassengerId'], axis=1)
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,AgeFill,Age*Class
0,0,3,1,1,0,7.25,0.0,0.0,1.0,22.0,66.0
1,1,1,0,1,0,71.2833,1.0,0.0,0.0,38.0,38.0
2,1,3,0,0,0,7.925,0.0,0.0,1.0,26.0,78.0
3,1,1,0,1,0,53.1,0.0,0.0,1.0,35.0,35.0
4,0,3,1,0,0,8.05,0.0,0.0,1.0,35.0,105.0


In [11]:
# From given train dataset split train and test
from sklearn.cross_validation import train_test_split

X = df_train.iloc[:, 1:].values
y = df_train.iloc[:, 0].values
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.30, random_state=1)

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(random_state=1))])

pipe_lr.fit(X_train, y_train)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
y_pred = pipe_lr.predict(X_test)

Test Accuracy: 0.772


In [13]:
# Use the whole train data to predict

test_data = df_test.values
pipe_lr.fit(X, y)
test_pred = pipe_lr.predict(test_data)

In [14]:
predictions_file = open("../submissions/LRModel-20160607.csv", 'w', newline='')
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId", "Survived"])
open_file_object.writerows(zip(test_ids, test_pred))
predictions_file.close()