In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import numpy as np
import pandas as pd
import io
import requests
import math
from scipy import stats

In [2]:
def feature_normalize(dataset):
    """
    Standardize ``dataset`` along axis 0: subtract the mean and divide by the
    standard deviation.

    @param dataset array-like of numbers (NumPy array, pandas Series/DataFrame)
    @return ``(dataset - mean) / std`` with the same shape as ``dataset``;
            ``np.std`` is called with its defaults, i.e. the population
            standard deviation (ddof=0)

    NOTE: a column with zero standard deviation is divided by zero and
    therefore yields NaN/inf values; callers must handle constant inputs.
    """
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)

    # The former debug ``print(df.head(3))`` read a global ``df`` that this
    # function never receives, raising NameError whenever no such global
    # exists. The function now depends only on its argument.
    return (dataset - mu) / sigma

In [3]:
def str_to_int(df):
    """
    Replace every string (object-dtype) column with integer category codes.

    @param df pandas DataFrame; it is modified in place
    @return the same DataFrame, with each object/category column replaced by
            the integer codes of its categories
    """
    object_cols = df.select_dtypes(include=['object']).columns
    print(object_cols)
    # First turn the string columns into categoricals ...
    for name in object_cols:
        as_category = df[name].astype('category')
        df[name] = as_category

    # ... then swap every categorical column for its integer codes.
    categorical_cols = df.select_dtypes(include=['category']).columns
    encoded = df[categorical_cols].apply(lambda series: series.cat.codes)
    df[categorical_cols] = encoded
    return df

In [4]:
def count_space_except_nan(x):
    """
    Count the space-separated items in a string.

    @param x any value; only ``str`` instances are counted
    @return number of spaces in ``x`` plus one when ``x`` is a string,
            otherwise 0 (e.g. for NaN or None)
    """
    # Guard clause: non-strings (such as float NaN) contribute nothing.
    if not isinstance(x, str):
        return 0
    return 1 + x.count(" ")

In [5]:
def one_hot(df, cols):
    """
    Replace each listed column with its one-hot (dummy) encoding.

    @param df pandas DataFrame; it is NOT modified
    @param cols a list of columns to encode
    @return a new DataFrame in which every column in ``cols`` has been removed
            and ``<col>_<value>`` indicator columns (uint8, 0/1) have been
            appended on the right

    The first five rows of the result are printed as a side effect.
    """
    for each in cols:
        # Pin the dtype: pandas >= 2.0 returns bool dummies by default, which
        # would turn ``df.values`` into an object array further downstream.
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False,
                                 dtype=np.uint8)
        # drop() returns a new frame. The previous ``del df[each]`` removed
        # the first encoded column from the caller's DataFrame as well.
        df = pd.concat([df.drop(columns=each), dummies], axis=1)

    print(df.head())
    return df

In [13]:
# Load the Titanic training set; the path is relative to the notebook's
# working directory.
df_train = pd.read_csv('./titanic/train.csv')

In [14]:
# Preview the first two training rows to check which columns were loaded.
df_train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [15]:
# Load the Titanic test set; the path is relative to the notebook's working
# directory.
df_test = pd.read_csv('./titanic/test.csv')

In [16]:
# Preview the first two test rows; unlike the training data there is no
# Survived column.
df_test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [17]:
# Count missing values per training column (Age, Cabin and Embarked are the
# only columns with NaNs in the output below).
print (df_train.isnull().sum())
# Columns that pre_processing() drops before modelling. This is a module-level
# name that pre_processing() reads, so this cell must run first.
delete_columns = ["Ticket", "Name", "PassengerId", "Cabin", "Embarked"]

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [18]:
def pre_processing(df):
    """
    Prepare a raw Titanic DataFrame for modelling.

    Steps, in order: drop the columns named in the module-level
    ``delete_columns``, fill missing ``Age`` values with the column mean,
    one-hot encode ``Pclass``, convert the remaining string columns to integer
    category codes, and standardize ``Age``.

    @param df pandas DataFrame containing every column in ``delete_columns``
              plus ``Age`` and ``Pclass``
    @return a new, processed DataFrame; the frame passed in is left untouched
    """
    # drop() without inplace returns a new frame, so the caller's raw data is
    # not modified.
    df = df.drop(columns=delete_columns)
    # Replace NaN ages with the mean age. The result is assigned back because
    # the chained ``df["Age"].fillna(..., inplace=True)`` form operates on a
    # temporary Series and is not guaranteed to update ``df`` (it does not
    # under pandas copy-on-write).
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    # One-hot encode Pclass. (Embarked is dropped above, so it is not encoded.)
    df = one_hot(df, ["Pclass"])
    # String to int
    df = str_to_int(df)
    # Age normalization. NOTE: the mean/std come from the frame passed in, so
    # separate calls for train and test scale Age with different statistics.
    df["Age"] = feature_normalize(df["Age"])
    return df

In [19]:
# Run the shared preprocessing on the training frame and rebind df_train to
# the result. NOTE: this cell is not re-runnable as-is; a second run passes an
# already-processed frame whose delete_columns are gone, and drop() raises.
df_train = pre_processing(df_train)
# Save PassengerId for evaluation before preprocessing, because
# pre_processing() drops that column.
test_passenger_id = df_test["PassengerId"]
df_test = pre_processing(df_test)

   Survived     Sex   Age  SibSp  Parch     Fare  Pclass_1  Pclass_2  Pclass_3
0         0    male  22.0      1      0   7.2500         0         0         1
1         1  female  38.0      1      0  71.2833         1         0         0
2         1  female  26.0      0      0   7.9250         0         0         1
3         1  female  35.0      1      0  53.1000         1         0         0
4         0    male  35.0      0      0   8.0500         0         0         1
Index(['Sex'], dtype='object')
      Sex   Age  SibSp  Parch     Fare  Pclass_1  Pclass_2  Pclass_3
0    male  34.5      0      0   7.8292         0         0         1
1  female  47.0      1      0   7.0000         0         0         1
2    male  62.0      0      0   9.6875         0         1         0
3    male  27.0      0      0   8.6625         0         0         1
4  female  22.0      1      1  12.2875         0         0         1
Index(['Sex'], dtype='object')


In [20]:
# Every column after the first is a feature; column 0 of the processed
# training frame is Survived, which becomes the label.
features = df_train.iloc[:, 1:].values
# Whole-matrix normalization is disabled; only Age is standardized, inside
# pre_processing().
# features = feature_normalize(features)
# Slicing with :1 (rather than indexing 0) keeps labels two-dimensional.
labels = df_train.iloc[:, :1].values
print(features.shape, labels.shape)
# Per-column variance of the features, shown as the cell's output. The fifth
# entry (Fare) is orders of magnitude larger than the rest because Fare is
# not normalized.
stats.describe(features).variance

(891, 8) (891, 1)


array([2.28474508e-01, 1.00112360e+00, 1.21604308e+00, 6.49728244e-01,
       2.46943685e+03, 1.83861083e-01, 1.64047466e-01, 2.47670210e-01])

In [21]:
# Per-column feature means, shown as the cell's output; the Age entry is ~0,
# consistent with the standardization done in pre_processing().
stats.describe(features).mean

array([6.47586981e-01, 2.23290646e-16, 5.23007856e-01, 3.81593715e-01,
       3.22042080e+01, 2.42424242e-01, 2.06509540e-01, 5.51066218e-01])

In [25]:
# DataFrame.values returns the frame's data as a NumPy array, which is why
# real_test_x is an ndarray rather than a DataFrame.
real_test_x = df_test.values
print(real_test_x.shape)

(418, 8)


In [27]:
# Confirm the container type produced by DataFrame.values.
type(real_test_x)

numpy.ndarray

In [28]:
# Rows x feature columns of the test matrix (same value already printed when
# real_test_x was created).
real_test_x.shape

(418, 8)

In [29]:
# Inspect the first processed test row; column order follows df_test
# (Sex, Age, SibSp, Parch, Fare, Pclass_1, Pclass_2, Pclass_3).
real_test_x[0]

array([1.       , 0.3349926, 0.       , 0.       , 7.8292   , 0.       ,
       0.       , 1.       ])