# Setup

In [11]:
#Common imports
import numpy as np
import os
import sys

#To make outputs more consistent
np.random.seed(42)

#To Save & Load Models
import pickle

#To plot figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as pt

pt.rcParams['axes.labelsize'] = 14
pt.rcParams['xtick.labelsize'] = 12
pt.rcParams['ytick.labelsize'] = 12

#Folder Directory Structure
PROJECT_ROOT_DIR ='.'
PROJECT_FOLDER = 'titanic_kaggle'
PROJECT_ID='end_to_end_project_titanic_kaggle'
PROJECT_OUTPUT_PATH = os.path.join(PROJECT_ROOT_DIR,'model',PROJECT_ID)


def save_fig(fig_id,tight_layout=True,fig_extension='png',resolution=300):
    if not os.path.exists(IMAGES_PATH):
        os.makedirs(PROJECT_OUTPUT_PATH)
    path = os.path.join(PROJECT_OUTPUT_PATH,fig_id + '.' + fig_extension)
    print("Saving Figure : {}".format(fig_id))
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path,format=fig_extension,dpi=resolution)
#Saving the Model
import pickle

def save_model(model,model_name):
    model_file = os.path.join(PROJECT_OUTPUT_PATH,model_name+'.pkl')
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    
def load_model(model,model_name):
    model_file = os.path.join(PROJECT_OUTPUT_PATH,model_name+'.pkl')
    with open(model_file, 'rb') as f:
        model = pickle.load(f)
    return model

### Step 1 : Download the Data

In [13]:
import tarfile
from six.moves import urllib
from six.moves import urllib

#Server Location
DOWNLOAD_ROOT = "https://www.kaggle.com/c/3136/download/"
TITANIC_PATH = os.path.join("data", PROJECT_FOLDER)
TITANIC_GENDER_SUBMISSION = DOWNLOAD_ROOT + "gender_submission.csv"
TITANIC_TRAIN_SET = DOWNLOAD_ROOT + "train.csv"
TITANIC_TEST_SET = DOWNLOAD_ROOT + "test.csv"

def fetch_data(file='train.csv',url=TITANIC_TRAIN_SET,path=TITANIC_PATH,):
    print('URL to Download : {}'.format(url))
    print('Path to Download : {} '.format(path))
    if not os.path.isdir(path):
        os.makedirs(path)
    csv_path =  os.path.join(path,file)
    if not os.path.exists(csv_path):
        print('Downloading data ...')
        urllib.request.urlretrieve(url, csv_path)

#Downloading & Extract the data
fetch_data()

URL to Download : https://www.kaggle.com/c/3136/download/train.csv
Path to Download : data\titanic_kaggle 
Downloading data ...


### Step 2 - Load Data

In [17]:
import pandas as pd

def load_data(path=TITANIC_PATH,file='train.csv'):
    data_file = os.path.join(path,file)
    return pd.read_csv(data_file)

#Load data
print('Loading the Data...')
titanic = load_data()
#Verift the data
titanic.head(2)

Loading the Data...


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


### Step 3 - Train & Test Data Set

In [22]:
titanic_train = load_data(file='train.csv')
titanic_test = load_data(file='test.csv')
print('Titanic Training Data : {}'.format(titanic_train.shape))
print('Titanic Test Data : {}'.format(titanic_test.shape))

Titanic Training Data : (891, 12)
Titanic Test Data : (418, 11)


In [23]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [24]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Step 3 - Data Visualization & Inspection

### Step 5 - Data Pre-processing

In [25]:
#Lets check which rows have null Embarked column
titanic[titanic['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,
