# Serialize Data

This notebook does not do much. It just runs all the steps of Week2 and saves the train data and test data in one place. This is very useful if you want to run several experiments with multiple classifiers that use the same data.
It avoids repeating redundant feature-handling operations.

It would be useful to customize this. To do so, keep the following in mind:
* Modify which features to "categorize"
* Modify which features to drop
* Modify how to impute missing data
* Modify how to quantify the categorical values
* Convert this into a module (maybe *datasethandler*?) which can :-
    * Accept global parameters
    * Cache data if the sub-dataset for the given parameters has already been computed
    * [Optional] Parallelize
    * [Optional] Make it dataset agnostic, so it will be able to download different kinds of datasets.
    * _A dataset loader made in the above way is very useful to run hyperparameter experiments_

In [2]:
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt
import seaborn as sns
import sys


# Silence every warning for the rest of the session. This keeps the notebook
# output readable, but it also hides deprecation notices from the libraries.
import warnings
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed. Both statements below set
# the same 'display.max_columns' option.
pd.options.display.max_columns = None
pd.set_option('display.max_columns', None)

# Put the input directory first on the module search path.
sys.path.insert(0,'../input')

# Use the 'fivethirtyeight' matplotlib style for any plots.
plt.style.use('fivethirtyeight')

# Load the train and test splits, using each file's 'id' column as the row index.
train_data = pd.read_csv('../input/train.csv',index_col='id')
test_data = pd.read_csv('../input/test.csv',index_col='id')

def simplify_fares(df):
    """Replace the numeric ``fare`` column with labelled price bands.

    Missing fares are filled with -0.5 first, which places them in the
    lowest band and gives them the 'Unknown' label. Band edges are
    right-inclusive (the ``pd.cut`` default), so a fare of exactly 0 is
    also labelled 'Unknown'. Fares outside ``(-1, 1000]`` match no band and
    become NaN.

    The frame is modified in place and returned for chaining.
    """
    band_edges = (-1, 0, 8, 15, 31, 1000)
    band_labels = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    known_fares = df.fare.fillna(-0.5)
    df['fare'] = pd.cut(known_fares, band_edges, labels=band_labels)
    return df

def simplify_cabins(df):
    """Reduce every ``cabin`` value to its first character.

    Missing cabins are replaced by 'N' before the first character is taken,
    so they end up as 'N'. The frame is modified in place and returned.
    """
    cabins = df.cabin.fillna('N')
    df['cabin'] = cabins.apply(lambda code: code[0])
    return df


def format_name(df):
    """Add ``lname`` and ``nameprefix`` columns derived from ``name``.

    Each name is split on single spaces. ``lname`` is the first token and
    ``nameprefix`` is the second token, both kept verbatim (including any
    punctuation attached to them).

    A missing (non-string) name, or a name with no second token, yields a
    single space ``' '`` in the affected column. The original code called
    ``fillna(' ')`` for this purpose but discarded the result, and it raised
    on such rows before that call was even reached.

    The frame is modified in place and returned for chaining.
    """
    def _token(name, position):
        # Missing names arrive as None/NaN rather than str.
        if not isinstance(name, str):
            return ' '
        parts = name.split(' ')
        return parts[position] if len(parts) > position else ' '

    names = df['name']
    df['lname'] = names.apply(lambda x: _token(x, 0))
    df['nameprefix'] = names.apply(lambda x: _token(x, 1))
    return df


def drop_features(df):
    """Return a new frame without the columns that are not used as features.

    The input frame itself is not modified. A KeyError is raised by pandas
    if any of the listed columns is absent.
    """
    unused_columns = ['ticket', 'name', 'embarked', 'home.dest', 'body', 'boat']
    trimmed = df.drop(unused_columns, axis=1)
    return trimmed

def transform_features(df):
    """Run the full feature-engineering pipeline on one data split.

    The steps run in a fixed order: fares are binned, cabins reduced to
    their first letter, name tokens extracted, and finally the unused
    columns dropped. The name step must run before the drop because the
    drop removes the ``name`` column.
    """
    pipeline = (simplify_fares, simplify_cabins, format_name, drop_features)
    for step in pipeline:
        df = step(df)
    return df


# Apply the same feature engineering to both splits; each name is rebound to
# the frame returned by the pipeline.
train_data = transform_features(train_data)
test_data  = transform_features(test_data)

from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Label-encode the categorical columns of both splits consistently.

    One ``LabelEncoder`` per column is fitted on the train and test values
    stacked together, so a value that occurs in only one split still gets a
    code and both splits share the same value-to-code mapping.

    Both frames are modified in place and returned as ``(train, test)``.
    """
    categorical = ['fare', 'cabin', 'sex', 'lname', 'nameprefix']
    stacked = pd.concat([df_train[categorical], df_test[categorical]])

    for column in categorical:
        encoder = preprocessing.LabelEncoder().fit(stacked[column])
        for frame in (df_train, df_test):
            frame[column] = encoder.transform(frame[column])
    return df_train, df_test

# Encode the categorical columns of both splits with one shared set of encoders.
train_data, test_data = encode_features(train_data, test_data)

def fill_missing_data(df_train,df_test):
    """Fill missing values in the numeric feature columns with the mean.

    The mean of each column is computed over the train and test rows taken
    together, and that value replaces every NaN in both splits.

    This is the same mean imputation that ``preprocessing.Imputer`` did by
    default. That class was removed from scikit-learn in 0.22, so the
    imputation is done with pandas and works on any scikit-learn version.

    Both frames are modified in place and returned as ``(train, test)``.
    """
    features = ['age']
    df_combined = pd.concat([df_train[features], df_test[features]])
    # Series indexed by column name; fillna uses it as a per-column fill value.
    column_means = df_combined.mean()
    # Cast to float first, matching the float output the imputer produced.
    df_train[features] = df_train[features].astype(float).fillna(column_means)
    df_test[features] = df_test[features].astype(float).fillna(column_means)
    return df_train, df_test

# Fill the gaps in the columns handled by fill_missing_data (its feature list is ['age']).
train_data,test_data = fill_missing_data(train_data,test_data)

def get_X_Y_pair(df):
    """Split a frame into its feature columns and its ``survived`` column.

    Returns a ``(X, y)`` tuple: ``X`` is a DataFrame holding every column
    except 'survived' in its original order, and ``y`` is the 'survived'
    Series.
    """
    target = 'survived'
    predictors = [column for column in df.columns.values if column != target]
    inputs = df[predictors]
    labels = df[target]
    return inputs, labels

def scale_data(df_train, df_test):
    """Standardize the features of both splits with one shared scaler.

    A ``StandardScaler`` is fitted on the train and test rows stacked
    together, then applied to each split separately, so both are scaled with
    the same per-column statistics.

    Returns ``(scaled_train, scaled_test)`` exactly as produced by
    ``scaler.transform``. The caller treats these as no longer being pandas
    objects; with scikit-learn's default output configuration they are
    NumPy arrays.
    """
    scaler = preprocessing.StandardScaler()
    scaler.fit(pd.concat([df_train, df_test]))
    return scaler.transform(df_train), scaler.transform(df_test)

# Separate each split into its feature columns and its 'survived' column.
x_train, y_train = get_X_Y_pair(train_data)
x_test, y_test = get_X_Y_pair(test_data)

# Not pandas after this: x_train / x_test become whatever the scaler's
# transform returns. Only the features are scaled; y_train / y_test are
# left as returned by get_X_Y_pair.
x_train, x_test = scale_data(x_train,x_test)

import joblib
# Save each split as one (x, y) tuple. The relative file names mean the
# files land in the current working directory.
joblib.dump((x_train,y_train),"traindata.pkl")
joblib.dump((x_test, y_test), "testdata.pkl")

#import _pickle as pkl
#pkl.dumps((x_test, y_test), open("testdata.pkl","w"))
#pkl.dumps((x_train, y_train), open("traindata.pkl","w"))


['testdata.pkl']