# Basic Overview
This is an attempt to see whether we can build a third, uncorrelated model by applying support vector classification (SVC) to the ticket field.

The idea is as follows:
1. Generate a survived-or-not prediction by running SVC on the ticket field.
2. Use this prediction as an additional feature alongside other features. This way, we hope to improve upon a base model that uses the ticket alone.


Comments, criticisms and appreciation are all welcome. Do not be shy: send me an email at babinu@gmail.com!

Source of data : https://www.kaggle.com/c/titanic/data

In [260]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [261]:
# Load the training set and the test set from the local input directory.
# The test file is later indexed with a 'Survived' column, so it is expected
# to carry labels (presumably a pre-processed copy, judging by its file name).
train_data = pd.read_csv(filepath_or_buffer="../input/train.csv")
test_data = pd.read_csv(filepath_or_buffer="../input/test_data_processed_correct.csv")

In [262]:
# Family_Size is the sum of the SibSp and Parch columns plus one
# (presumably the extra one counts the passenger themself).
train_data['Family_Size'] = train_data['SibSp'].add(train_data['Parch']).add(1)


#### Pipeline for SVC


In [263]:
# We use a pipeline to make things easier: the raw ticket strings are
# vectorised, re-weighted without IDF, and fed to a linear SVM
# (SGDClassifier with hinge loss).
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
best_model_svc = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('transformer', TfidfTransformer(use_idf=False)),
    ('classify', SGDClassifier(loss='hinge',
                               penalty='l2',
                               alpha=1e-3,
                               random_state=0,
                               max_iter=5,
                               tol=None)),
])

Comment : Let us see whether combining this with other predictors gives a more refined indicator.

#### Pipeline for xgboost (which receives the output of the SVC)

In [264]:
from sklearn.preprocessing import Imputer
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

In [265]:
import warnings

# Silence DeprecationWarning for the rest of the session.
warnings.simplefilter('ignore', category=DeprecationWarning)


In [266]:
def get_train_test_data(train_data,
                        train_test_X_columns=None,
                        train_test_Y_columns=None):
    """Select feature/target columns and split them into train and test parts.

    'Ticket' is always included among the selected feature columns, because
    the downstream SVC step reads it.

    Parameters
    ----------
    train_data : DataFrame holding the feature and target columns.
    train_test_X_columns : iterable of feature column names; defaults to
        ['Family_Size']. The caller's object is never modified.
    train_test_Y_columns : target column selector passed to
        ``train_data[...]``; defaults to ['Survived'].

    Returns
    -------
    tuple ``(X_train, X_test, Y_train, Y_test, X, Y)`` where ``X``/``Y`` are
    the unsplit selections and ``X_train``/``X_test`` are independent copies.
    """
    if train_test_X_columns is None:
        train_test_X_columns = ['Family_Size']
    if train_test_Y_columns is None:
        train_test_Y_columns = ['Survived']

    # Simple training and testing
    print("TEST", train_test_X_columns)

    # Work on a fresh list: appending to the argument itself would leak
    # 'Ticket' into the caller's list (and into a shared default) across calls.
    rel_cols = list(train_test_X_columns)
    if 'Ticket' not in rel_cols:
        rel_cols.append('Ticket')
    X = train_data[rel_cols]
    Y = train_data[train_test_Y_columns]

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

    # Copy the feature frames so later in-place column additions do not
    # touch (or warn about) the frame they were sliced from.
    X_train = X_train.copy()
    X_test = X_test.copy()
    return (X_train, X_test, Y_train, Y_test, X, Y)


In [286]:
def create_pipeline_and_out_of_sample_score(X_train, X_test, Y_train, Y_test, n_estimators=100,
                                            learning_rate=0.1):
    """Fit the ticket SVC and the downstream XGBoost pipeline, then score on X_test.

    Steps:
      1. Fit a text pipeline (TfidfVectorizer -> TfidfTransformer ->
         SGDClassifier with hinge loss) on ``X_train['Ticket']``.
      2. Store its predictions as a new 'Ticket_Survived' column in BOTH
         ``X_train`` and ``X_test``. The frames are modified in place.
      3. Fit ``Imputer -> XGBClassifier`` on every column except 'Ticket'.
      4. Predict on ``X_test`` and report ``1 - mean_absolute_error`` as the
         score (for 0/1 labels this is the fraction of correct predictions).

    Parameters
    ----------
    X_train, X_test : DataFrames containing a 'Ticket' column plus any
        additional feature columns.
    Y_train, Y_test : targets; ``Y_train`` must expose ``.values``.
    n_estimators, learning_rate : forwarded to ``XGBClassifier``.

    Returns
    -------
    tuple ``(fitted_pipeline, out_of_sample_score, test_predictions)``.
    """
    # Rewrite everything as a pipeline
    print("TESTc", X_train.columns)
    y_fit = Y_train.values.ravel()

    ticket_svc = Pipeline(steps=[
        ('vect', TfidfVectorizer()),
        ('transformer', TfidfTransformer(use_idf=False)),
        ('classify', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                   random_state=0, max_iter=5, tol=None)),
    ])
    ticket_svc.fit(X_train['Ticket'], y_fit)

    # NOTE(review): these are in-sample predictions (same rows the SVC was
    # fitted on), so the training-set meta-feature may look better than the
    # one computed for X_test below. Confirm this is intended.
    X_train['Ticket_Survived'] = ticket_svc.predict(X_train['Ticket'])

    booster = XGBClassifier(seed=1, n_estimators=n_estimators, learning_rate=learning_rate)
    my_pipeline = make_pipeline(Imputer(), booster)

    # The raw ticket text is not fed to the booster; only its SVC summary is.
    feature_cols = [name for name in X_train.columns if name != 'Ticket']
    print("TESTc2", feature_cols)
    my_pipeline.fit(X_train[feature_cols], y_fit)

    X_test['Ticket_Survived'] = ticket_svc.predict(X_test['Ticket'])
    test_predictions = my_pipeline.predict(X_test[feature_cols])
    out_of_sample_score = 1.0 - mean_absolute_error(test_predictions, Y_test)
    return (my_pipeline, out_of_sample_score, test_predictions)

In [279]:
def train_test(train_data,
               train_X_columns=None,
               train_Y_columns=None):
    """Split ``train_data``, fit the SVC + XGBoost stack and print hit counts.

    Parameters
    ----------
    train_data : DataFrame with the feature columns, a 'Ticket' column and
        the target column(s).
    train_X_columns : iterable of feature column names; defaults to
        ['Family_Size']. 'Ticket' is added automatically for the SVC step.
    train_Y_columns : target column selector; defaults to ['Survived'].

    Returns
    -------
    None. Results are reported through ``print``.
    """
    if train_X_columns is None:
        train_X_columns = ['Family_Size']
    if train_Y_columns is None:
        train_Y_columns = ['Survived']

    rel_cols = list(train_X_columns)
    if 'Ticket' not in rel_cols:
        rel_cols.append('Ticket')
    print(rel_cols)
    print(train_X_columns)
    (X_train, X_test, Y_train, Y_test, X, Y) = get_train_test_data(train_data, rel_cols, train_Y_columns)

    # This call adds the 'Ticket_Survived' column to X_train and X_test in
    # place; the training-set prediction below relies on that.
    my_pipeline, out_of_sample_score, predictions_test = \
        create_pipeline_and_out_of_sample_score(X_train, X_test, Y_train, Y_test)
    print("Number of entries in training set is {0}".format(len(X_train)))

    # The fitted pipeline saw every column except the raw 'Ticket' text, so
    # drop 'Ticket' here even when the caller listed it explicitly.
    rel_cols_predict = [col for col in train_X_columns if col != 'Ticket']
    rel_cols_predict.append('Ticket_Survived')
    print(rel_cols_predict)
    predictions_train = my_pipeline.predict(X_train[rel_cols_predict])

    # accuracy * n is a float that can land just below the true integer
    # count (e.g. 635.9999...), so round instead of truncating.
    train_accuracy = 1.0 - mean_absolute_error(predictions_train, Y_train)
    num_correct_predictions_train = int(round(train_accuracy * len(Y_train)))
    num_correct_predictions_test = int(round(out_of_sample_score * len(X_test)))

    print("Number of correct predictions in training set is {0}".format(num_correct_predictions_train))
    print("Number of entries in test set is {0}".format(len(X_test)))
    print("Number of correct predictions in test set is {0}".format(num_correct_predictions_test))


#### Performance when holding out part of the input data as a test set

In [None]:
# NOTE(review): this assignment has no right-hand side in the source, so the
# cell cannot run as written. The intended definition of 'Sex_new' is not
# visible here. TODO: complete or remove.
train_data['Sex_new'] = 

In [297]:
# Hold-out evaluation using 'Fare' plus the ticket-derived SVC feature.
train_test(train_data, train_X_columns=['Fare'])

['Fare', 'Ticket']
['Fare']
TEST ['Fare', 'Ticket']
TESTc Index(['Fare', 'Ticket'], dtype='object')
TESTc2 ['Fare', 'Ticket_Survived']
Number of entries in training set is 668
['Fare', 'Ticket_Survived']
Number of correct predictions in training set is 636
Number of entries in test set is 223
Number of correct predictions in test set is 161


Comment : Performance looks reasonable, though not great. Let us see how things look on out-of-sample data.

In [None]:
# Mirror the training-set feature: SibSp + Parch + 1.
test_data['Family_Size'] = test_data['SibSp'].add(test_data['Parch']).add(1)
# Placeholder value for the SVC-derived column on the test set.
test_data['Ticket_Survived'] = -1

In [294]:
# Fit on the complete training set and score against the labelled test set.
# Copies are taken because the call below adds a column to both feature
# frames in place.
X_train = train_data.loc[:, ['Fare', 'Ticket']].copy()
X_test = test_data.loc[:, ['Fare', 'Ticket']].copy()
Y_train = train_data.loc[:, ['Survived']].copy()
Y_test = test_data.loc[:, ['Survived']].copy()

create_pipeline_and_out_of_sample_score(X_train, X_test, Y_train, Y_test)

TESTc Index(['Fare', 'Ticket'], dtype='object')
TESTc2 ['Fare', 'Ticket_Survived']


(Pipeline(memory=None,
      steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, mi...=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1, silent=True,
        subsample=1))]),
 0.69377990430622,
 array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 0