# Basic Overview

The idea here is to implement SVC algorithm on tickets field and see if that helps us in prediction survivals accurately.

Comments/criticisms/appreciations are greatly accepted and appreciated.

Source of data : https://www.kaggle.com/c/titanic/data

In [43]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [44]:
# Make sure that unnecessary  warnings are avoided.
# Thanks to https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)


In [45]:
# Ha 
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [46]:
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

### Using SVC

In [47]:
# We use a pipeline to make things easire
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
svc_clf = Pipeline([('vect', TfidfVectorizer()),
                    ('transformer', TfidfTransformer()),
                    ('classify', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, 
                                               random_state=0, max_iter=5, tol=None))])

#### Training on some portion of input data and using the other portion as a test set

In [48]:
svc_clf.fit(train_data['Ticket'][:700], train_data['Survived'][:700])

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...lty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [49]:
predictions = svc_clf.predict(train_data['Ticket'][700:]) 

In [50]:
survived_or_not = train_data['Survived'][700:]

In [51]:
np.mean(predictions == survived_or_not)

0.7329842931937173

Comment : The results look encouraging. Let us find optimal parameters using GridSearch

### Finding optimal parameters using GridSearch

In [52]:
parameters = {'vect__ngram_range' : [(1, 1), (2, 2), (3 , 3)],
              'transformer__use_idf' : (True, False),
              'classify__alpha' : (1e-2, 1e-3),
              }

In [53]:
gs_clf = GridSearchCV(svc_clf, parameters, n_jobs=-1)

In [54]:
gs_clf.fit(train_data['Ticket'], train_data['Survived'])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...lty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (2, 2), (3, 3)], 'transformer__use_idf': (True, False), 'classify__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [55]:
cv_result = pd.DataFrame(gs_clf.cv_results_)

In [56]:
gs_clf.best_params_

{'classify__alpha': 0.001,
 'transformer__use_idf': True,
 'vect__ngram_range': (1, 1)}


### Training the best model on entire training set

After that, we use the same to generate predictions on the test set provided by kaggle as well.

In [57]:
# We use a pipeline to make things easire
from sklearn.linear_model import SGDClassifier
best_model_svc = Pipeline([('vect', TfidfVectorizer()),
                           ('transformer', TfidfTransformer(use_idf=False)),
                           ('classify', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, 
                                                      random_state=0, max_iter=5, tol=None))])

In [58]:
best_model_svc.fit(train_data['Ticket'], train_data['Survived'])

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...lty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [59]:
# Generate out of sample predictions
predictions = best_model_svc.predict(test_data['Ticket'])

In [60]:
test_data['Predictions'] = predictions

In [61]:
kaggle_data = test_data[['PassengerId', 'Predictions']].copy()
kaggle_data.rename(columns={'Predictions' : 'Survived'}, inplace=True)
kaggle_data.sort_values(by=['PassengerId']).to_csv('kaggle_out_svc_tickets.csv', index=False)