# Titanic Solution with Pipeline 
- Build a model using Pipeline (make_pipeline)
- Use Column Transformers (make_column_transformer) to transform subsets of Variables
- Use (SimpleImputer) to treat Missing Values in both Numeric and Categorical Variables
- Use (KBinsDiscretizer, OneHotEncoder) for Binning and creating Dummy Variables
- Build our Model using Logistic Regression
- Measure Accuracy of the Model

## Import Libraries

In [2]:
import pandas as pd

import sklearn.model_selection as model_selection

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#For Missing Value Treatment
from sklearn.impute import SimpleImputer

#For Binning and creating Dummy Variables
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score

## Import Data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Check Information of Columns

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Divide Data into Target and X Variables

In [5]:
y = train['Survived']
X = train.drop(['Survived','PassengerId'], axis = 1)

## Split Titanic Train data into Train and Test Dataset

In [6]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, random_state = 200)

## Classify Features based on how we want to deal with them

In [7]:
numeric_spl_features = ['Pclass','SibSp', 'Parch']

object_spl_features = ['Name', 'Ticket', 'Cabin']

numerical_features = [c for c, dtype in zip(X.columns, X.dtypes)
                     if dtype.kind in ['i','f'] and c not in numeric_spl_features]
categorical_features = [c for c, dtype in zip(X.columns, X.dtypes)
                     if dtype.kind not in ['i','f'] and c not in object_spl_features]

#cabin_spl_feature = ['Cabin']

#name_spl_feature = ['Name']

#ticket_spl_feature = ['Ticket']

In [8]:
print('Numerical : ' + str(numerical_features))
print('Categorical : ' + str(categorical_features))
print('Numeric Special : ' + str(numeric_spl_features))

Numerical : ['Age', 'Fare']
Categorical : ['Sex', 'Embarked']
Numeric Special : ['Pclass', 'SibSp', 'Parch']


## Make Column Transformer with multiple Pipeline

In [9]:
preprocessor = make_column_transformer(
    
    (make_pipeline(
    SimpleImputer(strategy = 'median'),
    KBinsDiscretizer(n_bins=4)), numerical_features),
    
    (make_pipeline(
    OneHotEncoder(categories = 'auto',handle_unknown = 'ignore')), numeric_spl_features),
    
    (make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    OneHotEncoder(categories = 'auto', handle_unknown = 'ignore')), categorical_features),
    
)

## Make Logistic Model Pipeline

In [10]:
logModel = make_pipeline(preprocessor, LogisticRegression(solver = 'saga', max_iter = 1000))

## Check Train and Test Scores

In [11]:
#Train_Score
train_scores = cross_val_score(logModel, X_train, y_train, cv = 7)

#Test Score
test_scores = cross_val_score(logModel, X_test, y_test, cv = 7)

#Print Train and Test Score
print(f'Train Accuracy : {train_scores.mean():.3f} +/- {train_scores.std():.2f}')
print(f'Test Accuracy : {test_scores.mean():.3f} +/- {test_scores.std():.2f}')

Train Accuracy : 0.810 +/- 0.02
Test Accuracy : 0.738 +/- 0.06


## Fit Model 

In [12]:
logModel.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

## Extract only X Variables for Prediction

In [13]:
X_submission = test.drop(['PassengerId'], axis = 1)
X_submission.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

## Make Prediction and Export CSV

In [14]:
prediction = logModel.predict(X_submission).astype(int)

In [None]:
#Create Submission dataframe 
submission_df = pd.DataFrame({'PassengerId' : test['PassengerId'], 'Survived' : prediction})

submission_df.to_csv('LOG_Model_Pipeline.csv', index = False)

In [None]:
## Scores 0.77033 when Submitted to Kaggle