In [31]:
#Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [32]:
#Load dataset
train = pd.read_csv('https://raw.githubusercontent.com/akulapa/Data622-HW2/master/train.csv')

In [33]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [34]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [36]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

In [37]:
class derivedColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, *_):
        return self
    
    def transform(self, X, *_):
        
        df = X.copy()
        
        def cleanTicket(ticket):
            ticket = ticket.replace('.','')
            ticket = ticket.replace('/','')
            ticket = ticket.split()
            ticket = map(lambda t : t.strip(), ticket)
            ticket = filter(lambda t : t.isdigit(), ticket)
            ticket = ''.join(ticket)
            if len(ticket) > 0:
                return ticket
            else: 
                return np.nan
        
        if self.columns is not None:
            for column in self.columns:
                
                if column == 'Name':
                    #Wikipedia suggests "Jonkheer" and "Countess" is honorific titile
                    #https://en.wikipedia.org/wiki/Jonkheer
                    #https://en.wikipedia.org/wiki/Count
                    #Create dictionary and map them more generic values

                    Title_Dictionary = {
                        "Capt": "Officer",
                        "Col": "Officer",
                        "Major": "Officer",
                        "Jonkheer": "Royalty",
                        "Don": "Royalty",
                        "Sir" : "Royalty",
                        "Dr": "Officer",
                        "Rev": "Officer",
                        "the Countess":"Royalty",
                        "Mme": "Mrs",
                        "Mlle": "Miss",
                        "Ms": "Mrs",
                        "Mr" : "Mr",
                        "Mrs" : "Mrs",
                        "Miss" : "Miss",
                        "Master" : "Master",
                        "Lady" : "Royalty"
                    }

                    #Split name as comma seperated values and get second value(1) from the list
                    #Split the value again based on seperator(.), get first value(0)
                    #ultimately get prefix value
                    df['Title'] = df['Name'].map(lambda Name:Name.split(',')[1].split('.')[0].strip())
                    df['Title'] = df.Title.map(Title_Dictionary)

                if column == 'Ticket':
                    # map cleanTicket function and extract the value for each row:
                    df['TicketNumber'] = df['Ticket'].map(cleanTicket)

                if column == 'Family':
                    #Get number of members in the family
                    #Get family size
                    df['Family'] = (df['Parch'] + df['SibSp']).map(lambda s: 'Single' if s <= 1 else 'Small' if s >= 2 and s < 5 else 'Large' if s >= 5 else 'Unknown')
                    
                if column == 'Deck':
                    #Extract first letter from Cabin value, store it into new column 'Deck'
                    df['Deck'] = df.Cabin.str[0]
       
        return df
    
class eliminateColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, *_):
        return self
    
    def transform(self, X, *_):
        
        df = X.copy()
        
        if self.columns is not None:
            #Drop the variables
            df.drop(self.columns, inplace=True, axis=1)
       
        return df

class imputeColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, *_):
        return self
    
    def transform(self, X, *_):
        
        df = X.copy()
        
        if self.columns is not None:
            for column in self.columns:
                if column == 'Title':
                    df['Title'] = df['Title'].fillna('Miss')
                if column == 'Fare':
                    fareMedian = df[((df['Fare'].notnull()) & (df['Embarked']=='S'))].Fare.median()
                    df['Fare'] = df['Fare'].fillna(float(fareMedian))
                if column == 'Deck':
                    df['Deck'] = df['Deck'].fillna('U')
                if column == 'TicketNumber':
                    #Get median value
                    #Replace missing ticketNumber by median values
                    ticketMedian = df[df['TicketNumber'].notnull()].TicketNumber.median()
                    df['TicketNumber'] = df['TicketNumber'].fillna(int(ticketMedian))
                if column == 'Embarked':
                    #As fare matches the average fare impute missing value with 'C'
                    df['Embarked'] = df['Embarked'].fillna('C')
                if column == 'Age':

                    #Convet values into numeric categorical values
                    df['Sex'] = df['Sex'].map({'female':0, 'male':1})
                    df['Family'] = df['Family'].map({'Single':0, 'Small':1, 'Large':2})
                    df['Title'] = df['Title'].map({'Mrs':0, 'Miss':1, 'Mr':2, 'Master':3, 'Officer':4, 'Royalty':5})
                    df['Embarked'] = df['Embarked'].map({'C':0, 'Q':1, 'S':2})
                    df['Deck'] = df['Deck'].map({'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7, 'U':8})

                    #Convert variables into categorical variables
                    df['Sex'] = df['Sex'].astype('category')
                    df['Family'] = df['Family'].astype('category')
                    df['Title'] = df['Title'].astype('category')
                    df['Embarked'] = df['Embarked'].astype('category')
                    df['Deck'] = df['Deck'].astype('category')
                    df['Pclass'] = df['Pclass'].astype('category')

                    #Get observations with non NA values for Age
                    train_age = df.loc[(df.Age.notnull())]

                    #Get observations with NA values for Age
                    test_age = df.loc[(df.Age.isnull())]

                    #Seperate X and y values for training dataset
                    X_age_train = train_age.drop('Age', axis=1).values
                    y_age_train = train_age['Age'].values

                    #As we will imputing missing Age values, drop the column from test dataset
                    X_age_test = test_age.drop('Age', axis=1).values

                    #Lets use all parameters as default, with number trees as 500 in random forest regressor
                    ramdomForest = RandomForestRegressor(n_estimators=500, n_jobs=-1)
                    ramdomForest.fit(X_age_train, y_age_train)

                    #Predict the missing values
                    predictedAges = ramdomForest.predict(X_age_test)

                    #Apply to the dataset
                    df.loc[(df.Age.isnull()), 'Age'] = predictedAges


        return df

In [38]:
derived_cols = ['Name', 'Ticket', 'Family', 'Deck']
elmi_cols = ['Name', 'Ticket', 'Cabin', 'PassengerId']
imp_cols = ['Title', 'Fare', 'Deck', 'TicketNumber','Embarked','Age']

pipeline = Pipeline([
    
    ('d_cols', derivedColumns( derived_cols )),
    ('e_cols', eliminateColumns( elmi_cols )),
    ('i_cols', imputeColumns( imp_cols )),
])


In [39]:
A=pipeline.fit_transform(X_train)
print(A.shape)
A

(891, 11)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,TicketNumber,Family,Deck
0,3,1,22.000000,1,0,7.2500,2,2,21171,0,8
1,1,0,38.000000,1,0,71.2833,0,0,17599,0,2
2,3,0,26.000000,0,0,7.9250,2,1,3101282,0,8
3,1,0,35.000000,1,0,53.1000,2,0,113803,0,2
4,3,1,35.000000,0,0,8.0500,2,2,373450,0,8
5,3,1,29.974000,0,0,8.4583,1,2,330877,0,8
6,1,1,54.000000,0,0,51.8625,2,2,17463,0,4
7,3,1,2.000000,3,1,21.0750,2,3,349909,1,8
8,3,0,27.000000,0,2,11.1333,2,0,347742,1,8
9,2,0,14.000000,1,0,30.0708,0,0,237736,0,8


In [40]:
print("Variables with missing values")
A.isnull().sum()

Variables with missing values


Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
Title           0
TicketNumber    0
Family          0
Deck            0
dtype: int64

In [41]:
pil = Pipeline([
    ('final_dataset', pipeline),
    ('clsr', RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42))
])

pil.fit(X_train, y_train)
y_preds = pil.predict(X_train)

y_preds


array([ 0.022     ,  0.996     ,  0.848     ,  0.95      ,  0.        ,
        0.06      ,  0.168     ,  0.192     ,  0.76      ,  0.962     ,
        0.854     ,  0.874     ,  0.078     ,  0.006     ,  0.214     ,
        0.846     ,  0.064     ,  0.494     ,  0.196     ,  0.906     ,
        0.016     ,  0.798     ,  0.784     ,  0.754     ,  0.078     ,
        0.642     ,  0.004     ,  0.154     ,  0.986     ,  0.        ,
        0.084     ,  0.974     ,  0.914     ,  0.008     ,  0.088     ,
        0.058     ,  0.33      ,  0.022     ,  0.102     ,  0.774     ,
        0.08      ,  0.414     ,  0.042     ,  0.902     ,  0.988     ,
        0.104     ,  0.        ,  0.91      ,  0.158     ,  0.158     ,
        0.006     ,  0.08      ,  0.998     ,  0.994     ,  0.072     ,
        0.71      ,  1.        ,  0.044     ,  0.996     ,  0.012     ,
        0.09      ,  1.        ,  0.05      ,  0.056     ,  0.162     ,
        0.93      ,  1.        ,  0.232     ,  0.824     ,  0.12