In [218]:
#Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [219]:
#Load dataset (read directly from GitHub, so this cell needs network access)
#NOTE(review): the frame is named `train`, but the file fetched is test.csv and the
#columns listed below contain no 'Survived' target - confirm this is the intended file
train = pd.read_csv('https://raw.githubusercontent.com/akulapa/Data622-HW2/master/test.csv')

In [220]:
#Show the column labels of the loaded frame
train.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [221]:
#Preview the first rows of the raw data (head() shows 5 rows by default)
train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [222]:
#Column labels again - same expression as In [220]; nothing has modified the frame in between
train.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [223]:
class derivedColumns(BaseEstimator, TransformerMixin):
    """Add engineered feature columns to a passenger DataFrame.

    ``columns`` is a list of keywords, each switching on one derivation:

    * ``'Name'``   -> ``Title``: honorific parsed from ``Name`` and grouped
      into Mr / Mrs / Miss / Master / Officer / Royalty.
    * ``'Ticket'`` -> ``TicketNumber``: the all-digit tokens of ``Ticket``
      joined into one string.
    * ``'Family'`` -> ``Family``: size bucket computed from ``Parch + SibSp``.
    * ``'Deck'``   -> ``Deck``: first character of ``Cabin``.

    Keywords other than these four are ignored, and ``columns=None`` makes
    ``transform`` return a plain copy. The input frame is never modified.
    """

    #Honorific -> generic title group. Honorifics missing from this table map to NaN.
    #Wikipedia lists "Jonkheer" and "Countess" as honorific titles:
    #https://en.wikipedia.org/wiki/Jonkheer
    #https://en.wikipedia.org/wiki/Count
    _TITLE_GROUPS = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Dona": "Royalty",  #feminine form of "Don"; previously absent, so it fell through to NaN
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty",
    }

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *_):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    @staticmethod
    def _extract_title(name):
        """Return the honorific between the first ',' and the following '.'.

        A missing/non-string name, or one without a comma, yields NaN instead
        of raising.
        """
        if not isinstance(name, str):
            return np.nan
        pieces = name.split(',')
        if len(pieces) < 2:
            return np.nan
        return pieces[1].split('.')[0].strip()

    @staticmethod
    def _clean_ticket(ticket):
        """Return the digit-only tokens of a ticket joined together, or NaN.

        '.' and '/' are removed first, the text is split on whitespace and
        every token made up solely of digits is kept. When several numeric
        tokens exist they are concatenated in order. Tickets with no numeric
        token, and missing tickets, give NaN.
        """
        #A purely numeric Ticket column is parsed by pandas as integers, not strings
        if isinstance(ticket, (int, np.integer)) and not isinstance(ticket, bool):
            return str(ticket)
        if not isinstance(ticket, str):
            return np.nan
        tokens = ticket.replace('.', '').replace('/', '').split()
        digits = ''.join(tok for tok in tokens if tok.isdigit())
        return digits if digits else np.nan

    @staticmethod
    def _family_bucket(size):
        """Bucket ``Parch + SibSp``: <=1 'Single', 2-4 'Small', >=5 'Large'.

        Anything that matches none of the ranges (e.g. NaN) is 'Unknown'.
        """
        if size <= 1:
            return 'Single'
        if 2 <= size < 5:
            return 'Small'
        if size >= 5:
            return 'Large'
        return 'Unknown'

    def transform(self, X, *_):
        """Return a copy of ``X`` with the requested derived columns added."""
        df = X.copy()

        if self.columns is None:
            return df

        for column in self.columns:
            if column == 'Name':
                #Pull the raw honorific, then collapse it to its generic group
                honorifics = df['Name'].map(self._extract_title)
                df['Title'] = honorifics.map(self._TITLE_GROUPS)
            elif column == 'Ticket':
                df['TicketNumber'] = df['Ticket'].map(self._clean_ticket)
            elif column == 'Family':
                #Relatives aboard = parents/children + siblings/spouses
                df['Family'] = (df['Parch'] + df['SibSp']).map(self._family_bucket)
            elif column == 'Deck':
                #First letter of the Cabin value; stays NaN where Cabin is missing
                df['Deck'] = df['Cabin'].str[0]

        return df
    
class eliminateColumns(BaseEstimator, TransformerMixin):
    """Transformer that hands back the frame without the configured columns."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *_):
        """No fitting is required; returns ``self``."""
        return self

    def transform(self, X, *_):
        """Return a new frame with ``self.columns`` removed.

        With ``columns=None`` an untouched copy of ``X`` is returned. The
        caller's frame is left as it was in both cases.
        """
        if self.columns is None:
            return X.copy()

        #Non-inplace drop already produces a fresh frame
        return X.drop(self.columns, axis=1)
    
class imputeColumns(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns of a passenger DataFrame.

    ``columns`` is a list of keywords processed in the order given:

    * ``'Title'`` -> missing ``Title`` becomes ``'Miss'``.
    * ``'Fare'``  -> missing ``Fare`` becomes the median ``Fare`` of the rows
      whose ``Embarked`` is ``'S'``.
    * ``'Deck'``  -> missing ``Deck`` becomes ``'U'``.
    * ``'Age'``   -> ``Sex``, ``Family``, ``Title``, ``Embarked`` and ``Deck``
      are replaced by integer codes and, together with ``Pclass``, cast to
      ``category``; a random forest fitted on the rows with a known ``Age``
      (all other columns as predictors) then predicts the missing ages.

    Because the ``'Age'`` step encodes ``Title`` and ``Deck``, list their
    keywords before ``'Age'``. Running the ``'Age'`` step on a frame that is
    already encoded maps every code to NaN, so apply it once per frame.

    ``fit`` learns nothing: the fare median and the age model are computed
    from whatever frame is passed to ``transform``.

    Parameters
    ----------
    columns : list of str or None
        Keywords described above; ``None`` makes ``transform`` a plain copy.
    random_state : int or None, default 42
        Seed for the age model so repeated runs impute the same ages. Pass
        ``None`` for unseeded behaviour.
    """

    #Label -> integer code tables used by the 'Age' step (labels not listed become NaN)
    _CATEGORY_CODES = {
        'Sex': {'female': 0, 'male': 1},
        'Family': {'Single': 0, 'Small': 1, 'Large': 2},
        'Title': {'Mrs': 0, 'Miss': 1, 'Mr': 2, 'Master': 3, 'Officer': 4, 'Royalty': 5},
        'Embarked': {'C': 0, 'Q': 1, 'S': 2},
        'Deck': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7, 'U': 8},
    }

    def __init__(self, columns, random_state=42):
        self.columns = columns
        self.random_state = random_state

    def fit(self, X, *_):
        """Nothing is learned here; returns ``self``."""
        return self

    def _encode_categoricals(self, df):
        """Replace string labels by integer codes and cast to ``category`` (in place)."""
        for name, codes in self._CATEGORY_CODES.items():
            df[name] = df[name].map(codes).astype('category')
        df['Pclass'] = df['Pclass'].astype('category')

    def _impute_age(self, df):
        """Predict missing ``Age`` values from all other columns (in place)."""
        age_missing = df['Age'].isnull()

        #With no gaps there is nothing to predict; calling predict() on an
        #empty array would raise, so stop here.
        if not age_missing.any():
            return

        #Rows with a known Age train the model
        known = df.loc[~age_missing]
        X_known = known.drop('Age', axis=1).values
        y_known = known['Age'].values

        #Rows without an Age are the ones to predict
        X_unknown = df.loc[age_missing].drop('Age', axis=1).values

        #Default forest settings apart from 500 trees; seeded for repeatable output
        age_model = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, random_state=self.random_state
        )
        age_model.fit(X_known, y_known)

        df.loc[age_missing, 'Age'] = age_model.predict(X_unknown)

    def transform(self, X, *_):
        """Return a copy of ``X`` with the requested columns imputed."""
        df = X.copy()

        if self.columns is None:
            return df

        for column in self.columns:
            if column == 'Title':
                df['Title'] = df['Title'].fillna('Miss')
            elif column == 'Fare':
                #median() skips NaN, so only the Embarked filter is needed
                fare_median = df.loc[df['Embarked'] == 'S', 'Fare'].median()
                df['Fare'] = df['Fare'].fillna(float(fare_median))
            elif column == 'Deck':
                df['Deck'] = df['Deck'].fillna('U')
            elif column == 'Age':
                #Encoding always runs, even when no Age needs predicting
                self._encode_categoricals(df)
                self._impute_age(df)

        return df

In [224]:
#Keyword lists handed to the three custom transformers
#features to build from the raw columns
derived_cols = ['Name', 'Ticket', 'Family', 'Deck']
#raw columns dropped once the derived features exist
elmi_cols = ['Name', 'Ticket', 'Cabin', 'PassengerId']
#columns to impute, in this order: the 'Age' step integer-encodes Title and Deck,
#so their fills have to come first
imp_cols = ['Title', 'Fare', 'Deck', 'Age']

#Steps execute top to bottom: derive -> eliminate -> impute
pipeline = Pipeline(steps=[
    ('d_cols', derivedColumns(derived_cols)),
    ('e_cols', eliminateColumns(elmi_cols)),
    ('i_cols', imputeColumns(imp_cols)),
])


In [225]:
#Run every pipeline step on the loaded frame (fit and transform in a single call)
A=pipeline.fit_transform(train)
#Report (rows, columns) of the processed frame
print(A.shape)
#Display the processed frame as the cell output
A

(418, 11)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,TicketNumber,Family,Deck
0,3,1,34.500000,0,0,7.8292,1,2,330911,0,8
1,3,0,47.000000,1,0,7.0000,2,0,363272,0,8
2,2,1,62.000000,0,0,9.6875,1,2,240276,0,8
3,3,1,27.000000,0,0,8.6625,2,2,315154,0,8
4,3,0,22.000000,1,1,12.2875,2,0,3101298,1,8
5,3,1,14.000000,0,0,9.2250,2,2,7538,0,8
6,3,0,30.000000,0,0,7.6292,1,1,330972,0,8
7,2,1,26.000000,1,1,29.0000,2,2,248738,1,8
8,3,0,18.000000,0,0,7.2292,0,0,2657,0,8
9,3,1,21.000000,2,0,24.1500,2,2,48871,1,8


In [226]:
#Count the NaN values left in each column of the processed frame
print("Variables with missing values")
A.isnull().sum()

Variables with missing values


Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
Title           0
TicketNumber    0
Family          0
Deck            0
dtype: int64