In [1]:
import pandas as pd
import numpy as np

In [46]:
# Announce the load, then read each CSV and report its shape right after reading.
print('Reading files...')

train_raw_df = pd.read_csv('train.csv')
train_shape = train_raw_df.shape
print('Shape of train.csv:', train_shape)

test_raw_df = pd.read_csv('test.csv')
test_shape = test_raw_df.shape
print('Shape of test.csv:', test_shape)

# Only the 'Survived' column of gender_submission.csv is kept (as y_test).
submission_df = pd.read_csv('gender_submission.csv')
y_test = submission_df['Survived']
print('Shape of gender_submission.csv:', y_test.shape)

Reading files...
Shape of train.csv  (891, 12)
Shape of test.csv (418, 11)
Shape of gender_submission.csv:  (418,)


Combine the train and test data to make feature engineering and transformation easier.

In [49]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Row labels are kept as they are (no
# ignore_index), so the labels restart at 0 where the test.csv rows begin.
data = pd.concat([train_raw_df, test_raw_df], sort=False)

In [54]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()
print('\nHead of data:\n', data.head())
print('\n\nTail of data:\n', data.tail())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB
None

Head of data:
    PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   
3            4       1.0       1   
4            5       0.0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cuming

In [66]:
# Summarise missing values per column: absolute count ('Total') and the
# fraction of rows affected ('Percent'), each sorted from most to least missing.
null_mask = data.isnull()
null_total = null_mask.sum().sort_values(ascending=False)
null_percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
data_null = pd.concat(
    [null_total, null_percent],
    axis=1,
    keys=['Total', 'Percent'],
)
data_null.head(6)

Unnamed: 0,Total,Percent
Cabin,1014,0.774637
Survived,418,0.319328
Age,263,0.200917
Embarked,2,0.001528
Fare,1,0.000764
Ticket,0,0.0


In [6]:
import re
# Extract title from name (e.g.'Mr.' from 'Kelly, Mr. James')
def get_title_from_name(name):
    """Return the honorific found in a passenger name, or NaN.

    The title is the run of ASCII letters that follows ', ' and is itself
    followed by '. ' (e.g. 'Mr.' in 'Kelly, Mr. James'). Only the titles
    listed below are accepted; anything else is treated as missing.

    Parameters
    ----------
    name : str
        Passenger name. Non-string values (such as a NaN from a missing
        name) are treated as having no title instead of raising.

    Returns
    -------
    str or float
        The title including its trailing period, or ``np.nan`` when no
        accepted title is present.
    """
    known_titles = ['Mr.', 'Miss.', 'Master.', 'Ms.', 'Dr.', 'Mrs.', 'Col.', 'Dona.', 'Rev.']

    # re.search raises TypeError on non-strings; report those as missing.
    if not isinstance(name, str):
        return np.nan

    # Capture the title together with its period so no slicing/stripping is needed.
    match = re.search(r', ([a-zA-Z]+\.) ', name)
    if match and match.group(1) in known_titles:
        return match.group(1)

    # np.nan (lowercase) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    return np.nan

def feat_eng_title(df):
    """Add a 'Title' column derived from the 'Name' column.

    Each name is passed through ``get_title_from_name``. If a 'Title' column
    already exists it is replaced, so the function can be re-applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column.

    Returns
    -------
    pandas.DataFrame
        Frame with 'Title' inserted at column position 2 (or at the end when
        the frame has fewer than 2 other columns). When 'Title' was absent,
        ``df`` itself is modified in place by ``DataFrame.insert`` and
        returned; when it was present, a new frame with a reset index is
        returned.
    """
    column_name = 'Title'
    # Keep plain positional values: if the index is reset below, a Series
    # carrying the old index would be re-aligned by label on insert and could
    # come out as NaN (or raise on duplicate labels).
    column_value = df['Name'].apply(get_title_from_name).values

    if column_name in df.columns:
        df = df.drop(column_name, axis=1).reset_index(drop=True)

    # DataFrame.insert rejects a position beyond the current width.
    insert_at = min(2, len(df.columns))
    df.insert(loc=insert_at, column=column_name, value=column_value, allow_duplicates=False)
    return df
    
# data['Title'] = data['Name'].apply(lambda x: get_title_from_name(x))

# data = feat_eng_title(data)
# data.head()

In [7]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` contained in ``str(big_string)``.

    Candidates are tried in the order given. ``big_string`` is converted with
    ``str`` before searching, so non-string inputs are searched by their text
    form. When no candidate occurs, ``np.nan`` is returned.
    """
    hits = (candidate for candidate in substrings if candidate in str(big_string))
    return next(hits, np.nan)

def feat_eng_deck(df):
    """Add a 'Deck' column derived from the 'Cabin' column.

    The deck is the first of the letters A-G (tried in that order) that occurs
    anywhere in the text form of the cabin value, as decided by
    ``substrings_in_string``; rows with no such letter get NaN. If a 'Deck'
    column already exists it is replaced, so the function can be re-applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Cabin' column.

    Returns
    -------
    pandas.DataFrame
        Frame with 'Deck' inserted at column position 11 (or at the end when
        the frame has fewer than 11 other columns). When 'Deck' was absent,
        ``df`` itself is modified in place by ``DataFrame.insert`` and
        returned; when it was present, a new frame with a reset index is
        returned.
    """
    deck_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

    column_name = 'Deck'
    # Keep plain positional values: if the index is reset below, a Series
    # carrying the old index would be re-aligned by label on insert and could
    # come out as NaN (or raise on duplicate labels).
    column_value = df['Cabin'].apply(lambda x: substrings_in_string(x, deck_list)).values

    if column_name in df.columns:
        df = df.drop(column_name, axis=1).reset_index(drop=True)

    # DataFrame.insert rejects a position beyond the current width.
    insert_at = min(11, len(df.columns))
    df.insert(loc=insert_at, column=column_name, value=column_value, allow_duplicates=False)
    return df
    
# data['Cabin'][data['Cabin'].notnull()]
# deck_list = ['A','B','C','D','E','F','G']
# data['Deck'] = data['Cabin'].apply(lambda x: substrings_in_string(x, deck_list))

# data = feat_eng_deck(data)
# data[['Deck', 'Cabin']][data['Cabin'].notnull()]

In [8]:
#Creating new family_size column
def feat_eng_family_size(df):
    """Add a 'Family_Size' column equal to 'SibSp' + 'Parch'.

    If a 'Family_Size' column already exists it is replaced, so the function
    can be re-applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'SibSp' and 'Parch' columns.

    Returns
    -------
    pandas.DataFrame
        Frame with 'Family_Size' inserted at column position 8 (or at the end
        when the frame has fewer than 8 other columns). When 'Family_Size'
        was absent, ``df`` itself is modified in place by
        ``DataFrame.insert`` and returned; when it was present, a new frame
        with a reset index is returned.
    """
    column_name = 'Family_Size'
    # Keep plain positional values: if the index is reset below, a Series
    # carrying the old index would be re-aligned by label on insert and could
    # come out as NaN (or raise on duplicate labels).
    column_value = (df['SibSp'] + df['Parch']).values

    if column_name in df.columns:
        df = df.drop(column_name, axis=1).reset_index(drop=True)

    # DataFrame.insert rejects a position beyond the current width.
    insert_at = min(8, len(df.columns))
    df.insert(loc=insert_at, column=column_name, value=column_value, allow_duplicates=False)
    return df

# data['Family_Size'] = data['SibSp'] + data['Parch']

# data = feat_eng_family_size(data)
# data

In [9]:
from sklearn import preprocessing

def convert_str_to_int(df):
    """Replace the text columns of ``df`` with integer label codes, in place.

    The columns 'Title', 'Name', 'Sex', 'Ticket', 'Cabin', 'Deck' and
    'Embarked' are each cast to ``str`` (so missing values become a label of
    their own) and encoded with a freshly fitted ``LabelEncoder``.

    Because a new encoder is fitted on every call, the integer assigned to a
    given string depends on the values present in that particular frame;
    codes produced by separate calls are not guaranteed to agree.

    Returns the same ``df`` object that was passed in.
    """
    for column in ('Title', 'Name', 'Sex', 'Ticket', 'Cabin', 'Deck', 'Embarked'):
        encoder = preprocessing.LabelEncoder()
        as_text = df[column].astype(str)
        df[column] = encoder.fit_transform(as_text)
    return df


In [30]:
def feat_eng_all(df):
    """Run every feature-engineering step on ``df`` and return the result.

    Steps, in order: add 'Title', add 'Deck', add 'Family_Size', label-encode
    the text columns, then make 'Age' and 'Fare' free of NaN via
    ``np.nan_to_num``. 'Fare' is additionally rounded to whole numbers and
    stored as int64.
    """
    steps = (feat_eng_title, feat_eng_deck, feat_eng_family_size, convert_str_to_int)
    for step in steps:
        df = step(df)

    age_filled = np.nan_to_num(df['Age'])
    fare_filled = np.nan_to_num(df['Fare'])

    df['Age'] = age_filled
    df['Fare'] = np.round(fare_filled, decimals=0).astype('int64')
    return df

# Build the training matrix from the training rows only. `data` is the
# train+test concatenation, whose test.csv rows have no 'Survived' value, so
# using it here would feed test rows and NaN labels into fit().
X_train = feat_eng_all(train_raw_df.drop('Survived', axis=1))
y_train = train_raw_df['Survived']

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being formatted into a string (which printed "None").
print('X_train info:')
X_train.info()
print('\nX_train head:\n{}'.format(X_train.head(50)))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Title          891 non-null int64
Name           891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Family_Size    891 non-null int64
Ticket         891 non-null int64
Fare           891 non-null int64
Cabin          891 non-null int64
Deck           891 non-null int64
Embarked       891 non-null int64
dtypes: float64(1), int64(13)
memory usage: 97.5 KB
X_train info:
None

X_train head:
    PassengerId  Pclass  Title  Name  Sex   Age  SibSp  Parch  Family_Size  \
0             1       3      4   108    1  22.0      1      0            1   
1             2       1      5   190    0  38.0      1      0            1   
2             3       3      3   353    0  26.0      0      0            0   
3             4    

In [38]:
# NOTE(review): feat_eng_all label-encodes this frame independently of the
# training frame, so the integer code for a given string may differ between
# X_train and X_test - confirm whether a shared encoding is needed.
X_test = feat_eng_all(pd.read_csv('test.csv'))
y_test = pd.read_csv('gender_submission.csv')['Survived']

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being formatted into a string (which printed "None").
print('X_test info:')
X_test.info()
print('\nX_test head:\n{}'.format(X_test.head(50)))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Title          418 non-null int64
Name           418 non-null int64
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Family_Size    418 non-null int64
Ticket         418 non-null int64
Fare           418 non-null int64
Cabin          418 non-null int64
Deck           418 non-null int64
Embarked       418 non-null int64
dtypes: float64(1), int64(13)
memory usage: 45.8 KB
X_test info:
None

X_test head:
    PassengerId  Pclass  Title  Name  Sex   Age  SibSp  Parch  Family_Size  \
0           892       3      5   206    1  34.5      0      0            0   
1           893       3      6   403    0  47.0      1      0            1   
2           894       2      5   269    1  62.0      0      0            0   
3           895      

In [39]:
from sklearn.neighbors import KNeighborsClassifier 
# Fit a 2-nearest-neighbour classifier, then score it on each split in turn.
# NOTE(review): y_test is read from gender_submission.csv; if that file is a
# sample submission rather than ground truth, the "test" figure measures
# agreement with that file, not accuracy - confirm.
clf = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

knn_train_score = clf.score(X_train, y_train)
print('Train set accuracy: {:.2f}'.format(knn_train_score))

knn_test_score = clf.score(X_test, y_test)
print('Test set accuracy: {:.2f}'.format(knn_test_score))

Train set accuracy: 0.81
Test set accuracy: 0.55


In [43]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with a fixed seed, then score it on each split in turn.
SrchRFC = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    min_samples_split=4,
    random_state=20,
    n_jobs=-1,
).fit(X_train, y_train)

rfc_train_score = SrchRFC.score(X_train, y_train)
print('Train set accuracy: {:.2f}'.format(rfc_train_score))

rfc_test_score = SrchRFC.score(X_test, y_test)
print('Test set accuracy: {:.2f}'.format(rfc_test_score))

Train set accuracy: 0.87
Test set accuracy: 0.86
