In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
import numpy as np
import re

**First, let the code read dataframes**

In [5]:
df_train = pd.DataFrame(pd.read_csv('train.csv'))
df_test = pd.DataFrame(pd.read_csv('test.csv'))

In [6]:
# This code finds how many NaN value our dataframe has
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Printing mean value of the Age for Pclass and Sex
print(df_train.groupby(['Pclass']).mean()['Age'])
print('\n')
print(df_train.groupby(['Sex']).mean()['Age'])

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64


Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64


In [10]:
# Defining a function for filling NaN files with mean values.
def age_nan(df):
    for i in df.Sex.unique():
        for j in df.Pclass.unique():
            x = df.loc[((df.Sex == i) & (df.Pclass == j)), 'Age'].mean()
            df.loc[((df.Sex == i) & (df.Pclass == j)), 'Age'] = df.loc[
                ((df.Sex == i) & (df.Pclass == j)), 'Age'].fillna(x)

In [11]:
age_nan(df_train)
age_nan(df_test)

In [13]:
# Filling NaN files of Embarked with the common value "S"
df_train['Embarked'] = df_train['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

df_train['Cabin_NaN'] = df_train['Cabin'].isnull().astype(int)
df_test['Cabin_NaN'] = df_test['Cabin'].isnull().astype(int)

# Filling NaN files of Fare with -1 (Only NaN value is -1)

df_test.isnull().sum()

df_test.Fare = df_test.Fare.fillna(-1)

In [14]:
# Defining a fuction for logistic regression - cross validation function
def reg_cross_val(variables):
    X = df_train[variables]
    y = df_train['Survived']

    rkfold = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)
    result = []
    for treino, teste in rkfold.split(X):
        X_train, X_test = X.iloc[treino], X.iloc[teste]
        y_train, y_test = y.iloc[treino], y.iloc[teste]

        reg = LogisticRegression(max_iter=500)
        reg.fit(X_train, y_train)
        result.append(reg.score(X_test, y_test))

    return np.mean(result)

In [15]:
# Creating Sex_bin column for cross validation
def is_female(x):
    if x == 'female':
        return 1
    else:
        return 0
    
df_train['Sex_bin'] = df_train['Sex'].map(is_female)
df_test['Sex_bin'] = df_test['Sex'].map(is_female)

In [17]:
# Creating Embarked_S and C columns for cross validation
def embarked_s(x):
    if x == 'S':
        return 1
    else:
        return 0

df_train['Embarked_S'] = df_train['Embarked'].map(embarked_s)
df_test['Embarked_S'] = df_test['Embarked'].map(embarked_s)

def embarked_c(x):
    if x == 'C':
        return 1
    else:
        return 0
    
df_train['Embarked_C'] = df_train['Embarked'].map(embarked_c)
df_test['Embarked_C'] = df_test['Embarked'].map(embarked_c)

In [18]:
# Variable Defining for fist test
variables_before = ['Age', 'Pclass', 'Fare', 'SibSp', 'Parch']
print('Before the new features:', reg_cross_val(variables_before))

variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'SibSp', 'Parch', 'Embarked_S',\
             'Embarked_C', 'Cabin_NaN']

print('With the new features:', reg_cross_val(variables))

Before the new features: 0.7006696226129894
With the new features: 0.8031352345442635


In [19]:
# Creating Family column
df_train['Family'] = df_train.SibSp + df_train.Parch
df_test['Family'] = df_test.SibSp + df_test.Parch

variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'Embarked_S', \
             'Embarked_C', 'Cabin_NaN', 'Family']

reg_cross_val(variables)

0.8017879276464956

In [20]:
#Finding most common Ticket values
text_ticket = ''
for i in df_train.Ticket:
    text_ticket += i

listt = re.findall('[a-zA-Z]+', text_ticket)
print('Most repeated terms in Tickets: \n')
print(pd.Series(listt).value_counts().head(10))

# Creating features for most common Ticket values
df_train['CA'] = df_train['Ticket'].str.contains('CA|C.A.').astype(int)
df_train['SOTON'] = df_train['Ticket'].str.contains('SOTON|STON').astype(int)
df_train['PC'] = df_train['Ticket'].str.contains('PC').astype(int)
df_train['SC'] = df_train['Ticket'].str.contains('SC|S.C').astype(int)
df_train['C'] = df_train['Ticket'].str.contains('C').astype(int)


df_test['CA'] = df_test['Ticket'].str.contains('CA|C.A.').astype(int)
df_test['SOTON'] = df_test['Ticket'].str.contains('SOTON|STON').astype(int)
df_test['PC'] = df_test['Ticket'].str.contains('PC').astype(int)
df_test['SC'] = df_test['Ticket'].str.contains('SC|S.C').astype(int)
df_test['C'] = df_test['Ticket'].str.contains('C').astype(int)

Most repeated terms in Tickets: 

C        63
PC       60
A        58
O        37
STON     18
SOTON    18
S        15
CA       13
W        13
SC       13
dtype: int64


In [21]:
#Finding most common Name values
text_name = ''
for i in df_train.Name:
    text_name += i

listt = re.findall('[a-zA-Z]+', text_name)
print('Most repeated words in Name column: \n')
print(pd.Series(listt).value_counts().head(10))

# Creating features for most common Name values
df_train['Master'] = df_train['Name'].str.contains('Master').astype(int)
df_train['Mr'] = df_train['Name'].str.contains('Mr').astype(int)
df_train['Miss'] = df_train['Name'].str.contains('Miss').astype(int)
df_train['Mrs'] = df_train['Name'].str.contains('Mrs').astype(int)


df_test['Master'] = df_test['Name'].str.contains('Master').astype(int)
df_test['Mr'] = df_test['Name'].str.contains('Mr').astype(int)
df_test['Miss'] = df_test['Name'].str.contains('Miss').astype(int)
df_test['Mrs'] = df_test['Name'].str.contains('Mrs').astype(int)

Most repeated words in Name column: 

Mr         521
Miss       182
Mrs        129
William     49
Master      40
John        28
Henry       19
Thomas      17
Charles     17
George      16
dtype: int64


In [22]:
#Variable defining for second test
variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'Embarked_S', 'Embarked_C', \
             'CA', 'SOTON', 'PC', 'SC', 'C', 'Mr', 'Miss', 'Master', 'Mrs', 'Family']

print(reg_cross_val(variables))

0.825021917670177


In [24]:
#Prediction of survival for people in the test.csv dataset
variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'Family', 'Embarked_S', 'Embarked_C', 'Cabin_NaN', \
             'CA', 'SOTON', 'PC', 'SC', 'Master', 'Mr', 'Miss', 'C', 'Mrs']

X = df_train[variables]
y = df_train['Survived']

reg = LogisticRegression(max_iter=500)
reg.fit(X, y)
resp = reg.predict(df_test[variables])
resp

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

**Thank you for your attention**