In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load training and test data

In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# mark training/test data
train_data["is_train"]=1 
test_data["is_train"]=0

# merge training/test data to all_data
all_data=train_data.append(test_data, ignore_index=True)

all_data.head() # Pclass: 1 = 1st, 2 = 2nd, 3 = 3rd

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [3]:
all_data.info()
all_data.describe() # some missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
 12  is_train     1309 non-null   int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 133.1+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,is_train
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0,1309.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479,0.680672
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668,0.466394
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958,0.0
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275,1.0
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292,1.0


# Data wrangling

## Titles

Extract titles of passengers from name (by grabbing the word before a [](http://)dot in name)

In [4]:
all_data["Title"] = all_data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [5]:
pd.crosstab(all_data['Title'], all_data['Sex'], margins=True)

Sex,female,male,All
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capt,0,1,1
Col,0,4,4
Countess,1,0,1
Don,0,1,1
Dona,1,0,1
Dr,1,7,8
Jonkheer,0,1,1
Lady,1,0,1
Major,0,2,2
Master,0,61,61


Simplify/group titles

In [6]:
group_titles = {
    "Capt": "Officer",
    "Col": "Officer",
    "Countess": "Noble",
    "Don": "Noble",
    "Dona": "Noble",
    "Dr": "Professional",
    "Jonkheer": "Noble",
    "Lady": "Noble",
    "Major": "Officer",
    "Master": "Master", # younglings?
    "Miss": "Miss",
    "Mlle": "Mrs", # Sagesser, Mlle. Emma	female	24.0, Mayne, Mlle. Berthe Antonine ("Mrs de Villiers") female	24.0	
    "Mme": "Mrs", # Aubart, Mme. Leontine Pauline	female	24.0
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Ms": "Mrs",
    "Rev": "Professional",
    "Sir": "Mr"
}
all_data.Title = all_data.Title.map(group_titles)
pd.crosstab(all_data['Title'], all_data['Sex'], margins=True)

Sex,female,male,All
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Master,0,61,61
Miss,260,0,260
Mr,0,758,758
Mrs,202,0,202
Noble,3,2,5
Officer,0,7,7
Professional,1,15,16
All,466,843,1309


## Sex

Recode male/female to 0/1

In [7]:
all_data.Sex = all_data.Sex.map( {"male":0, "female":1})

## Cabin

Keep first letter in cabin name, fill missing values with U (unknown). Idea is that different Cabins corresponds to different locations in the ship and have different chance of survival.

In [8]:
#all_data.info()
pd.crosstab(all_data['Cabin'], all_data['Survived'], margins=True) # many in Cabin B-F seem to have survived

Survived,0.0,1.0,All
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A10,1,0,1
A14,1,0,1
A16,0,1,1
A19,1,0,1
A20,0,1,1
...,...,...,...
F38,1,0,1
F4,0,2,2
G6,2,2,4
T,1,0,1


In [9]:
all_data.Cabin = all_data.Cabin.fillna('U').map(lambda x:x[0])

## Embarked

Fill missing value for column embarked with most common value (mode)

In [10]:
from statistics import mode
all_data.Embarked = all_data.Embarked.fillna(mode(all_data.Embarked))

## Fare

Fill in missing values for Fare, replace values missing values with median for respective pclass and sex.

In [11]:
all_data.loc[all_data.Fare == 0]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train,Title
179,180,0.0,3,"Leonard, Mr. Lionel",0,36.0,0,0,LINE,0.0,U,S,1,Mr
263,264,0.0,1,"Harrison, Mr. William",0,40.0,0,0,112059,0.0,B,S,1,Mr
271,272,1.0,3,"Tornquist, Mr. William Henry",0,25.0,0,0,LINE,0.0,U,S,1,Mr
277,278,0.0,2,"Parkes, Mr. Francis ""Frank""",0,,0,0,239853,0.0,U,S,1,Mr
302,303,0.0,3,"Johnson, Mr. William Cahoone Jr",0,19.0,0,0,LINE,0.0,U,S,1,Mr
413,414,0.0,2,"Cunningham, Mr. Alfred Fleming",0,,0,0,239853,0.0,U,S,1,Mr
466,467,0.0,2,"Campbell, Mr. William",0,,0,0,239853,0.0,U,S,1,Mr
481,482,0.0,2,"Frost, Mr. Anthony Wood ""Archie""",0,,0,0,239854,0.0,U,S,1,Mr
597,598,0.0,3,"Johnson, Mr. Alfred",0,49.0,0,0,LINE,0.0,U,S,1,Mr
633,634,0.0,1,"Parr, Mr. William Henry Marsh",0,,0,0,112052,0.0,U,S,1,Mr


In [12]:
grouped = all_data.groupby(["Pclass", "Sex"])

def fFare(x):
    c = x.median()
    #res = x.replace(0, c) # some tickets have Fare=0 (mistake but could be free tickets? don't change), 
    #return res.fillna(c)
    return x.fillna(c)

all_data.Fare = grouped.Fare.apply(fFare)

## Age

Fill in missing values for age. By checking the mean age for the different titles we see different age distributions.

In [13]:
grouped2 = all_data.groupby(["Pclass", "Title"])
#grouped2.Age.std()
grouped2.Age.mean()

Pclass  Title       
1       Master           6.984000
        Miss            30.338983
        Mr              41.507519
        Mrs             42.385714
        Noble           39.600000
        Officer         54.714286
        Professional    45.600000
2       Master           2.757273
        Miss            20.717083
        Mr              32.346715
        Mrs             33.418182
        Professional    40.700000
3       Master           6.090000
        Miss            17.360874
        Mr              28.318910
        Mrs             32.326531
Name: Age, dtype: float64

 Fill in age values from by sampling a normal distribution with the same mean and sd for respective title and Pclass.

In [14]:
all_data.Age = grouped2.Age.apply(lambda x: x.fillna(max(0, np.random.normal(x.mean(), x.std()))))

In [15]:
all_data.Age.tail()

1304    12.673403
1305    39.000000
1306    38.500000
1307    12.673403
1308     4.932086
Name: Age, dtype: float64

## Ticket

 Ticket name (first letters) could be related to cabin class, and some Ticket names also to survival (e.g. tickets beginning with A have low chance of surviving). But this might be due to the correlation to cabin class or other. Since I don't know what the ticket names indicate (place of purchase?), skip feature. 


In [16]:
'''
grouped3 = all_data.groupby(["Ticket"])
def fTicket(x):
    print(x)
    if(len(x)<20):
        x.loc[:] = "OTHER"
        print(x)
        return x
    else:
        return x
    
all_data.Ticket = grouped3.Ticket.apply(fTicket)
pd.crosstab(all_data.Ticket, all_data['Pclass'], margins=True)
pd.crosstab(all_data.Ticket, all_data['Survived'], margins=True)
'''

'\ngrouped3 = all_data.groupby(["Ticket"])\ndef fTicket(x):\n    print(x)\n    if(len(x)<20):\n        x.loc[:] = "OTHER"\n        print(x)\n        return x\n    else:\n        return x\n    \nall_data.Ticket = grouped3.Ticket.apply(fTicket)\npd.crosstab(all_data.Ticket, all_data[\'Pclass\'], margins=True)\npd.crosstab(all_data.Ticket, all_data[\'Survived\'], margins=True)\n'

In [17]:
all_data.drop(["Ticket"], axis=1, inplace=True)

# Feature engineering

## Family onboard
Create variable for family size = siblings + parents. Then create variable indicating more than 4 family members on board (large_fam), and variable indicating no family on board (solo). Idea is that solo might have less help to survive. But too large a family might not fit on the life boats. Or something opposite where someone solo can easier find a free spot on a life boat, or many family members on board might help and increase survival. Could be a factor, we include it since it can have some signal for the final models for chance of survival.

In [18]:
all_data["fam_size"] = all_data.SibSp + all_data.Parch
all_data["solo"] = (all_data["fam_size"] == 0)*1
all_data["large_fam"] = (all_data["fam_size"] >= 4)*1

all_data.drop(["SibSp", "Parch"], axis=1, inplace=True)

## One-hot encoding of categorical variables

In [19]:
dummc = ["Pclass", "Cabin", "Embarked", "Title"]
#dummc = ["Title"]
all_data[dummc]

Unnamed: 0,Pclass,Cabin,Embarked,Title
0,3,U,S,Mr
1,1,C,C,Mrs
2,3,U,S,Miss
3,1,C,S,Mrs
4,3,U,S,Mr
...,...,...,...,...
1304,3,U,S,Mr
1305,1,C,C,Noble
1306,3,U,S,Mr
1307,3,U,S,Mr


In [20]:

dummy = pd.get_dummies(all_data[dummc], columns=dummc, drop_first=False)
all_data = pd.concat([all_data, dummy], axis=1) # add new columns
all_data.drop(dummc, axis=1, inplace=True) # remove old columns


In [21]:
all_data.drop(["Name"], axis=1, inplace=True) # Drop name column, not needed

In [22]:
all_data.info() # Now we have no missing data, except for 'Survived' which is what we want to predict in the test data.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PassengerId         1309 non-null   int64  
 1   Survived            891 non-null    float64
 2   Sex                 1309 non-null   int64  
 3   Age                 1309 non-null   float64
 4   Fare                1309 non-null   float64
 5   is_train            1309 non-null   int64  
 6   fam_size            1309 non-null   int64  
 7   solo                1309 non-null   int64  
 8   large_fam           1309 non-null   int64  
 9   Pclass_1            1309 non-null   uint8  
 10  Pclass_2            1309 non-null   uint8  
 11  Pclass_3            1309 non-null   uint8  
 12  Cabin_A             1309 non-null   uint8  
 13  Cabin_B             1309 non-null   uint8  
 14  Cabin_C             1309 non-null   uint8  
 15  Cabin_D             1309 non-null   uint8  
 16  Cabin_

# Create models

## Extract training/test data

In [23]:
train_rows = all_data["is_train"] == 1

all_data_X = all_data.drop(["is_train", "PassengerId", "Survived"], axis=1)
all_data_Y = all_data["Survived"]

train_X = all_data_X[train_rows ]
test_X = all_data_X[ train_rows == 0 ]
test_pid = all_data.PassengerId[ train_rows == 0 ]

train_Y = all_data_Y[train_rows]

In [24]:
train_X.head()

Unnamed: 0,Sex,Age,Fare,fam_size,solo,large_fam,Pclass_1,Pclass_2,Pclass_3,Cabin_A,...,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Noble,Title_Officer,Title_Professional
0,0,22.0,7.25,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
1,1,38.0,71.2833,1,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,1,26.0,7.925,0,1,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
3,1,35.0,53.1,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,35.0,8.05,0,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [25]:
test_X.head()

Unnamed: 0,Sex,Age,Fare,fam_size,solo,large_fam,Pclass_1,Pclass_2,Pclass_3,Cabin_A,...,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Noble,Title_Officer,Title_Professional
891,0,34.5,7.8292,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
892,1,47.0,7.0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
893,0,62.0,9.6875,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
894,0,27.0,8.6625,0,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
895,1,22.0,12.2875,2,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0


## Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier

rfcl = RandomForestClassifier(n_estimators=200, max_features=3, max_depth=2)
rfcl.fit(train_X,train_Y)

def f_rfcl(x): return rfcl.predict(x).astype(int)

print("Accuracy:", (train_Y == f_rfcl(train_X)).mean())


Accuracy: 0.8035914702581369


## XGBoost

In [27]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


xgb_model = XGBClassifier(max_depth=1, verbosity=1, n_estimators=20)
xgb_model.fit(train_X,train_Y)

def f_xgb(x): return xgb_model.predict(x).astype(int)

print("Accuracy:", (train_Y == f_xgb(train_X)).mean())

Accuracy: 0.8226711560044894


## Logistic regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

## cols_X = list(train_X)
## train_X2 = train_X[['Sex', 'Age', 'Fare', 'fam_size', 'solo', 'large_fam', 'Pclass_2', 'Pclass_3', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U', 'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Noble', 'Title_Officer', 'Title_Professional']]
train_X2 = train_X


logreg=LogisticRegression(max_iter=1000, penalty='l2', C=0.02)

logreg.fit(train_X2, train_Y)

def f_logreg(x): return logreg.predict(x).astype(int)

print("Logistic regression coefficients:", logreg.coef_)

print("Accuracy:", (train_Y == f_logreg(train_X2)).mean())


Logistic regression coefficients: [[ 0.65505332 -0.01779128  0.00847377 -0.20156573 -0.18237865 -0.18148248
   0.22411051  0.14844985 -0.37266776  0.02615856  0.04858053 -0.02073633
   0.10083543  0.12822566  0.02894641 -0.00880327 -0.00607936 -0.29723502
   0.10555396  0.00277864 -0.10844001  0.15043076  0.25478281 -0.72879155
   0.38153986 -0.00551882 -0.0020222  -0.05052827]]
Accuracy: 0.8271604938271605


## Support vector machine

In [29]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

clf = make_pipeline(StandardScaler(), SVC(gamma='scale', kernel='rbf', C=0.07)) # C is regularization parameter
clf.fit(train_X, train_Y)

def f_svn(x): return(clf.predict(x).astype(int))

print("Accuracy:", (f_svn(train_X)==train_Y).mean())
##print(clf.get_params())

Accuracy: 0.8237934904601572


In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_X, train_Y)

def f_knn(x): return( knn.predict(x).astype(int))

print("Accuracy:", (f_knn(train_X)==train_Y).mean())

Accuracy: 0.8462401795735129


## Combine model predictions

In [31]:
def f_multi(x):
    ans = np.around(
        (f_rfcl(x) + f_xgb(x) + f_logreg(x) +
         f_svn(x) + f_knn(x))/5.0).astype(int)
    return ans

print("Accuracy (multi):", (f_multi(train_X)==train_Y).mean())

test_predict = f_multi(test_X) # get predictions on test set

print("Survival rate in test set:", test_predict.mean())

# create data frame with passenger ids and prediction
output = pd.DataFrame({'PassengerId': test_pid, 'Survived': test_predict})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Accuracy (multi): 0.8406285072951739
Survival rate in test set: 0.3708133971291866
Your submission was successfully saved!


In [32]:
pd.read_csv('my_submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
