In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any pandas / scikit-learn warnings raised by
# later cells — confirm that suppressing them globally is intended.
warnings.filterwarnings('ignore')

In [3]:
# Load the train and test CSVs from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Stack train on top of test and renumber the rows 0..n-1
combined = pd.concat(objs=[train, test], ignore_index=True)
combined.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
# Count nulls per column once, then show only the columns that have any
null_counts = combined.isnull().sum()
null_counts[null_counts != 0]

Survived     418
Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [6]:
# List of available cabins: every distinct non-null value of the Cabin column.
# Derived from the data rather than pasted in as a literal, so it cannot drift
# out of sync with the CSV files.
cabins = combined["Cabin"].dropna().unique().tolist()

In [7]:
# (rows, columns) of the train, test and combined sets
tuple(frame.shape for frame in (train, test, combined))

((891, 12), (418, 11), (1309, 12))

In [8]:
# define a function 
def cabin_labels(x):
    """Return "Cabin Avbl" when ``x`` is listed in ``cabins``, otherwise "Missing"."""
    return "Cabin Avbl" if x in cabins else "Missing"

In [9]:
# Label each passenger's Cabin value as available / missing
combined["Cabin_cat"] = combined["Cabin"].map(cabin_labels)

# Name: extract passenger titles

In [10]:
# The title is the text between ", " and the first ". " of each name
titles = [full_name.split(", ")[1].split(". ")[0] for full_name in combined["Name"]]

In [11]:
# `titles` is in row order; give the Series the frame's own index so the
# assignment stays positional instead of being label-aligned against a
# default 0..n-1 index.
combined["Titles"] = pd.Series(titles, index=combined.index)

In [12]:
# Distinct titles, in order of first appearance
pd.unique(combined["Titles"])

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [13]:
# Titles that are pooled into a single "Others" bucket
title_ignore = [
    'Don', 'Rev', 'Dr', 'Mme', 'Major', 'Lady', 'Sir', 'Mlle',
    'Col', 'Capt', 'the Countess', 'Jonkheer', 'Dona',
]


def notitle(x):
    """Return "Others" for a title listed in ``title_ignore``; return any other value unchanged."""
    return "Others" if x in title_ignore else x

In [14]:
# Collapse the titles listed in title_ignore into "Others"
combined["Titles"] = combined["Titles"].map(notitle)

In [15]:
# Remove the identifier and the raw text columns from the frame
combined.drop(columns=["PassengerId", "Name", "Cabin"], inplace=True)

In [16]:
# Remove the Ticket column from the frame
combined.drop(columns="Ticket", inplace=True)

# Family

In [17]:
# Family size = siblings/spouses + parents/children + the passenger
combined["Family"] = combined["SibSp"].add(combined["Parch"]).add(1)

In [18]:
# Distinct family sizes, in order of first appearance
pd.unique(combined["Family"])

array([ 2,  1,  5,  3,  7,  6,  4,  8, 11], dtype=int64)

In [19]:
def parivar(x):
    """Bucket a family size: 1 -> "Solo", 2 -> "Couple", any other value <= 4 -> "Small", else "Large"."""
    if x == 1:
        return "Solo"
    if x == 2:
        return "Couple"
    return "Small" if x <= 4 else "Large"

In [20]:
# Bucket every family size into Solo / Couple / Small / Large
combined["Family_cat"] = combined["Family"].map(parivar)

# Missing values: impute Age, Embarked and Fare

In [21]:
# Titles carried by at least one passenger whose Age is missing
missing_titles = combined["Titles"][combined["Age"].isnull()].unique()

In [22]:
# Fill each missing Age with the median Age of the passengers who share the same
# title. The fill must be restricted to rows of that title: masking on
# "Age is null" alone would give every gap the first title's median.
title_median_age = combined.groupby("Titles")["Age"].transform("median")
combined["Age"] = combined["Age"].fillna(title_median_age)
# A title whose ages are all missing has no median; fall back to the overall median.
combined["Age"] = combined["Age"].fillna(combined["Age"].median())

In [23]:
# Fill missing Embarked values with the most frequent port
combined["Embarked"] = combined["Embarked"].fillna(combined["Embarked"].mode()[0])

In [24]:
# Fill missing Fare values with the median fare
combined["Fare"] = combined["Fare"].fillna(combined["Fare"].median())

# Split the data into train and test again:

In [25]:
# The first len(train) rows of `combined` are the training rows, the rest are test rows
n_train = train.shape[0]
new_train = combined.loc[:n_train - 1]
new_test = combined.loc[n_train:]
new_train.shape, new_test.shape

((891, 12), (418, 12))

In [26]:
# Features and target from the labelled rows. Survived became float when train was
# concatenated with the unlabelled test rows, so cast it back to int.
X = new_train.drop("Survived", axis=1)
y = new_train["Survived"].astype(int)
# Rebind rather than drop in place: new_test is a slice of `combined`, and an
# in-place drop on a slice can raise pandas' SettingWithCopyWarning.
new_test = new_test.drop("Survived", axis=1)

# MODEL BUILDING :

In [40]:
# Baseline: predict 0 (did not survive) for every test passenger
submission = test[["PassengerId"]].assign(Survived=0)
submission.to_csv("basemodel.csv", index=False)  # 62%

In [41]:
# dummify the data
newX = pd.get_dummies(X, drop_first=True)
# Encode the test set with every level kept, then align it to the training
# columns. This guarantees the model sees the same features in the same order
# even when a category is absent from one of the two sets; levels the training
# encoding does not have are dropped, and missing ones are filled with 0.
newtest = pd.get_dummies(new_test).reindex(columns=newX.columns, fill_value=0)

# Apply Logistic Regression :

In [42]:
from sklearn.linear_model import LogisticRegression

# Fit on the encoded training features, then predict on the encoded test features
lg = LogisticRegression()
lg.fit(newX, y)
pred = lg.predict(newtest)

In [59]:
# Pair each test PassengerId with its predicted label and write the submission file
submission = test[["PassengerId"]].assign(Survived=pred)
submission.to_csv("Logistic_titanic.csv", index=False)  # 76%