In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the training CSV into a DataFrame and preview its first five rows.
training_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
training_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Report the training frame's dimensions: row count first, then column count.
row_count, column_count = training_data.shape
print('The number of samples into the training data is {}.'.format(row_count))
print('There are {} categories.'.format(column_count))

The number of samples into the training data is 891.
There are 12 categories.


In [4]:
# Count the missing values in each column of the raw training data.
training_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

There is missing data for Age, Cabin, and Embarked.

In [5]:
# Report the share of missing values for each column that has gaps.
for col in ('Age', 'Cabin', 'Embarked'):
    # The mean of the boolean null mask is the fraction of rows that are missing.
    missing_pct = training_data[col].isnull().mean() * 100
    print('Missing "%s" Percent is %.2f%%' % (col, missing_pct))

Missing "Age" Percent is 19.87%
Missing "Cabin" Percent is 77.10%
Missing "Embarked" Percent is 0.22%


Although there is no established cutoff in the literature for an acceptable percentage of missing data, Schafer (1999) asserted that a missing rate of 5% or less is inconsequential. Cabin (77% missing) is therefore dropped, Age (20% missing) is imputed with the median age, and the two missing Embarked values (0.22%) can simply be filled.

In [6]:
# Drop the Cabin column; errors="ignore" keeps this cell re-runnable after
# the column has already been removed.
training_data = training_data.drop(columns="Cabin", errors="ignore")

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which recent pandas
# versions warn about and which does not update the frame under Copy-on-Write.
# Age is imputed with the median age.
training_data["Age"] = training_data["Age"].fillna(
    training_data["Age"].median(skipna=True)
)
# Embarked is imputed with its most frequent value.
training_data["Embarked"] = training_data["Embarked"].fillna(
    training_data["Embarked"].value_counts().idxmax()
)

Cabin is dropped from the dataset, the missing Age values are filled with the median age, and the 2 missing Embarked values are filled with the most common boarding location.

In [7]:
# Re-count missing values per column after cleaning the training data.
training_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

There are no more missing values in the training data.

In [8]:
# Flag passengers with no siblings/spouses (SibSp) and no parents/children
# (Parch) aboard as Alone = 1; anyone with at least one relative gets 0.
relatives_aboard = training_data["SibSp"] + training_data["Parch"]
training_data['Alone'] = np.where(relatives_aboard > 0, 0, 1)

# The two source columns are removed once they are folded into Alone.
training_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [9]:
# Read the test CSV into a DataFrame.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")

In [10]:
# Count the missing values in each column of the raw test data.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [11]:
# Apply the training-data cleaning steps to the test data.
test_data = test_data.drop(columns="Cabin")

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which recent pandas
# versions warn about and which does not update the frame under Copy-on-Write.
# The fill values are taken from the training data, not from the test data.
test_data["Age"] = test_data["Age"].fillna(
    training_data["Age"].median(skipna=True)
)
test_data["Embarked"] = test_data["Embarked"].fillna(
    training_data["Embarked"].value_counts().idxmax()
)

# Alone = 1 when the passenger has no siblings/spouses (SibSp) and no
# parents/children (Parch) aboard; the source columns are then removed.
test_data["Alone"] = np.where((test_data["SibSp"] + test_data["Parch"]) > 0, 0, 1)
test_data = test_data.drop(columns=["SibSp", "Parch"])

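The same adjustments are applied to the test data, using the training data's median Age and most common Embarked value as fill values.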

In [12]:
# Re-count missing values per column after the first cleaning pass on the test data.
test_data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
Ticket         0
Fare           1
Embarked       0
Alone          0
dtype: int64

Fare has 1 missing value in the test data; because it is numerical, we can fill it with the median.

In [13]:
# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update test_data.
# The median is taken from the training data, matching how Age and Embarked
# are imputed for the test data.
test_data["Fare"] = test_data["Fare"].fillna(
    training_data["Fare"].median(skipna=True)
)

In [14]:
# Confirm that no missing values remain in the test data.
test_data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
Ticket         0
Fare           0
Embarked       0
Alone          0
dtype: int64

In [15]:
# One-hot encode the categorical columns, then remove Sex_female (Sex_male is
# kept as the single sex indicator) along with the PassengerId, Name and
# Ticket columns.
training = pd.get_dummies(
    training_data, columns=["Pclass", "Embarked", "Sex"]
).drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

# final_train is another name for the same frame, not a copy.
final_train = training
final_train.head()

Unnamed: 0,Survived,Age,Fare,Alone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male
0,0,22.0,7.25,0,0,0,1,0,0,1,1
1,1,38.0,71.2833,0,1,0,0,1,0,0,0
2,1,26.0,7.925,1,0,0,1,0,0,1,0
3,1,35.0,53.1,0,1,0,0,0,0,1,0
4,0,35.0,8.05,1,0,0,1,0,0,1,1


In [16]:
# One-hot encode the categorical columns, then remove Sex_female (Sex_male is
# kept as the single sex indicator) along with the PassengerId, Name and
# Ticket columns. test_data itself keeps PassengerId.
testing = pd.get_dummies(
    test_data, columns=["Pclass", "Embarked", "Sex"]
).drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

# final_test is another name for the same frame, not a copy.
final_test = testing
final_test.head()

Unnamed: 0,Age,Fare,Alone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male
0,34.5,7.8292,1,0,0,1,0,1,0,1
1,47.0,7.0,0,0,0,1,0,0,1,0
2,62.0,9.6875,1,0,1,0,0,1,0,1
3,27.0,8.6625,1,0,0,1,0,0,1,1
4,22.0,12.2875,0,0,0,1,0,0,1,0


In [17]:
# Add a binary Minor flag to both feature frames: 1 when Age is 16 or below,
# otherwise 0.
for frame in (final_train, final_test):
    frame['Minor'] = np.where(frame['Age'] <= 16, 1, 0)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Feature columns fed to the classifier, in a fixed order that is reused at
# prediction time.
cols = [
    "Age", "Alone", "Fare", "Minor",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Sex_male",
]

# Split the training frame into the feature matrix and the Survived target.
X, y = final_train[cols], final_train['Survived']

# Fit a logistic regression classifier with the liblinear solver.
model = LogisticRegression(solver='liblinear')
model.fit(X, y)

In [19]:

# Predict survival for the test passengers from the same feature columns the
# model was fitted on, and store the predictions on the test frame.
predicted_survival = model.predict(final_test[cols])
final_test['Survived'] = predicted_survival

# PassengerId was dropped from the feature frame, so take it from test_data.
final_test['PassengerId'] = test_data['PassengerId']

# Keep only the two submission columns and write them to CSV without the index.
submission = final_test.loc[:, ['PassengerId', 'Survived']]
submission.to_csv("submission.csv", index=False)

submission.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0
