## Titanic survivors
Logistic regression model using scikit-learn
<div stlye="text-align:center">
<img src="titanic.png" width=75%>
</div>

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot') or plt.style.use('ggplot')

## Data
Lest get the raw data, to see what is like

In [47]:
data = pd.read_csv('dataset/train.csv')
test_data = pd.read_csv('dataset/test.csv')

In [33]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Removing non important columns
There are columns that won't help the algorithm, so we are going to take those out

In [34]:
columns_to_drop = ['PassengerId','Name','Ticket']
df = data.drop(columns=columns_to_drop)
df


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


# Cleaning data
Let's check for null values

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


## One hot encoding
We are going to get our nominal data into one hot encoding

In [36]:
df = pd.get_dummies(df,prefix=['Cabin','Embarked'],columns=['Cabin','Embarked'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_A10,Cabin_A14,Cabin_A16,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,0,3,male,22.0,1,0,7.2500,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,female,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,3,female,26.0,0,0,7.9250,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,female,35.0,1,0,53.1000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,3,male,35.0,0,0,8.0500,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
887,1,1,female,19.0,0,0,30.0000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
888,0,3,female,,1,2,23.4500,0,0,0,...,0,0,0,0,0,0,0,0,0,1
889,1,1,male,26.0,0,0,30.0000,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [37]:
data.Embarked.unique()


array(['S', 'C', 'Q', nan], dtype=object)

In [38]:
data[data.Embarked.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [41]:
labelencoder = LabelEncoder()
df['Sex'] = labelencoder.fit_transform(df['Sex'])

df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_A10,Cabin_A14,Cabin_A16,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.2500,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,0,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,3,0,26.0,0,0,7.9250,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,0,35.0,1,0,53.1000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,3,1,35.0,0,0,8.0500,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
887,1,1,0,19.0,0,0,30.0000,0,0,0,...,0,0,0,0,0,0,0,0,0,1
888,0,3,0,,1,2,23.4500,0,0,0,...,0,0,0,0,0,0,0,0,0,1
889,1,1,1,26.0,0,0,30.0000,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [42]:
X_train = df.drop(columns=['Survived'])
y_train = df['Survived']

In [43]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_A10,Cabin_A14,Cabin_A16,Cabin_A19,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.0,1,0,7.2500,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,38.0,1,0,71.2833,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0,26.0,0,0,7.9250,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,35.0,1,0,53.1000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3,1,35.0,0,0,8.0500,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
887,1,0,19.0,0,0,30.0000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
888,3,0,,1,2,23.4500,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
889,1,1,26.0,0,0,30.0000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [46]:
model = LogisticRegression()

model.fit(X_train,y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values