**Importing Pandas, Numpy**

In [1]:


import numpy as np
import pandas as pd 


**Read The Files**

In [2]:
dtrain = pd.read_csv("../data/train.csv")
dtest = pd.read_csv("../data/test.csv")

**Cleaning Data:**

Dropping Tickets

In [3]:
dtrain.drop('Ticket',inplace = True,axis = 1)
dtest.drop('Ticket',inplace = True,axis = 1)
dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


Replacing Males with 0s, Females with 1s to get numerical data.

In [4]:
dtrain['Sex'].replace(['male','female'],[0,1],inplace = True)
dtest['Sex'].replace(['male','female'],[0,1],inplace = True)
dtrain.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,,S


**Embarked is an action that is irrelevant to hitting the iceberg, as it tells from where the passengers
had gone on board. therefore we'll remove that from out calculations using drop.**

In [5]:
dtrain.drop('Embarked',inplace = True,axis = 1)
dtest.drop('Embarked',inplace = True,axis = 1)
dtrain.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,


**Setting Passenger ID To be our index**

In [6]:
dtrain.set_index('PassengerId',inplace = True )
dtest.set_index('PassengerId',inplace = True )
dtrain.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85
3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,


**Dropping Fare, because I assume it doesn't really matter**

In [7]:
dtrain.drop('Fare',inplace=True,axis=1)
dtest.drop('Fare',inplace=True,axis=1)
dtest.head(3)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,"Kelly, Mr. James",0,34.5,0,0,
893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,
894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,


In [8]:
dtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    int64  
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Cabin     204 non-null    object 
dtypes: float64(1), int64(5), object(2)
memory usage: 62.6+ KB


In [9]:
dtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Name    418 non-null    object 
 2   Sex     418 non-null    int64  
 3   Age     332 non-null    float64
 4   SibSp   418 non-null    int64  
 5   Parch   418 non-null    int64  
 6   Cabin   91 non-null     object 
dtypes: float64(1), int64(4), object(2)
memory usage: 26.1+ KB


**Changing cabin from object to numerical (or preferably integers)**

In [10]:
dtrain['Cabin']=dtrain['Cabin'].astype(str)
dtest['Cabin']=dtest['Cabin'].astype(str)
dtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    int64  
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Cabin     891 non-null    object 
dtypes: float64(1), int64(5), object(2)
memory usage: 62.6+ KB


**Assuming that each letter in the Cabin's name corresponds to a specific zone inside the ship, then
that might be an important factor as the cabins which are close to the side which was drowning must have passengers
with lower survival rate
Therefore: extracting letters from cabins and dropping it after that**

In [11]:
def extract_letters(st, sst):
    for subs in sst:
        if str.find(st, subs) != -1:
            return subs
    return np.nan
zones = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'M']
dtrain['Zone'] = dtrain['Cabin'].map(lambda x: extract_letters(x, zones))
dtest['Zone'] = dtest['Cabin'].map(lambda x: extract_letters(x, zones))
dtrain.drop('Cabin',inplace=True,axis=1)
dtest.drop('Cabin',inplace=True,axis=1)

In [12]:
dtrain.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Zone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,C
3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,


**Fill missing values with ('M') for Missing values instead of keeping NaN values**

In [13]:
dtrain['Zone'] = dtrain['Zone'].fillna('M')
dtest['Zone'] = dtest['Zone'].fillna('M')

In [14]:
dtrain.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Zone          0
dtype: int64

In [15]:
dtest.isnull().sum()

Pclass     0
Name       0
Sex        0
Age       86
SibSp      0
Parch      0
Zone       0
dtype: int64

**Now only the age has NaN values, let's fix it using pandas.Dataframe.interpolate method**

In [16]:
dtrain['Age'].interpolate(method='linear',  limit_direction='both', inplace=True)
dtest['Age'].interpolate(method='linear',  limit_direction='both', inplace=True)

In [17]:
dtrain.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Zone        0
dtype: int64

In [18]:
dtest.isnull().sum()

Pclass    0
Name      0
Sex       0
Age       0
SibSp     0
Parch     0
Zone      0
dtype: int64

**dropping SibSp, Parch**

In [19]:
dtrain.drop('SibSp',inplace=True,axis=1)
dtrain.drop('Parch',inplace=True,axis=1)

In [20]:
dtest.drop('SibSp',inplace=True,axis=1)
dtest.drop('Parch',inplace=True,axis=1)

In [21]:
dtest.head(3)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,Zone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892,3,"Kelly, Mr. James",0,34.5,M
893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,M
894,2,"Myles, Mr. Thomas Francis",0,62.0,M


**Now checking correlation**

In [22]:
dtrain['Pclass'].corr(dtrain['Survived'])
dtrain['Age'].corr(dtrain['Survived'])
dtrain['Sex'].corr(dtrain['Survived'])

0.5433513806577551

**Evaluation**

In [23]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dtrain, test_size=0.3, random_state=42)
train.describe

<bound method NDFrame.describe of              Survived  Pclass                              Name  Sex   Age  \
PassengerId                                                                  
446                 1       1         Dodge, Master. Washington    0   4.0   
651                 0       3                 Mitkoff, Mr. Mito    0  20.5   
173                 1       3      Johnson, Miss. Eleanor Ileen    1   1.0   
451                 0       2             West, Mr. Edwy Arthur    0  36.0   
315                 0       2                Hart, Mr. Benjamin    0  43.0   
...               ...     ...                               ...  ...   ...   
107                 1       3  Salkjelsvik, Miss. Anna Kristine    1  21.0   
271                 0       1             Cairns, Mr. Alexander    0  30.0   
861                 0       3           Hansen, Mr. Claus Peter    0  41.0   
436                 1       1         Carter, Miss. Lucile Polk    1  14.0   
103                 0       1 

In [24]:
test.describe

<bound method NDFrame.describe of              Survived  Pclass  \
PassengerId                     
710                 1       3   
440                 0       2   
841                 0       3   
721                 1       2   
40                  1       3   
...               ...     ...   
822                 1       3   
634                 0       1   
457                 0       1   
501                 0       3   
431                 1       1   

                                                          Name  Sex   Age Zone  
PassengerId                                                                     
710          Moubarek, Master. Halim Gonios ("William George")    0  23.0    M  
440                     Kvillner, Mr. Johan Henrik Johannesson    0  31.0    M  
841                                Alhomaki, Mr. Ilmari Rudolf    0  20.0    M  
721                          Harper, Miss. Annie Jessie "Nina"    1   6.0    M  
40                                 Nicola-Yarred, 

In [25]:
from sklearn import linear_model

regr = linear_model.Lasso(alpha=0.01)
TrainingData = ["Pclass", "Zone" ,"Age", "Sex"]
Y = train['Survived']
X = pd.get_dummies(train[TrainingData])
X.drop('Zone_A', inplace=True, axis=1)
X.drop('Zone_T', inplace=True, axis=1)
X_test = pd.get_dummies(test[TrainingData])
regr.fit(X, Y)

Lasso(alpha=0.01)

In [26]:
X

Unnamed: 0_level_0,Pclass,Age,Sex,Zone_B,Zone_C,Zone_D,Zone_E,Zone_F,Zone_G,Zone_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
446,1,4.0,0,0,0,0,0,0,0,0
651,3,20.5,0,0,0,0,0,0,0,1
173,3,1.0,1,0,0,0,0,0,0,1
451,2,36.0,0,0,0,0,0,0,0,1
315,2,43.0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
107,3,21.0,1,0,0,0,0,0,0,1
271,1,30.0,0,0,0,0,0,0,0,1
861,3,41.0,0,0,0,0,0,0,0,1
436,1,14.0,1,1,0,0,0,0,0,0


In [27]:
X_test.drop("Zone_A",axis= 1 , inplace= True)

**Evaluation**

In [28]:

preds = regr.predict(X_test)
preds = np.round(preds).astype(int)
preds

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0])

In [29]:

from sklearn.metrics import mean_absolute_error

mean_absolute_error(test['Survived'], preds)

0.20522388059701493

**Training**

In [30]:
from sklearn import linear_model

regr = linear_model.Lasso(alpha=0.01)
Ytrain = dtrain['Survived']
Features = ["Pclass", "Zone" ,"Age", "Sex"]
Xtrain = pd.get_dummies(dtrain[Features])
Xtrain_test = pd.get_dummies(dtest[Features])
Xtrain.drop('Zone_T',inplace=True,axis=1)
regr.fit(Xtrain, Ytrain)

Lasso(alpha=0.01)

**Final Predictions**

In [31]:
Xtrain.head(7)

Unnamed: 0_level_0,Pclass,Age,Sex,Zone_A,Zone_B,Zone_C,Zone_D,Zone_E,Zone_F,Zone_G,Zone_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,22.0,0,0,0,0,0,0,0,0,1
2,1,38.0,1,0,0,1,0,0,0,0,0
3,3,26.0,1,0,0,0,0,0,0,0,1
4,1,35.0,1,0,0,1,0,0,0,0,0
5,3,35.0,0,0,0,0,0,0,0,0,1
6,3,44.5,0,0,0,0,0,0,0,0,1
7,1,54.0,0,0,0,0,0,1,0,0,0


In [32]:
Xtrain_test.head(20)

Unnamed: 0_level_0,Pclass,Age,Sex,Zone_A,Zone_B,Zone_C,Zone_D,Zone_E,Zone_F,Zone_G,Zone_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,34.5,0,0,0,0,0,0,0,0,1
893,3,47.0,1,0,0,0,0,0,0,0,1
894,2,62.0,0,0,0,0,0,0,0,0,1
895,3,27.0,0,0,0,0,0,0,0,0,1
896,3,22.0,1,0,0,0,0,0,0,0,1
897,3,14.0,0,0,0,0,0,0,0,0,1
898,3,30.0,1,0,0,0,0,0,0,0,1
899,2,26.0,0,0,0,0,0,0,0,0,1
900,3,18.0,1,0,0,0,0,0,0,0,1
901,3,21.0,0,0,0,0,0,0,0,0,1


In [33]:
Xtrain.head(3)

Unnamed: 0_level_0,Pclass,Age,Sex,Zone_A,Zone_B,Zone_C,Zone_D,Zone_E,Zone_F,Zone_G,Zone_M
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,22.0,0,0,0,0,0,0,0,0,1
2,1,38.0,1,0,0,1,0,0,0,0,0
3,3,26.0,1,0,0,0,0,0,0,0,1


In [34]:

preds = regr.predict(Xtrain_test)
preds = np.round(preds).astype(int)
preds

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [35]:

product = pd.DataFrame({'PassengerId': dtest.index, 'Survived': preds})
product.to_csv('submission.csv', index=False)
print("Submission saved")

Submission saved
