#### Importing The Dependencies

In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Analyzing the dataset

In [165]:
# Load the training set. PassengerId is read as the index and then discarded,
# leaving a default 0..n-1 row index.
data = (
    pd.read_csv('./datasets/train.csv', index_col='PassengerId')
    .reset_index(drop=True)
)

In [166]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [167]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


Columns Description:
- Survived: Whether the person survived or not
- Pclass: Passenger's class (1, 2, or 3)
- Name: Passenger's name
- Sex: Passenger's gender
- Age: Passenger's age
- SibSp: Number of siblings/spouses aboard
- Parch: Number of parents/children aboard
- Ticket: Ticket number
- Fare: Fare
- Cabin: Cabin
- Embarked: Port of embarkation

In [168]:
data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


#### Transforming the data

There are 3 ports of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton.

In [169]:
# One-hot encode the port of embarkation into integer 0/1 indicator columns.
# NOTE: despite its name, `ports_test` holds the indicators for the *training*
# frame here; the name is kept because the following cell joins on it.
ports_test = pd.get_dummies(data['Embarked'], prefix="EmbarkedFrom", dtype=int)
ports_test.head()

Unnamed: 0,EmbarkedFrom_C,EmbarkedFrom_Q,EmbarkedFrom_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [170]:
# Attach the indicator columns to the passengers. `ports_test` was derived from
# `data['Embarked']`, so both frames share the same row index: a default (left)
# index join keeps every passenger row, and validate="one_to_one" makes pandas
# raise if either index contains duplicates instead of silently multiplying rows.
data = data.join(ports_test, validate="one_to_one")
data = data.drop(columns=['Embarked'])  # the raw categorical column is now redundant
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFrom_C,EmbarkedFrom_Q,EmbarkedFrom_S
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,0,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,0,0,1


Transforming the sex data

In [171]:
# Encode sex numerically: male -> 0, female -> 1.
data['Sex'] = data['Sex'].astype('string').map({'male': 0, 'female': 1})
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFrom_C,EmbarkedFrom_Q,EmbarkedFrom_S
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,0,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0,1


#### Making the model

Dropping unnecessary columns

In [172]:
# Free-text / identifier columns are not used as model features.
data = data.drop(columns=['Cabin', 'Ticket', 'Name'])

In [173]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Survived        891 non-null    int64  
 1   Pclass          891 non-null    int64  
 2   Sex             891 non-null    int64  
 3   Age             714 non-null    float64
 4   SibSp           891 non-null    int64  
 5   Parch           891 non-null    int64  
 6   Fare            891 non-null    float64
 7   EmbarkedFrom_C  891 non-null    int32  
 8   EmbarkedFrom_Q  891 non-null    int32  
 9   EmbarkedFrom_S  891 non-null    int32  
dtypes: float64(2), int32(3), int64(5)
memory usage: 59.3 KB


Checking the missing values

In [174]:
data.isna().sum()

Survived            0
Pclass              0
Sex                 0
Age               177
SibSp               0
Parch               0
Fare                0
EmbarkedFrom_C      0
EmbarkedFrom_Q      0
EmbarkedFrom_S      0
dtype: int64

We're going to fill in the missing values of age with the mean of the ages

In [175]:
# Replace missing ages with the mean of the observed ages.
data = data.assign(Age=data['Age'].fillna(data['Age'].mean()))

In [176]:
data.isna().sum()

Survived          0
Pclass            0
Sex               0
Age               0
SibSp             0
Parch             0
Fare              0
EmbarkedFrom_C    0
EmbarkedFrom_Q    0
EmbarkedFrom_S    0
dtype: int64

Extracting the target from the features

In [177]:
# Split the frame into the target vector and the feature matrix. `to_numpy`
# converts directly, without first materialising every value as a Python object.
y = data['Survived'].to_numpy()
X = data.drop(columns=['Survived']).to_numpy(dtype=float)  # every column except the target

In [178]:
def calculate_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the fraction of predictions that equal the true labels.

    Both inputs may be any array-like (NumPy array, list, ...). They are
    flattened before being compared, so a column vector of shape (n, 1) is
    scored like a flat vector of shape (n,) instead of being broadcast
    against it into an (n, n) comparison.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, one per entry of ``y_true``.

    Returns:
        The accuracy in [0, 1], or ``nan`` when there are no samples.

    Raises:
        ValueError: If the inputs do not contain the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        # Accuracy is undefined without samples; avoid a division by zero.
        return float("nan")

    return np.count_nonzero(y_true == y_pred) / y_true.size

Let's make heuristic predictions first to establish a baseline, so we can check that the model performs significantly better than a naive guess.

In [179]:
def make_heurastic_predictions(data: pd.DataFrame) -> list:
    """Predict survival with a fixed hand-written rule (no learning involved).

    A passenger is predicted to survive (1) when ``Sex == 1``, or when the
    passenger is under 18 and travels in first class; everyone else is
    predicted not to survive (0). A missing ``Age`` never satisfies the
    "under 18" condition.

    Args:
        data: Frame with numeric ``Sex``, ``Age`` and ``Pclass`` columns.

    Returns:
        A list of 0/1 integers, one per row of ``data``, in row order.
    """
    # Evaluate the rule on whole columns at once instead of iterating over rows.
    is_female = data['Sex'] == 1
    is_first_class_minor = (data['Age'] < 18) & (data['Pclass'] == 1)

    return (is_female | is_first_class_minor).astype(int).tolist()

In [180]:
# Score the rule-based baseline against the training labels; the printed value
# is the reference the trained model is meant to be compared with.
bold_predictions = make_heurastic_predictions(data)
bold_predictions_accuracy = calculate_accuracy(y, bold_predictions)
print(f"Baseline: {bold_predictions_accuracy}")

Baseline: 0.7912457912457912


Now, let's make the Logistic Regression model

In [181]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model computes ``sigmoid(X @ weights + bias)`` and minimises the binary
    cross-entropy between that probability and the 0/1 targets.
    """

    def __init__(self, learning_rate=0.001, n_iters=1000):
        """Store the hyper-parameters; the model is unfitted until ``fit`` runs.

        Args:
            learning_rate: Step size applied to each gradient update.
            n_iters: Number of gradient-descent iterations performed by ``fit``.
        """
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None  # set by fit(): one coefficient per feature
        self.bias = None     # set by fit(): scalar intercept
        self.losses = []     # loss recorded at each iteration of the latest fit()

    def _sigmoid(self, x: np.ndarray):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        # Clipping keeps np.exp inside the float64 range, so extreme inputs
        # saturate towards 0 or 1 instead of triggering overflow warnings.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Return the mean binary cross-entropy of ``y_pred`` against ``y_true``.

        Args:
            y_true: 0/1 targets.
            y_pred: Predicted probabilities, same shape as ``y_true``.
        """
        epsilon = 1e-9  # keeps log() finite when a probability is exactly 0 or 1
        positive_term = y_true * np.log(y_pred + epsilon)
        negative_term = (1 - y_true) * np.log(1 - y_pred + epsilon)
        return -np.mean(positive_term + negative_term)

    def feed_forward(self, X: np.ndarray):
        """Return the predicted probability of class 1 for every row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("LogisticRegression must be fitted before it can predict")
        z = np.dot(X, self.weights) + self.bias
        return self._sigmoid(z)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters and the loss history are re-initialised on every call, so
        fitting twice does not mix the results of two runs.

        Args:
            X: Feature matrix of shape (n_samples, n_features).
            y: 0/1 targets, one per row of ``X`` (flattened before use).

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or ``y`` does not have
                one entry per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so an (n, 1) target is not broadcast into an (n, n) residual.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.losses = []

        # gradient descent
        for _ in range(self.n_iters):
            A = self.feed_forward(X)
            self.losses.append(self.compute_loss(y, A))

            # Gradient of the cross-entropy w.r.t. the pre-activation z.
            dz = A - y

            # Calculate gradients
            dw = (1 / n_samples) * np.dot(X.T, dz)
            db = (1 / n_samples) * np.sum(dz)

            # Update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X: np.ndarray):
        """Return the predicted class (0 or 1) for every row of ``X``.

        A row is assigned class 1 when its predicted probability is strictly
        greater than 0.5.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        threshold = .5
        probabilities = self.feed_forward(X)
        return (probabilities > threshold).astype(int)

In [182]:
# Train on the full training matrix.
# NOTE(review): the features are passed in unscaled and the learning rate is
# very small; the mostly-zero test predictions shown at the end of the notebook
# suggest the model may be under-trained. Consider inspecting `model.losses`
# and standardising the features - TODO confirm.
model = LogisticRegression(learning_rate=.0001, n_iters=1000)
model.fit(X, y)

Making predictions

In [183]:
train_predictions = model.predict(X)
# Score the model on the training data so it can actually be compared with the
# heuristic baseline printed earlier.
train_accuracy = calculate_accuracy(y, train_predictions)
print(f"Train accuracy: {train_accuracy}")

Testing the model

In [184]:
# Load the test set. PassengerId stays a regular column here and is dropped
# later, together with the other non-feature columns.
data_test = pd.read_csv('./datasets/test.csv')
data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [185]:
# One-hot encode the port of embarkation for the test set. The result is
# reindexed to the three indicator columns produced for the training set, so
# the feature layout stays identical even if a port never occurs in the test
# data (a missing column is created and filled with 0).
ports_test = (
    pd.get_dummies(data_test['Embarked'], prefix="EmbarkedFrom", dtype=int)
    .reindex(columns=['EmbarkedFrom_C', 'EmbarkedFrom_Q', 'EmbarkedFrom_S'], fill_value=0)
)
ports_test.head()

Unnamed: 0,EmbarkedFrom_C,EmbarkedFrom_Q,EmbarkedFrom_S
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1


In [186]:
# Attach the indicator columns to the test passengers. `ports_test` was derived
# from `data_test['Embarked']`, so both frames share the same row index: a
# default (left) index join keeps every row, and validate="one_to_one" makes
# pandas raise if either index contains duplicates.
data_test = data_test.join(ports_test, validate="one_to_one")
data_test = data_test.drop(columns=['Embarked'])  # the raw categorical column is now redundant
data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFrom_C,EmbarkedFrom_Q,EmbarkedFrom_S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,0,0,1


In [187]:
# Encode sex numerically with the same mapping as the training set:
# male -> 0, female -> 1.
data_test['Sex'] = data_test['Sex'].astype('string').map({'male': 0, 'female': 1})
data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedFrom_C,EmbarkedFrom_Q,EmbarkedFrom_S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,0,1,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0,0,1


In [188]:
# Keep only the columns the model was trained on; PassengerId is an identifier,
# not a feature.
data_test = data_test.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'])

In [189]:
# Build the test feature matrix (the test set has no target column).
# Columns are selected in the training feature order, and any missing values
# are filled with the corresponding training-set column means. Without this,
# a NaN in a test row propagates through the sigmoid and the row is silently
# predicted as 0, because `nan > 0.5` is False.
feature_columns = data.columns.drop('Survived')
X_test = (
    data_test[feature_columns]
    .fillna(data[feature_columns].mean())
    .to_numpy(dtype=float)
)

In [190]:
test_predictions = model.predict(X_test)

In [191]:
test_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,