## Imports

In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import numpy as np
import autograd.numpy as npa
from autograd import grad
import random

## Data observation

In [78]:
data = pd.read_csv('/Users/artem/Documents/titanic.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [79]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 315.0 KB


## Preprocessing

In [80]:
def embarked_changer(embarked):
    if embarked == 'C':
        embarked = 0
    elif embarked == 'Q':
        embarked = 1
    elif embarked == 'S':
        embarked = 2
    return embarked


def age_processing(survived, sex):
    dead_women_mean_age = int(data[(data['survived'] == 0) & (data['sex'] == 0)]['age'].mean())
    survived_women_mean_age = int(data[(data['survived'] == 1) & (data['sex'] == 0)]['age'].mean())
    dead_men_mean_age = int(data[(data['survived'] == 0) & (data['sex'] == 1)]['age'].mean())
    survived_men_mean_age = int(data[(data['survived'] == 1) & (data['sex'] == 1)]['age'].mean())
    
    if survived == 0 and sex == 0:
        age = dead_women_mean_age
    elif survived == 1 and sex == 0:
        age = survived_women_mean_age
    elif survived == 0 and sex == 1:
        age = dead_men_mean_age
    elif survived == 1 and sex == 1:
        age = survived_men_mean_age
    return age
    
def change_age(row):
    age = row['age']
    survived = row['survived']
    sex = row['sex']
    
    if np.isnan(age):
        age = age_processing(survived, sex)
    return age


def preprocessing(data: pd.DataFrame) -> pd.DataFrame:
    '''
    Preparing dataset for machine learning models.
    
    Parameters
    ----------
        data : pd.DataFrame
            Input dataframe containing titanic passangers data.
    
    Returns
    -------
        pd.DataFrame
            Processed dataframe.
    '''
    # rename columns to lowercase
    data.columns = [column.lower() for column in data.columns]
    
    # changing columns sex and embarked from string to int
    data['sex'] = data['sex'].apply(lambda x: 1 if x == 'male' else 0)
    data['embarked'] = data['embarked'].apply(embarked_changer)

    # dropping unnecessary columns 
    data = data.drop('ticket', axis=1)
    data = data.drop('cabin', axis=1)
    data = data.drop('passengerid', axis=1)
    data = data.drop('name', axis=1)
    
    # filling nan values of age
    data['age'] = data.apply(change_age, axis=1)
    
    # dropping nan values
    data = data.dropna()
    
    # changing columns datatypes
    data = data.astype(
        {
            'sex': np.uint8,
            'survived': np.uint8, 
            'pclass': np.uint8,
            'sibsp': np.uint8,
            'parch': np.uint8,
            'embarked': np.uint8
        }
    )
    
    return data
    

In [81]:
df = preprocessing(data=data)

In [82]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
888,0,3,0,25.0,1,2,23.4500,2
889,1,1,1,26.0,0,0,30.0000,0


In [83]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  889 non-null    uint8  
 1   pclass    889 non-null    uint8  
 2   sex       889 non-null    uint8  
 3   age       889 non-null    float64
 4   sibsp     889 non-null    uint8  
 5   parch     889 non-null    uint8  
 6   fare      889 non-null    float64
 7   embarked  889 non-null    uint8  
dtypes: float64(2), uint8(6)
memory usage: 26.0 KB


## Model Training

### Dataset preparation

In [84]:
X = df.drop('survived', axis=1)
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regresion

In [85]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logistic_preds = logreg.predict(X_test)
sklearn.metrics.f1_score(y_test, logistic_preds)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7397260273972603

### Tree Classifier

In [86]:
tree_3 = DecisionTreeClassifier(max_depth=3)
tree_3.fit(X_train, y_train)
tree_3_preds = tree_3.predict(X_test)
sklearn.metrics.f1_score(y_test, tree_3_preds)

0.7714285714285714

### Random Forest

In [87]:
forest = RandomForestClassifier(n_estimators=500, max_depth=5)
forest.fit(X_train, y_train)
forest_preds = forest.predict(X_test)
sklearn.metrics.f1_score(y_test, forest_preds)

0.7480916030534351

### Support Vectors Classifier

In [88]:
svc = SVC()
svc.fit(X_train, y_train)
svc_preds = svc.predict(X_test)
sklearn.metrics.f1_score(y_test, svc_preds)

0.44000000000000006

### XGBoost

In [89]:
boosting = XGBClassifier(n_estimators=201, max_depth=3)
boosting.fit(X_train, y_train)
boosting_preds = boosting.predict(X_test)
sklearn.metrics.f1_score(y_test, boosting_preds)

0.7883211678832117

In [90]:
class LogReg:

    def __init__(self, lr = 0.001, penalty='l2', tol=1e-4, alpha=1.0, fit_intercept=True, max_iter=100, verbose=0, seed: int = 42):
        self.penalty = penalty
        self.tol = tol
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.verbose = verbose
        self.lr = lr
        self.seed = seed
        
        
    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame) and not isinstance(X, np.ndarray):
            raise ValueError("X must be a pd.DataFrame or a numpy.ndarray")
        elif not isinstance(y, pd.DataFrame) and not isinstance(y, np.ndarray):
            raise ValueError("y must be a pd.DataFrame or a numpy.ndarray")
        
        rng = np.random.default_rng(self.seed)
        self.weights = npa.array([rng.normal() for _ in range(X.shape[1])])
        for sample, target in zip(X, y):
            self.target = target
            self.sample = npa.array(sample)
            x = sum(self.weights * sample)
            prediction = self.sigmoid(x)
            loss = self.log_loss(prediction, target)
            gradient = grad(self.logloss_fn)(self.weights)
            self.weights = self.weights - self.lr * gradient
        
    def sigmoid(self, x):
        if x >= 0:
            z = np.exp(-x)
            return 1 / (1 + z + 1e-2)
        else:
            z = np.exp(x)
            return z / (1 + z + 1e-2)
    
    def sigmoid_fn(self, x):
        return 1 / (1 + npa.exp(-x) + 1e-2)
    
    def reg_term(self, weights):
        if self.penalty == 'l1':
            return self.alpha * sum(abs(weights))
        return self.alpha * sum(self.weights ** 2)
    
    def logloss_fn(self, weights):
        z = npa.dot(weights, self.sample)
        
        if self.penalty == 'l1':
            return -(self.target * npa.log(self.sigmoid_fn(z)) + (1 - self.target) * npa.log(1 - self.sigmoid_fn(z))) + self.reg_term(weights)
        elif self.penalty == 'l2':
            return -(self.target * npa.log(self.sigmoid_fn(z)) + (1 - self.target) * npa.log(1 - self.sigmoid_fn(z))) + self.reg_term(weights)
        else:
            return -(self.target * npa.log(self.sigmoid_fn(z)) + (1 - self.target) * npa.log(1 - self.sigmoid_fn(z)))
        
    
    def log_loss(self, predicts, targets):
        loss = -(targets * npa.log(predicts) + (1 - targets) * npa.log(1 - predicts))
        return loss
        
    def predict(self, X_test):
        predictions = []
        for sample in X_test:
            x = sum(self.weights * sample)
            prediction = self.sigmoid(x)
            if prediction >= 0.5:
                predictions.append(1)
            else:
                predictions.append(0)
        return predictions
            
        

In [None]:
class LinearRegression:
    pass

In [91]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [101]:
seeds = np.linspace(30, 61)
scores = {}
for seed in range(1, 31):
    logreg = LogReg(penalty='None', lr=0.0001, seed=26)
    logreg.fit(np.array(X_train), np.array(y_train))
    predict = logreg.predict(np.array(X_test))
    scores[seed] = sklearn.metrics.accuracy_score(y_test, predict)

scores


{1: 0.2696629213483146,
 2: 0.651685393258427,
 3: 0.6629213483146067,
 4: 0.6573033707865169,
 5: 0.6404494382022472,
 6: 0.29213483146067415,
 7: 0.2303370786516854,
 8: 0.6123595505617978,
 9: 0.6404494382022472,
 10: 0.7134831460674157,
 11: 0.38202247191011235,
 12: 0.24719101123595505,
 13: 0.5449438202247191,
 14: 0.6460674157303371,
 15: 0.6853932584269663,
 16: 0.37640449438202245,
 17: 0.3539325842696629,
 18: 0.7247191011235955,
 19: 0.33707865168539325,
 20: 0.2696629213483146,
 21: 0.25280898876404495,
 22: 0.6797752808988764,
 23: 0.3202247191011236,
 24: 0.33707865168539325,
 25: 0.37640449438202245,
 26: 0.7303370786516854,
 27: 0.38764044943820225,
 28: 0.6573033707865169,
 29: 0.6179775280898876,
 30: 0.2752808988764045}

In [93]:
x = 5
y = 0
sigmoid = 1/(1+np.exp(-x))
loss = -(y * np.log(sigmoid) + (1 - y) * np.log(1 - sigmoid))
loss

5.006715348489137

In [94]:
import autograd.numpy as npa
from autograd import grad

def sigmoid_fn(x):
    return 1 / (1 + npa.exp(-x))

def logloss_fn(weights):
    z = npa.dot(weights, X)  # Dot product of weights and input data
    return -(y * npa.log(sigmoid_fn(z)) + (1 - y) * npa.log(1 - sigmoid_fn(z)))

# Define your input data X
X = npa.array([1, 2, 3])  # Example input data

y = 1
weights = npa.array([-1.5, 2.5, 0.3])

gradient = grad(logloss_fn)(weights)

print(gradient)

[-0.01212843 -0.02425687 -0.0363853 ]


f = $x^2$

f' = $2x$

grad f' at x = 2 is 4