In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [94]:
class MyBinaryLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    An intercept column of ones is prepended to the features internally, so
    ``coefs_[0]`` is the intercept and ``coefs_[1:]`` follow the column order
    of the frame passed to ``fit``.

    Parameters
    ----------
    learning_rate : float
        Step size multiplied with the gradient on every update.
    max_iter : int
        Upper bound on the number of gradient steps.
    eps : float
        Training stops early once the Euclidean norm of a single update
        step is less than or equal to this value.
    random_state : int or None
        Seed for the random initial coefficients. ``None`` (the default)
        draws from NumPy's global generator, exactly as before this
        parameter existed, so ``np.random.seed`` still controls the result.

    Attributes
    ----------
    coefs_ : numpy.ndarray or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    feature_names_in_ : list
        ``'intercept'`` followed by the training column names (set by ``fit``).
    """

    # Offset added inside the logarithms of ``log_loss`` so log(0) never occurs.
    _LOG_EPS = 1e-30

    def __init__(self, learning_rate=0.05, max_iter=1000000, eps=0.0001, random_state=None):
        self.coefs_ = None
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.eps = eps
        self.random_state = random_state

    @staticmethod
    def _as_1d(y):
        """Flatten labels (DataFrame, Series, list or ndarray) to a 1-D array."""
        return np.asarray(y).reshape(-1)

    @staticmethod
    def _add_intercept(X):
        """Return ``X`` as an array with a leading column of ones."""
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def _sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

        exp(-log(1 + exp(-x))) is algebraically the same value, but
        ``np.logaddexp`` never forms the huge intermediate ``exp(-x)`` that
        triggers an overflow RuntimeWarning for very negative ``x``.
        """
        x = np.asarray(x, dtype=float)
        return np.exp(-np.logaddexp(0.0, -x))

    def _gradient(self, X, y, coefs):
        """Gradient of the mean log-loss with respect to ``coefs``.

        ``X`` must already contain the intercept column and ``y`` must be a
        1-D array with one label per row of ``X``.
        """
        predictions = self._sigmoid(np.dot(X, coefs))
        gradient = np.dot(X.T, predictions - y) / len(y)
        return gradient

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        """Estimate the coefficients by gradient descent.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix; its column names are stored in
            ``feature_names_in_``.
        y : array-like
            One 0/1 label per row of ``X`` (single-column DataFrame, Series,
            list or ndarray).

        Returns
        -------
        MyBinaryLogisticRegression
            The fitted estimator itself, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the row count.
        """
        # Flatten the labels once instead of on every iteration.
        target = self._as_1d(y)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if X.shape[0] != target.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {target.shape[0]} labels."
            )

        self.feature_names_in_ = ['intercept'] + list(X.columns)
        design = self._add_intercept(X)

        # np.random (module) and RandomState share the ``randn`` API, so the
        # unseeded default keeps using the global generator.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        self.coefs_ = rng.randn(design.shape[1])

        for _ in range(self.max_iter):
            step = self.learning_rate * self._gradient(design, target, self.coefs_)
            self.coefs_ -= step
            # A tiny update means the coefficients have effectively settled.
            if np.linalg.norm(step) <= self.eps:
                break
        return self

    def predict_proba(self, X: pd.DataFrame):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self._sigmoid(np.dot(self._add_intercept(X), self.coefs_))

    def predict(self, X: pd.DataFrame):
        """Return hard 0.0 / 1.0 predictions by rounding the probabilities."""
        probabilities = self.predict_proba(X)
        return np.round(probabilities)

    def log_loss(self, X: pd.DataFrame, y: pd.DataFrame):
        """Mean negative log-likelihood of the labels ``y`` under the model.

        Rows whose label is neither 0 nor 1 contribute nothing to the sum but
        still count in the denominator.
        """
        target = self._as_1d(y)
        y_proba = self.predict_proba(X)
        logloss_1 = np.sum(np.log(y_proba[target == 1] + self._LOG_EPS))
        logloss_0 = np.sum(np.log(1 - y_proba[target == 0] + self._LOG_EPS))
        logloss_total = -(logloss_0 + logloss_1) / len(target)
        return logloss_total

    def accuracy(self, X: pd.DataFrame, y: pd.DataFrame):
        """Fraction of rows whose rounded prediction equals the label."""
        target = self._as_1d(y)
        return np.sum(self.predict(X) == target) / len(target)

    def score(self, X: pd.DataFrame, y: pd.DataFrame):
        """Return the tuple ``(log_loss, accuracy)`` for ``X`` and ``y``."""
        return self.log_loss(X, y), self.accuracy(X, y)

In [42]:
# Read the penguin measurements from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer='penguins_binary_classification.csv',
    sep=',',
)
# Bare last expression: lets the notebook render the frame as a table.
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,2007
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,2007
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,2007
...,...,...,...,...,...,...,...
269,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,2009
270,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,2009
271,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,2009
272,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,2009


In [43]:
# Learn one integer code per distinct value of each categorical column. The
# fitted encoders are kept so the codes can be mapped back to the labels.
labelencoder_species = LabelEncoder().fit(df['species'])
labelencoder_island = LabelEncoder().fit(df['island'])

# Overwrite the string columns with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; for
# the `island` feature the integer codes imply an ordering - confirm that is
# acceptable or consider one-hot encoding.
df['species'] = labelencoder_species.transform(df['species'])
df['island'] = labelencoder_island.transform(df['island'])

In [44]:
# Features: every column except the encoded target.
X = df.drop(columns='species')
# Target: double brackets keep it a one-column DataFrame rather than a Series.
y = df[['species']]

In [45]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [63]:
# Fit the scaler on the training split only, then apply the same transform to
# both splits, so the test rows do not influence the learned min/max.
scaler = MinMaxScaler()
scaler.fit(X_train)

# scaler.transform returns a bare ndarray. Rebuild the frames with the
# original column names AND the original row index; without `index=` the
# frames would get a fresh 0..n-1 index and no longer line up with
# y_train / y_test in any index-aligned pandas operation.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [97]:
# fit() draws its initial coefficients from NumPy's global random generator,
# so seed it here to make the metrics below reproducible on a fresh
# Restart & Run All.
np.random.seed(10)

reg = MyBinaryLogisticRegression()
reg.fit(X_train, y_train)

# score() returns a (log-loss, accuracy) tuple for the given split.
score_train = reg.score(X_train, y_train)
score_test = reg.score(X_test, y_test)

print('Logloss')
print(f' train: {score_train[0]}')
print(f' test: {score_test[0]}')

print('Accuracy')
print(f' train: {score_train[1]}')
print(f' test: {score_test[1]}')

Logloss
 train: 0.007001402299432101
 test: 0.0068433896146424
Accuracy
 train: 1.0
 test: 1.0
