In [212]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [213]:
class MyLogisticRegression:
  def __init__(self, learning_rate=0.01, max_iter=1000, threshold=0.5):
    self.weights = None
    self.bias = None
    self.learning_rate = learning_rate
    self.max_iter = max_iter
    self.threshold = threshold

  def loss(self, y, hypothesis):
    m = len(y)
    epsilon = 1e-5
    j_theta = (-1 / m) * np.sum((y * np.log(hypothesis + epsilon)) + ((1 - y) * np.log(1 - hypothesis + epsilon)))
    return j_theta
  
  def sigmoid(self, z):
    return 1 / (1 + np.exp(-z))

  def hypothesis(self, X):
    return self.sigmoid(np.dot(X, self.weights) + self.bias)
  
  def fit(self, X, y):
    m, n = X.shape
    self.weights = np.zeros(n)
    self.bias = 0

    for _ in range(self.max_iter):
      # Forward Propagation
      y_hat = self.hypothesis(X)

      # Compute Average Loss
      avg_loss = self.loss(y, y_hat)

      # Backward Propagation
      derivative_theta = (1 / m) * np.dot(y_hat - y, X)
      derivative_bias = (1 / m) * np.sum(y_hat - y)
      self.weights -= self.learning_rate * derivative_theta
      self.bias -= self.learning_rate * derivative_bias

  def predict(self, X):
    return self.hypothesis(X) >= self.threshold
    


In [214]:
# Load the iris dataset

df = load_iris()
X = df.data
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.7)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [215]:
# Hogwarts Dataset

df = pd.read_csv('../datasets/dataset_train.csv')

label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['Hogwarts House'])

numeric_features = df.select_dtypes(include=[np.number])
imp = SimpleImputer(strategy='mean')
numeric_features = pd.DataFrame(imp.fit_transform(numeric_features), columns=numeric_features.columns)

X = numeric_features.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [216]:
LR = LogisticRegression(max_iter=1000)

LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

accuracy_score(y_test, y_pred)

0.9875

In [217]:
MyLR = MyLogisticRegression(learning_rate=0.01, max_iter=1000, threshold=0.5)

MyLR.fit(X_train, y_train)
y_pred = MyLR.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.375
