In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('../assets/datasets/dataset_train.csv')

In [14]:
import pandas as pd



def fill_nan_with_manual_trimmed_mean(df, house_col, trim_frac=0.1):

    feature_cols = df.select_dtypes(include=['number']).columns
    for feature in feature_cols:
        grouped = df.groupby(house_col)
        for house, group in grouped:
            values = group[feature].dropna().values
            
            sorted_values = np.sort(values)
            
            trim_count = int(len(sorted_values) * trim_frac)
            
            if len(sorted_values) > 2 * trim_count:
                trimmed_values = sorted_values[trim_count:-trim_count]
            else:
                trimmed_values = sorted_values
                
            trimmed_mean = np.mean(trimmed_values) if len(trimmed_values) > 0 else np.nan
            
            df.loc[(df[house_col] == house) & (df[feature].isna()), feature] = trimmed_mean
    return df

df = fill_nan_with_manual_trimmed_mean(df, house_col='Hogwarts House', feature_cols=feature_columns)

print(df)




       Index Hogwarts House First Name    Last Name    Birthday Best Hand  \
0        0.0      Ravenclaw     Tamara          Hsu  2000-03-30      Left   
1        1.0      Slytherin      Erich      Paredes  1999-10-14     Right   
2        2.0      Ravenclaw   Stephany        Braun  1999-11-03      Left   
3        3.0     Gryffindor      Vesta    Mcmichael  2000-08-19      Left   
4        4.0     Gryffindor     Gaston        Gibbs  1998-09-27      Left   
...      ...            ...        ...          ...         ...       ...   
1595  1595.0     Gryffindor       Jung        Blank  2001-09-14     Right   
1596  1596.0      Slytherin     Shelli         Lock  1998-03-12      Left   
1597  1597.0     Gryffindor   Benjamin  Christensen  1999-10-24     Right   
1598  1598.0     Hufflepuff  Charlotte       Dillon  2001-09-21      Left   
1599  1599.0     Hufflepuff      Kylie        Nowak  2000-08-21      Left   

      Arithmancy   Astronomy  Herbology  Defense Against the Dark Arts  \
0

  df.loc[(df[house_col] == house) & (df[feature].isna()), feature] = trimmed_mean


In [112]:
df.head()

Unnamed: 0,Index,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64
4,4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98


In [113]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

class LogisticRegression():
    def __init__(self, lr=0.1, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        self.X_min = None
        self.X_max = None
    
    def min_max_scale(self, X):
        """Scale the dataset using Min-Max scaling."""
        self.X_min = np.min(X, axis=0)
        self.X_max = np.max(X, axis=0)
        return (X - self.X_min) / (self.X_max - self.X_min + 1e-8)  # Adding epsilon to avoid division by zero < - sikintiii
    
    def fit(self, X, y):
        X = self.min_max_scale(X)
        
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        # Gradient descent
        for _ in range(self.n_iters):
            linear_predictions = np.dot(X, self.weights) + self.bias
            predictions = sigmoid(linear_predictions)
            
            dw = (1/n_samples) * np.dot(X.T, (predictions - y))
            db = (1/n_samples) * np.sum(predictions - y)
            self.weights = self.weights - self.lr * dw
            self.bias = self.bias - self.lr * db
    
    def predict(self, X):
        X = (X - self.X_min) / (self.X_max - self.X_min + 1e-8)
        
        linear_predictions = np.dot(X, self.weights) + self.bias
        y_pred = sigmoid(linear_predictions)
        class_pred = [0 if y <= 0.5 else 1 for y in y_pred]
        return class_pred


In [114]:
def accuracy(y_pred, y_test):
    return np.sum(y_pred==y_test)/ len(y_test)

In [10]:
def get_name(x):
    if x == 'Gryffindor':
        return 0
    elif x == 'Slytherin':
        return 1
    elif x == 'Ravenclaw':
        return 2
    else:
        return 3


df['big_y'] = df['Hogwarts House'].apply(get_name)

In [116]:
df.head()

Unnamed: 0,Index,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying,big_y
0,0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89,2
1,1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45,1
2,2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42,2
3,3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64,0
4,4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98,0


In [117]:
df = df.replace(np.nan, 0)

In [8]:
def returnX_y(df):
    y = df['big_y']
    X = df[['Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts', 'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 'Potions','Care of Magical Creatures', 'Charms', 'Flying']]
    return X, y

Test to see if it's Grifindor or not.

In [119]:
X, y = returnX_y(df[(df['big_y'] == 0) | (df['big_y'] == 1)])

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
clf = LogisticRegression(lr=0.01, n_iters=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [122]:
acc = accuracy(y_pred, y_test)
print(acc)

1.0


In [123]:
print(y_pred, y_test)

[0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0] 1458    0
1492    0
1397    0
540     1
1212    1
       ..
1348    0
816     1
551     1
870     1
948     0
Name: big_y, Length: 126, dtype: int64


In [11]:
X, y = returnX_y(df)

df_filled = fill_trimmed_mean_for_all(X, 'Hogwarts House', trim_fraction)

KeyError: 'Hogwarts House'

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [126]:
def min_max_normalize(X, feature_range=(0, 1)):
    """
        min max normalizer klasik 
    """
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    X_std = (X - X_min) / (X_max - X_min)
    X_scaled = X_std * (feature_range[1] - feature_range[0]) + feature_range[0]
    return X_scaled, X_min, X_max

In [127]:

class OneVsRestClassifier:
    def __init__(self, base_classifier, **kwargs):
        self.base_classifier = base_classifier
        self.kwargs = kwargs
        self.models = []
        self.X_min = None
        self.X_max = None
    
    def fit(self, X, y):
        # Normalize features
        X, self.X_min, self.X_max = min_max_normalize(X)
        
        self.classes = np.unique(y)
        self.models = []
        
        for c in self.classes:
            y_binary = np.where(y == c, 1, 0)
            model = self.base_classifier(**self.kwargs)
            model.fit(X, y_binary)
            self.models.append(model)
    
    def predict(self, X):
        # Normalize using training min and max
        X = (X - self.X_min) / (self.X_max - self.X_min)
        
        predictions = []
        for model in self.models:
            linear_predictions = np.dot(X, model.weights) + model.bias
            predictions.append(sigmoid(linear_predictions))
        
        predictions = np.array(predictions).T
        class_preds = np.argmax(predictions, axis=1)
        return self.classes[class_preds]

In [128]:

ovr = OneVsRestClassifier(LogisticRegression, lr=0.1, n_iters=1000)
ovr.fit(X_train, y_train)
y_pred = ovr.predict(X_test)

print("Predicted labels for test data:", y_pred)


Predicted labels for test data: [0 3 1 2 3 3 3 2 0 1 3 2 0 2 2 3 3 1 3 0 1 3 1 3 3 1 3 3 3 0 2 0 0 3 1 0 3
 2 3 2 2 3 2 3 2 2 3 3 2 0 2 2 3 2 3 1 0 3 2 2 0 3 3 2 2 0 2 2 0 0 1 0 3 2
 3 2 0 3 0 2 3 3 3 3 2 3 2 3 1 2 0 2 0 0 2 3 1 0 3 2 3 3 3 3 1 1 1 3 2 0 0
 3 3 3 2 2 3 1 3 2 1 2 2 2 2 0 0 2 1 2 1 2 2 0 0 3 3 3 1 1 3 1 2 1 2 1 2 2
 3 0 3 1 2 1 3 2 2 1 0 0 1 2 2 2 1 3 1 2 1 2 1 3 3 2 3 2 3 3 0 3 2 3 2 2 0
 3 0 3 1 2 2 1 0 2 1 3 0 1 0 0 3 3 0 2 1 2 3 3 2 0 2 2 0 3 0 2 2 3 2 2 3 0
 1 3 1 3 1 2 1 2 0 3 0 1 2 1 0 2 2 3 1 3 0 0 3 3 3 3 3 1 3 1 0 3 0 2 0 3 2
 0 2 3 2 0 3 3 1 3 1 0 1 1 3 0 0 2 1 1 3 3 2 3 3 3 0 2 3 0 3 2 1 2 1 0 1 2
 2 1 2 3 3 2 3 0 1 3 2 1 3 0 0 0 2 2 2 1 3 3 0 3]


In [129]:
acc = accuracy(y_pred, y_test)
acc

np.float64(0.9875)

In [130]:
features = df[['Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts', 'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 'Potions','Care of Magical Creatures', 'Charms', 'Flying']]

features.head(1)

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89


In [131]:
new_y_pred = ovr.predict(features.head(1))
print(new_y_pred)

[2]


In [132]:
df.head(1)

Unnamed: 0,Index,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying,big_y
0,0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89,2
