In [11]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(42)
# Load the obesity dataset; the relative path means the CSV must sit in the working directory.
data = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
# Summary statistics of the numeric columns; left as the last expression so it renders as the cell output.
data.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [12]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
# First five rows as plain text.
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [13]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Nominal (unordered) categoricals: fill gaps with the most frequent value,
# then expand every category into its own indicator column.
nominal_cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(),
)

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class DynamicOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal encoder wrapper whose ``categories`` can be changed through ``set_params``.

    All encoding work is delegated to an inner ``OrdinalEncoder`` stored in
    ``self.encoder``. That inner encoder is rebuilt from the current
    ``categories`` whenever the parameter is changed and on every ``fit``.

    Parameters
    ----------
    categories : 'auto' or list of lists, default='auto'
        Forwarded unchanged to ``OrdinalEncoder(categories=...)``.
    """

    def __init__(self, categories='auto'):
        self.categories = categories
        # Kept so that ``encoder`` always exists; fit() replaces it with a fresh one.
        self.encoder = OrdinalEncoder(categories=self.categories)

    def fit(self, X, y=None):
        """Fit a fresh inner encoder on ``X`` (``y`` is ignored) and return ``self``."""
        self.encoder = OrdinalEncoder(categories=self.categories)
        self.encoder.fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` with the inner encoder.

        Required by ``TransformerMixin.fit_transform`` and by pipelines; without
        it the class cannot be used as a transformer at all.
        """
        return self.encoder.transform(X)

    def inverse_transform(self, X):
        """Map encoded values back to the original categories via the inner encoder."""
        return self.encoder.inverse_transform(X)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names as reported by the inner encoder."""
        return self.encoder.get_feature_names_out(input_features)

    def set_params(self, **params):
        """Set parameters, rebuilding the inner encoder when ``categories`` changes."""
        if 'categories' in params:
            self.categories = params['categories']
            self.encoder = OrdinalEncoder(categories=self.categories)
        return super().set_params(**params)

# Both ordinal columns share the same ordered scale, from least to most frequent.
frequency_levels = ['no', 'Sometimes', 'Frequently', 'Always']

# Ordinal categoricals: impute with the mode, map the levels to 0..3, then standardize.
ordinal_cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(categories=[frequency_levels, frequency_levels]),
    StandardScaler(),
)

# Numeric columns: impute with the median, then standardize.
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

In [15]:
from sklearn.compose import ColumnTransformer, make_column_selector
from copy import deepcopy

# Features are every column except the target; the target is the obesity class label.
X = data.drop('NObeyesdad', axis=1)
y = data['NObeyesdad']

# Column groups routed to the three preprocessing pipelines.
nominal_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']
ordinal_columns = ['CAEC', 'CALC']
numeric_columns = make_column_selector(dtype_exclude='object')

preprocessing = ColumnTransformer([
    ('one_hot', nominal_cat_pipe, nominal_columns),
    ('ordinal', ordinal_cat_pipe, ordinal_columns),
    ('num', num_pipe, numeric_columns),
])

# Encode the string class labels as integers and show the first ten of them.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
for encoded_label in y[:10]:
    print(encoded_label)

1
1
1
5
6
1
1
1
1
1


In [16]:
# Rebuild the raw feature frame from `data` instead of transforming `X` in place,
# so this cell can be re-run: feeding the already-transformed X back into the
# ColumnTransformer would fail because the raw column names are gone.
X_raw = data.drop('NObeyesdad', axis=1)
# NOTE(review): the transformer is fitted on all rows before the train/test split,
# so imputation and scaling statistics also see the test rows — confirm this is acceptable.
X = pd.DataFrame(
    preprocessing.fit_transform(X_raw),
    columns=preprocessing.get_feature_names_out(),
)
print(X.head())

   one_hot__Gender_Female  one_hot__Gender_Male  \
0                     1.0                   0.0   
1                     1.0                   0.0   
2                     0.0                   1.0   
3                     0.0                   1.0   
4                     0.0                   1.0   

   one_hot__family_history_with_overweight_no  \
0                                         0.0   
1                                         0.0   
2                                         0.0   
3                                         1.0   
4                                         1.0   

   one_hot__family_history_with_overweight_yes  one_hot__FAVC_no  \
0                                          1.0               1.0   
1                                          1.0               1.0   
2                                          1.0               1.0   
3                                          0.0               1.0   
4                                          0.0            

In [17]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss

# Hold out 20% of the preprocessed rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# probability=True lets the SVC expose predict_proba, which the log-loss cell relies on.
svc = SVC(probability=True, random_state=42)
random_forest = RandomForestClassifier(random_state=42)
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the three baseline classifiers on the same training split.
svc.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
log_reg.fit(X_train, y_train)

log_loss sprawdza, jak dobre są przewidziane prawdopodobieństwa. Karze błędne i zbyt pewne siebie predykcje. Im mniejsza wartość, tym lepiej.

In [18]:
# Test-set log loss for each fitted classifier, in the order SVC, random forest, logistic regression.
for fitted_model in (svc, random_forest, log_reg):
    print(log_loss(y_test, fitted_model.predict_proba(X_test)))

0.2344422312180478
0.2839459917183126
0.4082647241118053


cross_val_score (z domyślną metryką dla klasyfikatorów) mierzy dokładność (accuracy), czyli odsetek przewidzianych etykiet zgodnych z prawdziwymi, osobno dla każdego z 5 foldów walidacji krzyżowej.

In [19]:
# Per-fold cross-validation scores for fresh, unfitted copies of the three classifiers.
# NOTE(review): the first fold scores far below the others in the saved output, which may
# point to ordered rows combined with unshuffled folds — worth confirming.
for estimator in (
    SVC(random_state=42),
    RandomForestClassifier(random_state=42),
    LogisticRegression(max_iter=1000, random_state=42),
):
    print(cross_val_score(estimator, X, y))

[0.70685579 0.93838863 0.92890995 0.94312796 0.95971564]
[0.72576832 0.97630332 0.98341232 0.98341232 0.98815166]
[0.70212766 0.91469194 0.89336493 0.91943128 0.91943128]


# Własne implementacje
Regresja liniowa z zamkniętą formułą

In [20]:
from sklearn.metrics import root_mean_squared_error

def closed_form_lin_reg(X, y):
    """Ordinary least-squares fit with an intercept, solved directly (no iteration).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix without a bias column.
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    numpy.ndarray
        Coefficient vector; element 0 is the intercept, the rest follow the
        column order of ``X``.
    """
    X = np.c_[np.ones((X.shape[0], 1)), X]  # adding bias column
    # Solve the least-squares problem directly instead of forming inv(X.T @ X):
    # the explicit inverse is numerically fragile and raises (or returns garbage)
    # when features are collinear, whereas lstsq returns the minimum-norm solution.
    pars, *_ = np.linalg.lstsq(X, y, rcond=None)
    return pars

# Predict Weight from Height and Age using the hand-written solver.
feature_columns = ['Height', 'Age']
X_lin = data[feature_columns].to_numpy()
y_lin = data['Weight'].to_numpy()

lin_split = train_test_split(X_lin, y_lin, test_size=0.2, random_state=42)
X_lin_train, X_lin_test, y_lin_train, y_lin_test = lin_split

pars = closed_form_lin_reg(X_lin_train, y_lin_train)
print(pars)

# The solver returns the intercept first, so the test matrix needs the same leading column of ones.
bias_column = np.ones((X_lin_test.shape[0], 1))
y_lin_pred = np.c_[bias_column, X_lin_test] @ pars

print(root_mean_squared_error(y_lin_test, y_lin_pred))

[-156.62010215  130.68210655    0.85271068]
22.743273289591194


In [21]:
from sklearn.linear_model import LinearRegression

# Reference fit with scikit-learn on the same split, for comparison with the hand-written solver.
lin_reg_sklearn = LinearRegression().fit(X_lin_train, y_lin_train)

y_lin_pred = lin_reg_sklearn.predict(X_lin_test)
print(root_mean_squared_error(y_lin_test, y_lin_pred))

22.74327328959121


Moja implementacja daje praktycznie identyczny RMSE jak implementacja ze sklearn (różnica pojawia się dopiero na ostatnich miejscach po przecinku).

W praktyce nie jest używana, bo rozwiązanie równania normalnego ma zbyt wysoką złożoność obliczeniową przy dużej liczbie cech, a ponadto modeluje tylko zależności liniowe.

Regresja softmax, bo przewidujemy wiele klas

In [22]:
from sklearn.metrics import accuracy_score

def softmax(x):
    """Row-wise softmax of a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged but keeps the exponentials from overflowing.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    weights = np.exp(x - row_max)
    normalizer = np.sum(weights, axis=1, keepdims=True)
    return weights / normalizer

def cross_entropy_multiclass(y_true, y_pred):
    """Mean multiclass cross-entropy between one-hot targets and predicted probabilities.

    ``y_true`` and ``y_pred`` are both (n_samples, n_classes). Predictions are
    clipped into [1e-15, 1 - 1e-15] so the logarithm never receives 0.
    """
    eps = 1e-15
    log_probs = np.log(np.clip(y_pred, eps, 1 - eps))
    per_sample_loss = -np.sum(y_true * log_probs, axis=1)
    return np.mean(per_sample_loss)

def softmax_regression(X, y, n_classes=7, learning_rate=0.01, iters=1000, batch_size=20):
    """Fit a softmax (multinomial logistic) regression with mini-batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Features without a bias column.
    y : array of int, shape (n_samples,)
        Class indices used to build the one-hot targets.
    n_classes : int
        Number of output classes (width of the one-hot targets).
    learning_rate : float
        Step size of each gradient update.
    iters : int
        Number of full passes over the data.
    batch_size : int
        Rows per mini-batch; batches follow the row order, without shuffling.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1, n_classes)
        Weight matrix whose first row holds the bias terms.
    """
    n_samples = X.shape[0]
    design = np.c_[np.ones((n_samples, 1)), X]  # prepend the bias column

    # One-hot encode the integer labels.
    targets = np.zeros((n_samples, n_classes))
    targets[np.arange(n_samples), y] = 1

    weights = np.zeros((design.shape[1], n_classes))
    batch_starts = range(0, n_samples, batch_size)

    for _epoch in range(iters):
        for start in batch_starts:
            stop = start + batch_size
            batch_inputs = design[start:stop]
            batch_targets = targets[start:stop]

            batch_probs = softmax(batch_inputs @ weights)

            # Gradient of the mean cross-entropy over this batch.
            residual = batch_probs - batch_targets
            step = batch_inputs.T @ residual / batch_inputs.shape[0]
            weights -= learning_rate * step

    return weights

def predict_softmax(X, pars):
    """Predict with a fitted softmax regression.

    ``pars`` is the weight matrix with the bias in its first row, so a column
    of ones is prepended to ``X`` before multiplying.

    Returns a tuple ``(labels, probabilities)``: the arg-max class index per
    row, and the full (n_samples, n_classes) probability matrix.
    """
    with_bias = np.c_[np.ones((X.shape[0], 1)), X]
    class_probs = softmax(with_bias @ pars)
    predicted = np.argmax(class_probs, axis=1)
    return predicted, class_probs

# Train the hand-written softmax regression and score it on the held-out test split.
pars = softmax_regression(X_train, y_train)
y_pred_labels, y_pred_probs = predict_softmax(X_test, pars)

print(accuracy_score(y_test, y_pred_labels))

# One-hot encode the test labels (7 classes) for the cross-entropy computation.
y_test_one_hot = np.eye(7)[y_test]

print(cross_entropy_multiclass(y_test_one_hot, y_pred_probs))

0.8747044917257684
0.41063453476331024


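Moja implementacja działa porównywalnie do LogisticRegression ze sklearn: entropia krzyżowa na zbiorze testowym wynosi 0.411 wobec 0.408 (czyli minimalnie gorzej), przy dokładności 0.875.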

# Validation set

In [23]:
# Carve a validation set (20%) out of the training split; the test split stays untouched.
X_train_n, X_val, y_train_n, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fresh estimators: these names replace the models fitted earlier on the full training split.
svc = SVC(probability=True, random_state=42)
random_forest = RandomForestClassifier(random_state=42)
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit on the reduced training set only.
svc.fit(X_train_n, y_train_n)
random_forest.fit(X_train_n, y_train_n)
log_reg.fit(X_train_n, y_train_n)

In [24]:
# Validation accuracy for SVC, random forest and logistic regression, in that order.
for fitted_model in (svc, random_forest, log_reg):
    print(accuracy_score(y_val, fitted_model.predict(X_val)))

0.8668639053254438
0.9437869822485208
0.8816568047337278


In [25]:
# Test accuracy for SVC, random forest and logistic regression, in that order.
for fitted_model in (svc, random_forest, log_reg):
    print(accuracy_score(y_test, fitted_model.predict(X_test)))

0.91725768321513
0.9408983451536643
0.8723404255319149


# PyTorch

In [26]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class LogisticRegressionModel(nn.Module):
    """Multiclass logistic regression: a single linear layer that outputs raw logits."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # One affine map (weights and bias) from features to per-class scores.
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return unnormalized logits; softmax is left to the loss function."""
        logits = self.linear(x)
        return logits

In [None]:
def train_logistic_regression(X, y, learning_rate=0.1, batch_size=32, n_epochs=100):
    """Train a softmax (multinomial logistic) regression in PyTorch and print its test accuracy.

    The data is split 80/20 with a fixed seed, the model is trained with
    mini-batch SGD on the training part, and accuracy is measured on the
    held-out part.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric features; a NumPy array or a pandas DataFrame.
    y : array-like of shape (n_samples,)
        Integer class labels. Assumed to be 0..k-1 (e.g. LabelEncoder output),
        because the number of outputs is taken from the count of unique labels.
    learning_rate : float
        SGD step size.
    batch_size : int
        Mini-batch size for the training loader.
    n_epochs : int
        Number of passes over the training data.

    Returns
    -------
    LogisticRegressionModel
        The trained model.
    """
    # Coerce to plain NumPy arrays first: torch.tensor() cannot ingest a pandas
    # DataFrame, and the notebook's preprocessed feature matrix is one.
    X = np.asarray(X, dtype=np.float32)
    y = np.asarray(y, dtype=np.int64)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert numpy arrays to PyTorch tensors (float for X, long for y labels)
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    # Wrap tensors in a Dataset and load them in shuffled batches
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Get dimensions
    input_dim = X.shape[1]
    output_dim = len(np.unique(y))

    # Initialize model, loss, and optimizer
    model = LogisticRegressionModel(input_dim, output_dim)
    criterion = nn.CrossEntropyLoss()  # applies softmax internally
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(n_epochs):
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()              # reset gradients to zero
            outputs = model(X_batch)           # forward pass (logits)
            loss = criterion(outputs, y_batch) # compute cross-entropy loss
            loss.backward()                    # compute gradients
            optimizer.step()                   # update weights using gradients

    # Evaluate
    with torch.no_grad():  # disable gradient tracking during inference
        test_outputs = model(X_test_tensor)                 # get logits
        predicted_classes = torch.argmax(test_outputs, 1)   # convert logits to predicted labels
        accuracy = accuracy_score(y_test, predicted_classes.numpy())  # compute accuracy

    print(f"Test Accuracy: {accuracy:.4f}")
    return model
