In [27]:
import numpy as np
import pandas as pd
import networkx as nx
import os
from collections import Counter
import random

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


In [24]:
# Location of the Hagelloch CSV, given relative to the current working directory.
raw_file = "./Hagelloch_df.csv"

In [15]:
def read_file(raw_file):
    """Load a CSV into a pandas DataFrame.

    Parameters
    ----------
    raw_file : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The table parsed with ``read_csv``'s default options.
    """
    return pd.read_csv(raw_file)

Function for discrete variables: encode the categories as integer codes, then min-max normalize the result.

In [16]:
def discrete(df, column_name):
    """Turn one discrete column of ``df`` into a flat list of numbers.

    Two encodings are used:

    * ``column_name == "C"``: a 0/1 indicator that is 1 exactly where the cell
      equals the string ``"no complicatons"``. No scaling is applied.
    * any other column: missing cells (if any) are replaced by ``0`` so they
      form a category of their own, the values are factorized into integer
      codes in order of first appearance, and the codes are min-max scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``column_name``.
    column_name : str
        Column to encode.

    Returns
    -------
    list
        One number per row of ``df``.
    """
    column = df[column_name]

    # NOTE(review): the literal is spelled "no complicatons"; presumably it
    # mirrors the spelling used in the data file -- verify before "fixing" it.
    if column_name == "C":
        return column.eq("no complicatons").astype(int).tolist()

    # Only rewrite the column when it actually has gaps.
    if column.isna().any():
        column = column.fillna(0)

    codes, _ = pd.factorize(column)
    scaled = MinMaxScaler().fit_transform(codes.reshape(-1, 1))
    return scaled.flatten().tolist()

Function for continuous variables: standardize the values to zero mean and unit variance.

In [17]:
def countinues(df, column_name, return_scaler=False):
    """Standardize one continuous column of ``df`` with ``StandardScaler``.

    Missing-value handling:

    * ``"IFTO"`` with missing cells: each gap is filled with the most frequent
      value inside the row's ``"NAME"`` group. A group whose values are all
      missing is left untouched.
    * every other column is passed on as-is. Previously a column with missing
      cells that was not ``"IFTO"`` left the data unassigned and the function
      crashed with ``UnboundLocalError``.

    Cells that are still missing are handed to the scaler unchanged.
    NOTE(review): scikit-learn documents that ``StandardScaler`` ignores NaN
    when fitting and keeps it when transforming -- confirm for the installed
    version, and impute beforehand if a downstream estimator rejects NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``column_name`` (and ``"NAME"`` when imputing IFTO).
    column_name : str
        Column to standardize.
    return_scaler : bool, default False
        When true, also return the fitted scaler.

    Returns
    -------
    list or (list, StandardScaler)
        The scaled values, one per row; plus the fitted scaler on request.
    """
    def _fill_with_group_mode(group):
        # mode() skips NaN, so it is empty only when the whole group is missing.
        modes = group.mode()
        if modes.empty:
            return group
        return group.fillna(modes.iloc[0])

    column = df[column_name]
    if column_name == "IFTO" and column.isna().any():
        column = df.groupby("NAME")[column_name].transform(_fill_with_group_mode)

    # The scaler expects a 2-D array with a single feature column.
    data_array = np.array(column.tolist()).reshape(-1, 1)
    scaler = StandardScaler()
    scaled_lst = scaler.fit_transform(data_array).flatten().tolist()
    if return_scaler:
        return scaled_lst, scaler
    return scaled_lst

Collate all the features and return an ndarray

In [18]:
def abstract_data(df):
    """Assemble the feature matrix from the raw table.

    Each selected column is encoded by ``discrete`` or ``countinues`` and the
    results are stacked column-wise in this order:
    HN, AGE, SEX, CL, SI, C, CA, NI, GE.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing the columns listed above.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``df`` and nine feature columns.
    """
    # Candidate columns: FN, HN, AGE, SEX, CL, IFTO, SI, C, CA, NI, GE.
    # FN and IFTO are currently left out; TD and TM are not handled at all.
    feature_plan = (
        ("HN", discrete),
        ("AGE", countinues),
        ("SEX", discrete),
        ("CL", discrete),
        ("SI", countinues),
        ("C", discrete),
        ("CA", countinues),
        ("NI", countinues),
        ("GE", countinues),
    )

    encoded_columns = [encode(df, name) for name, encode in feature_plan]

    # One list per feature -> transpose so that rows are observations.
    clean_data = np.array(encoded_columns).T
    return clean_data

In [19]:
def abstract_label(df):
    """Build the two-column regression target and keep the fitted scalers.

    ``tPRO`` and ``tERU`` are each standardized by ``countinues``; the fitted
    scalers are returned as well so they remain available to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing ``tPRO`` and ``tERU``.

    Returns
    -------
    (numpy.ndarray, list)
        The target array with columns ``[tPRO, tERU]`` (one row per row of
        ``df``) and the list ``[tPRO_scaler, tERU_scaler]``.
    """
    scaled_targets = []
    fitted_scalers = []
    for target in ("tPRO", "tERU"):
        values, fitted = countinues(df, target, return_scaler=True)
        scaled_targets.append(values)
        fitted_scalers.append(fitted)

    # One list per target -> transpose so that rows are observations.
    label = np.array(scaled_targets).T
    return label, fitted_scalers

In [25]:
## Raw table -- read once; the feature and label extractors only read from it.
df = read_file(raw_file)

## Data
data = abstract_data(df)

## Label
label, scaler_lst = abstract_label(df)

## Hold out 20% of the rows for testing; the fixed seed keeps the split stable across runs.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42)

Compute MSE, RMSE, MAE, and MedAE for the random forest

In [45]:
# Seed the forest as well as the split; without a seed every run grows
# different trees and the metrics below change from run to run.
# Metrics recorded from an earlier unseeded run will not match exactly.
model = RandomForestRegressor(n_estimators=10, random_state=42)
model.fit(X_train, y_train)

# Errors are measured on the scaled targets, not in the original units.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # square root of the MSE computed above
mae = mean_absolute_error(y_test, predictions)
medae = median_absolute_error(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")

Mean Squared Error: 0.5557117660120794
Root Mean Squared Error: 0.7454607742947172
Mean Absolute Error: 0.5616729417946513
Median Absolute Error: 0.45570338815729255


Compute MSE, RMSE, MAE, and MedAE for linear regression

In [32]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline fitted on both target columns at once.
model = LinearRegression()

model.fit(X_train, y_train)

# Score on the held-out rows; results are kept under *_lr names.
predictions_lr = model.predict(X_test)
mse_lr = mean_squared_error(y_test, predictions_lr)
rmse_lr = np.sqrt(mse_lr)  # square root of the MSE computed above
mae_lr = mean_absolute_error(y_test, predictions_lr)
medae_lr = median_absolute_error(y_test, predictions_lr)

print(f"Mean Squared Error: {mse_lr}")
print(f"Root Mean Squared Error: {rmse_lr}")
print(f"Mean Absolute Error: {mae_lr}")
print(f"Median Absolute Error: {medae_lr}")

Mean Squared Error: 0.7191856187421316
Root Mean Squared Error: 0.8480481228928767
Mean Absolute Error: 0.6806627806671744
Median Absolute Error: 0.5675477614442709


Compute MSE, RMSE, MAE, and MedAE for support vector regression (SVR)

In [35]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

# The RBF-kernel SVR is wrapped in MultiOutputRegressor to fit the two-column target.
svr = SVR(kernel='rbf')
model = MultiOutputRegressor(svr)

model.fit(X_train, y_train)

# Score on the held-out rows; results are kept under *_svr names.
predictions_svr = model.predict(X_test)
mse_svr = mean_squared_error(y_test, predictions_svr)
rmse_svr = np.sqrt(mse_svr)  # square root of the MSE computed above
mae_svr = mean_absolute_error(y_test, predictions_svr)
medae_svr = median_absolute_error(y_test, predictions_svr)

print(f"Mean Squared Error: {mse_svr}")
print(f"Root Mean Squared Error: {rmse_svr}")
print(f"Mean Absolute Error: {mae_svr}")
print(f"Median Absolute Error: {medae_svr}")

Mean Squared Error: 0.8103374215685414
Root Mean Squared Error: 0.9001874369088592
Mean Absolute Error: 0.6924446131986255
Median Absolute Error: 0.49517184689761823


Compute MSE, RMSE, MAE, and MedAE for the MLP

In [38]:
from sklearn.neural_network import MLPRegressor
# One hidden layer of 50 units. The seed fixes the random initialisation,
# so rerunning the cell reproduces the same fit and metrics.
# Metrics recorded from an earlier unseeded run will not match exactly.
model = MLPRegressor(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)

model.fit(X_train, y_train)

# Score on the held-out rows; results are kept under *_mlp names.
predictions_mlp = model.predict(X_test)
mse_mlp = mean_squared_error(y_test, predictions_mlp)
rmse_mlp = np.sqrt(mse_mlp)  # square root of the MSE computed above
mae_mlp = mean_absolute_error(y_test, predictions_mlp)
medae_mlp = median_absolute_error(y_test, predictions_mlp)

print(f"Mean Squared Error: {mse_mlp}")
print(f"Root Mean Squared Error: {rmse_mlp}")
print(f"Mean Absolute Error: {mae_mlp}")
print(f"Median Absolute Error: {medae_mlp}")

Mean Squared Error: 0.8771372202423775
Root Mean Squared Error: 0.9365560422325925
Mean Absolute Error: 0.7583445391932999
Median Absolute Error: 0.7394914371385382




Compute MSE, RMSE, MAE, and MedAE for gradient boosting

In [41]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
# Seeded so that rerunning the cell reproduces the same metrics.
# Metrics recorded from an earlier unseeded run may not match exactly.
GBR = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, random_state=42)

# The regressor is wrapped in MultiOutputRegressor to fit the two-column target.
model = MultiOutputRegressor(GBR)

model.fit(X_train, y_train)

# Score on the held-out rows; results are kept under *_gbr names.
predictions_gbr = model.predict(X_test)
mse_gbr = mean_squared_error(y_test, predictions_gbr)
rmse_gbr = np.sqrt(mse_gbr)  # square root of the MSE computed above
mae_gbr = mean_absolute_error(y_test, predictions_gbr)
medae_gbr = median_absolute_error(y_test, predictions_gbr)

print(f"Mean Squared Error: {mse_gbr}")
print(f"Root Mean Squared Error: {rmse_gbr}")
print(f"Mean Absolute Error: {mae_gbr}")
print(f"Median Absolute Error: {medae_gbr}")

Mean Squared Error: 0.5779350709230859
Root Mean Squared Error: 0.7602204094360305
Mean Absolute Error: 0.5937202484746504
Median Absolute Error: 0.45404024528870063
