In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the tab-separated observation table using pandas' Python parser engine.
observation = pd.read_csv(
    "./094/observation.csv",
    engine="python",
    sep="\t",
)

## 2.1 A

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Predictors are every column except the target; both are taken as plain
# NumPy arrays, so column names and the row index are not carried along.
x = observation.drop(columns=['oximetry']).values
y = observation['oximetry'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Here we divided our dataset into training and testing sets

## 2.1 B

In [None]:
import pandas as pd

# Wrap the training feature array in a DataFrame so the pandas-based checks
# in the following cells (dtypes, nulls, duplicates) can be applied to it.
# No column names are passed, so the columns get default integer labels.
X_train = pd.DataFrame(X_train)

### Check types

In [None]:
# Display the dtype of every training feature column.
X_train.dtypes

### Check nulls

In [None]:
from sklearn.impute import SimpleImputer

# Impute only when at least one value is missing in the training features.
if X_train.isnull().sum().sum() > 0:
    imputer = SimpleImputer(strategy='median')
    # fit_transform returns a bare NumPy array. Re-wrap it with the original
    # labels so X_train stays a DataFrame: the cells below call
    # X_train.duplicated(), X_train.columns and X_train[0] on it.
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )

If there are missing values here, we will replace them with the median 

### Check duplicates

In [None]:
# Boolean flag per training row: True for every repeat of an earlier row.
dup_mask = X_train.duplicated()

if dup_mask.any():
    # y_train is a NumPy array (built with .values), so it has no .loc.
    # Filter features and target with the same positional mask to keep the
    # two aligned row-for-row.
    keep = ~dup_mask.to_numpy()
    X_train = X_train.loc[keep]
    y_train = np.asarray(y_train)[keep]

If there are duplicates here, we will remove them

## 2.1 C


### Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn per-feature minimum and maximum on the training data, then rescale
# the same data with them.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_mm = scaler.transform(X_train)

X_mm

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean and standard deviation on the training data, then
# standardise the same data with them.
scaler = StandardScaler()
scaler.fit(X_train)
X_std = scaler.transform(X_train)

X_std

In [None]:
# Baseline statistics of the *unscaled* training features. Each heading names
# the scaler whose output these statistics are compared against in the next cell.
print("StandardScaler:")
for stat_label, stat_func in (("Mean: ", np.mean), ("Std: ", np.std)):
    print(stat_label, stat_func(X_train, axis=0).round(3))

print("\nMinMaxScaler:")
for stat_label, stat_func in (("Min: ", np.min), ("Max: ", np.max)):
    print(stat_label, stat_func(X_train, axis=0).round(3))

In [None]:
# Per-feature mean and standard deviation of the standardised data.
print("StandardScaler:")
for stat_label, stat_func in (("Mean: ", np.mean), ("Std: ", np.std)):
    print(stat_label, stat_func(X_std, axis=0).round(3))

# Per-feature minimum and maximum of the min-max scaled data.
print("\nMinMaxScaler:")
for stat_label, stat_func in (("Min: ", np.min), ("Max: ", np.max)):
    print(stat_label, stat_func(X_mm, axis=0).round(3))

After applying **StandardScaler**, the average value of each feature became close to 0, and the standard deviation became close to 1, confirming that the data was scaled correctly.

After applying **MinMaxScaler**, the minimum values of the features became equal to 0, and the maximum values became equal to 1, also confirming that the normalization worked correctly.

### Transformers

In [None]:
from matplotlib import pyplot

# Histogram (10 bins) of the training feature column labelled 0, before any
# power/quantile transformation is applied.
pyplot.hist(X_train[0], bins=10)

Our initial histogram of the distribution of feature values

In [None]:
from sklearn.preprocessing import PowerTransformer

# Fit a Yeo-Johnson power transform (with standardisation) on the training features.
power = PowerTransformer(method='yeo-johnson', standardize=True)
X_pt = power.fit_transform(X_train)

# X_pt is a 2-D NumPy array, so X_pt[0] would be the first *row* (a single
# sample across all features). Slice the first column instead, so this
# histogram shows the same feature as the histogram of X_train[0].
pyplot.hist(X_pt[:, 0], bins=10)

After **PowerTransformer**

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every training feature onto a normal output distribution; the seed
# fixes the transformer's internal subsampling.
power = QuantileTransformer(output_distribution='normal', random_state=42)
X_qt = power.fit_transform(X_train)

# X_qt is a 2-D NumPy array, so X_qt[0] would be the first *row* (a single
# sample across all features). Slice the first column instead, so this
# histogram shows the same feature as the histogram of X_train[0].
pyplot.hist(X_qt[:, 0], bins=10)

After **QuantileTransformer**

In [None]:
from scipy.stats import skew

# Sample skewness of every training feature (computed down the rows).
skews = skew(X_train, axis=0)

# Two-column table: the feature label next to its skewness.
skew_df = (
    pd.Series(skews, index=X_train.columns, name='Skewness')
    .rename_axis('Feature')
    .reset_index()
)

skew_df

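Here we compute the skewness of each training feature to check how far its distribution departs from a symmetric, normal-like shape (values close to 0 indicate little skew)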

## 2.1 D

We divided our dataset into **training** (80%) and **test** (20%) samples

Further work was carried out only with the **training** sample

We compared two approaches - **scaling** and **transformation** - and analyzed the distribution of features (checked for skew) to assess how close the data was to a normal distribution

Most features were approximately normally distributed, but some were noticeably skewed, so we decided to use **QuantileTransformer** for subsequent data preprocessing

## 2.2 A

### Корреляционный анализ (линейная связь)

In [None]:
# Recover the predictor column names from the original table.
x_1 = observation.drop(columns=['oximetry'])

# Quantile-transformed training features with their names, plus the target
# appended as an extra column.
X_qt_df = pd.DataFrame(X_qt, columns=x_1.columns)
df_corr = X_qt_df.assign(oximetry=y_train)

corr_matrix = df_corr.corr(numeric_only=True)

# Correlation of every column with the target, ordered by absolute strength
# while keeping the sign of each coefficient.
target_corr = corr_matrix["oximetry"]
strength_order = target_corr.abs().sort_values(ascending=False).index
corr_sorted = target_corr.reindex(strength_order)
corr_sorted

In [None]:
# One-column heatmap of the sorted correlations with the target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(
    corr_sorted.to_frame(name="Pearson r"),
    annot=True,
    cmap="coolwarm",
    ax=ax,
)
plt.show()

### ANOVA F-test (univariate F-test for regression, `f_regression`)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score every quantile-transformed feature against the target with a
# univariate F-test. k='all' keeps all features, so the selector is used
# for its scores and p-values rather than for filtering.
X = pd.DataFrame(data=X_qt, columns=x_1.columns)
y = y_train

selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X, y)
X_new = selector.transform(X)

idx = selector.get_support(indices=True)
selected_features = X.columns[idx]

# One row per feature, strongest F statistic first.
anova_results = pd.DataFrame({
    'Feature': selected_features,
    'F_value': selector.scores_[idx],
    'p_value': selector.pvalues_[idx],
})
anova_results = anova_results.sort_values(by='F_value', ascending=False)
anova_results = anova_results.reset_index(drop=True)

print("Shape после отбора:", X_new.shape)
print("Выбранные признаки:", list(selected_features))
print(anova_results)

In [None]:
# Horizontal bars of the F statistics; the log axis keeps small scores
# visible next to the dominant ones.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=anova_results, x='F_value', y='Feature', ax=ax)
ax.set_xscale('log')
ax.set_title('ANOVA F-test (log scale)')
plt.show()

### Mutual Information (нелинейная зависимость)

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import matplotlib.pyplot as plt

X = pd.DataFrame(X_qt, columns=x_1.columns)
y = y_train

# mutual_info_regression is a randomized estimator. Pin its random_state so
# the scores, and therefore the feature ranking built from them, are
# identical on every run, like the other seeded steps in this notebook.
mi_score_func = partial(mutual_info_regression, random_state=42)

# k='all' keeps every feature; the selector is used only to collect scores.
selector = SelectKBest(score_func=mi_score_func, k='all')
selector.fit(X, y)

# Ascending order so the strongest feature ends up at the top of the barh plot.
scores = pd.Series(abs(selector.scores_), index=X.columns).sort_values()

plt.figure(figsize=(10,5))
scores.plot(kind='barh')
plt.show()

print(scores.sort_values(ascending=False))

In [None]:
# One named Series per method, each indexed by feature name.
corr_part = corr_sorted.drop(labels=['oximetry'], errors='ignore')
corr_part = corr_part.abs().rename('Pearson_|r|')
anova_part = anova_results.set_index('Feature')['F_value'].rename('ANOVA_F')
mi_part = scores.rename('Mutual_Info')

# Every feature that appears in at least one of the three results.
all_idx = corr_part.index.union(anova_part.index)
all_idx = all_idx.union(mi_part.index)

# Side-by-side table of the three scores; the target itself is excluded.
compare_simple = pd.concat([corr_part, anova_part, mi_part], axis=1)
compare_simple = compare_simple.reindex(all_idx)
compare_simple = compare_simple.drop(index='oximetry', errors='ignore')
display(compare_simple.sort_values('Pearson_|r|', ascending=False))


def safe_minmax(series):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum to 1.

    When the minimum and maximum are equal there is no range to scale by, so
    a Series of 0.0 with the same index is returned instead of dividing by zero.
    """
    lower = series.min()
    upper = series.max()
    if upper != lower:
        return (series - lower) / (upper - lower)
    return pd.Series(0.0, index=series.index)

# Bring the three scores onto a common 0-1 scale, one column at a time.
norm = compare_simple.apply(safe_minmax)

# Overall informativeness: the row-wise mean of the normalised scores.
norm = norm.assign(Mean_Score=norm.mean(axis=1))

final_display = norm.sort_values(by='Mean_Score', ascending=False)

final_display.style.format('{:.6f}')

## 2.2 B

In [None]:
# The five features with the highest mean normalised score.
ranked_features = final_display.sort_values('Mean_Score', ascending=False).head(5).copy()
display(ranked_features.style.format('{:.6f}'))

# Build the plotting frame from the same ranking shown in the table above,
# and name the index explicitly: a bare reset_index() only produces a column
# called 'index' while the index happens to be unnamed.
topk = ranked_features.rename_axis('Feature').reset_index()

plt.figure(figsize=(10,6))
sns.barplot(data=topk, x='Mean_Score', y='Feature')
plt.title('Top 5')
plt.show()

## 2.2 C
### (C-1b) Zdôvodnenie rozhodnutí pri realizácii

Pri realizácii úlohy som sa rozhodol využiť tri komplementárne prístupy na hodnotenie informatívnosti atribútov:
**Pearsonova korelácia**, **ANOVA F-test** a **Mutual Information**.  
Tieto techniky reprezentujú tri rôzne pohľady na vzťah medzi vstupnými premennými a cieľovou premennou *oximetry*:

- **Pearsonova korelácia** umožňuje zachytiť **lineárnu závislosť** medzi atribútom a cieľovou premennou.
- **ANOVA F-test** zisťuje, či existujú **štatisticky významné rozdiely v priemeroch** medzi skupinami hodnôt a hodnotí lineárnu relevantnosť premenných.
- **Mutual Information** je **nelineárna metóda**, ktorá kvantifikuje množstvo informácie, ktorú jeden atribút poskytuje o inom, a teda odhaľuje aj zložitejšie vzťahy.

Výber týchto troch metód umožnil porovnať lineárne aj nelineárne súvislosti a získať
robustnejší prehľad o dôležitosti atribútov.

Všetky výsledky boli následne **normalizované pomocou min–max transformácie** do intervalu ⟨0, 1⟩,  
aby bolo možné spravodlivo porovnávať hodnoty z rôznych metód, ktoré majú odlišné jednotky a rozsahy.
Z týchto normalizovaných hodnôt bol vypočítaný **priemerný index informatívnosti (Mean Score)**,
ktorý vyjadruje celkovú dôležitosť atribútu naprieč metódami.

Týmto postupom bolo zabezpečené:
- porovnateľnosť výsledkov z rôznych štatistických metód,
- odstránenie vplyvu rozdielnych mierok hodnôt (napr. F-hodnoty vs. korelácia),
- transparentný a reprodukovateľný spôsob zoradenia atribútov podľa dôležitosti.

Takto spracované výsledky umožňujú jednoznačne určiť,
ktoré atribúty majú najväčší vplyv na predikciu *oximetry*
a sú vhodné pre ďalšiu tvorbu predikčných modelov.


## 2.3 A 

In [None]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer

# Both steps are fitted on the training features only; the test features are
# then transformed with the statistics learned from the training data.
num_imputer = SimpleImputer(strategy="median")
X_train_imp = num_imputer.fit_transform(X_train)
X_test_imp = num_imputer.transform(X_test)

power = QuantileTransformer(output_distribution='normal', random_state=42)
X_train_pt = power.fit_transform(X_train_imp)
X_test_pt = power.transform(X_test_imp)

## 2.3 B


In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge

# Median imputation -> quantile transform to a normal shape -> ridge regression,
# chained so the preprocessing is fitted together with the model.
pipeline_steps = [
    SimpleImputer(strategy="median"),
    QuantileTransformer(output_distribution='normal', random_state=42),
    Ridge(alpha=1.0, random_state=42),
]
model = make_pipeline(*pipeline_steps)

model.fit(X_train, y_train)

# Score on the held-out test split.
r2 = model.score(X_test, y_test)
print(f"R^2 на тесте: {r2:.4f}")

# Predictions for the first five test rows as a quick sanity check.
y_pred = model.predict(X_test[:5])
print("Пример предсказаний:", y_pred)
