In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Load the raw observations table (tab-separated) used by the rest of the notebook.
OBSERVATION_PATH = "./094/observation.csv"
observation = pd.read_csv(OBSERVATION_PATH, sep="\t", engine="python")

## 2.1 A

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column from the predictors; `.values` turns both into
# plain NumPy arrays, so column labels are not carried into the split.
target_column = 'oximetry'
y = observation[target_column].values
x = observation.drop(columns=[target_column], axis=1).values

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Here we divided our dataset into training and testing sets

## 2.1 B

In [None]:
import pandas as pd

# Wrap the training matrix in a DataFrame so pandas checks (dtypes, nulls,
# duplicates) can be run on it. No column names are passed, so the columns
# get the default integer labels.
X_train = pd.DataFrame(data=X_train)

### Check types

In [None]:
# Display the dtype of every column of the training frame.
X_train.dtypes

### Check nulls

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing values with the per-column median, but only when there is
# actually something to fill.
if X_train.isnull().to_numpy().any():
    imputer = SimpleImputer(strategy='median')
    # fit_transform returns a bare ndarray; re-wrap it with the original labels
    # so X_train stays a DataFrame (the duplicate check and later cells call
    # DataFrame methods such as .duplicated() and .columns on it).
    # NOTE(review): assumes no column is entirely NaN - SimpleImputer drops such
    # columns by default, which would make the column labels mismatch.
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )

If there are missing values here, we will replace them with the median 

### Check duplicates

In [None]:
# Drop repeated feature rows (keeping the first occurrence) and keep y_train
# aligned with the rows that survive.
X_train = pd.DataFrame(X_train)  # leaves a DataFrame as-is; guards against a bare ndarray
dup_mask = X_train.duplicated().to_numpy()
if dup_mask.any():
    # y_train is a NumPy array (it was created with `.values`), so it has no
    # `.loc`; select the surviving rows by position with the same boolean mask.
    X_train = X_train.loc[~dup_mask].reset_index(drop=True)
    y_train = np.asarray(y_train)[~dup_mask]

If there are duplicates here, we will remove them

## 2.1 C


### Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max on the training data, then rescale with them.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_mm = scaler.transform(X_train)

X_mm

After applying **MinMaxScaler**, all features were normalized to a range from 0 to 1, providing a uniform scale for all variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean/std on the training data, then standardize with them.
scaler = StandardScaler()
scaler.fit(X_train)
X_std = scaler.transform(X_train)

X_std

After applying **StandardScaler**, the data was standardized: each feature has a mean value of 0 and a standard deviation of 1

In [None]:
# Baseline statistics of the *unscaled* training data, printed for comparison
# with the scaled versions. The headings say explicitly that these are
# pre-scaling values, so the output is not mistaken for the scalers' results.
print("Before StandardScaler (raw training data):")
print("Mean: ", np.mean(X_train, axis=0).round(3))
print("Std: ", np.std(X_train, axis=0).round(3))

print("\nBefore MinMaxScaler (raw training data):")
print("Min: ", np.min(X_train, axis=0).round(3))
print("Max: ", np.max(X_train, axis=0).round(3))

In [None]:
# Sanity-check both scalers by printing per-feature summary statistics of
# their outputs: mean/std for the standardized data, min/max for the
# min-max-scaled data.
scaler_report = (
    ("StandardScaler:", X_std, (("Mean: ", np.mean), ("Std: ", np.std))),
    ("\nMinMaxScaler:", X_mm, (("Min: ", np.min), ("Max: ", np.max))),
)
for heading, scaled, statistics in scaler_report:
    print(heading)
    for label, statistic in statistics:
        print(label, statistic(scaled, axis=0).round(3))

After applying **StandardScaler**, the average value of each feature became close to 0, and the standard deviation became close to 1, confirming that the data was scaled correctly.

After applying **MinMaxScaler**, the minimum values of the features became equal to 0, and the maximum values became equal to 1, also confirming that the normalization worked correctly.

### Transformers

In [None]:
from matplotlib import pyplot

# Baseline: distribution of the column labelled 0 before any transformation.
first_feature = X_train[0]
pyplot.hist(first_feature, bins=10)

Our initial histogram of the distribution of feature values

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform (with standardization), fitted on the training data.
power = PowerTransformer(method='yeo-johnson', standardize=True)
X_pt = power.fit_transform(X_train)

# The transformer output is array-like, where `X_pt[0]` would select the first
# *row* (one sample across all features). Slice the first *column* instead so
# this histogram shows the same feature as the baseline plot.
pyplot.hist(np.asarray(X_pt)[:, 0], bins=10)

After **PowerTransformer**

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Quantile transform mapped to a normal output distribution, fitted on the
# training data. (The variable keeps the name `power` used by the original cell.)
power = QuantileTransformer(output_distribution='normal', random_state=42)
X_qt = power.fit_transform(X_train)

# The transformer output is array-like, where `X_qt[0]` would select the first
# *row* (one sample across all features). Slice the first *column* instead so
# this histogram shows the same feature as the baseline plot.
pyplot.hist(np.asarray(X_qt)[:, 0], bins=10)

After **QuantileTransformer**

In [None]:
from scipy.stats import skew

# Sample skewness of every training feature, computed column-wise.
skews = skew(X_train, axis=0)

# One row per feature: its column label and its skewness.
skew_df = pd.DataFrame(dict(Feature=X_train.columns, Skewness=skews))

skew_df

The results show that most features have a distribution close to normal, but two features have a skewness value > 1, indicating their strong asymmetry. This means that the distribution of these features is asymmetrical.

## 2.1 D

We divided our dataset into **training** (80%) and **test** (20%) samples

Further work was carried out only with the **training** sample

We compared two approaches - **scaling** and **transformation** - and analyzed the distribution of features (checked for skew) to assess how close the data was to a normal distribution

Most features had a normal distribution, but the presence of some skewed features led us to decide to use **QuantileTransformer** for subsequent data preprocessing

## 2.2 A

### Correlation analysis (linear relationship)

In [None]:
# Take the feature names from the original table and use them to label the
# columns of the quantile-transformed training matrix.
x_1 = observation.drop(columns=['oximetry'], axis=1)
X_qt_df = pd.DataFrame(X_qt, columns=x_1.columns)

# Attach the target so a single correlation matrix covers every feature/target pair.
df_corr = X_qt_df.copy()
df_corr["oximetry"] = y_train

corr_matrix = df_corr.corr(numeric_only=True)

# Order the target's correlations by magnitude while keeping their sign.
target_corr = corr_matrix["oximetry"]
by_strength = target_corr.abs().sort_values(ascending=False).index
corr_sorted = target_corr.reindex(by_strength)
corr_sorted

#### Calculation of correlation between attributes and target variable
Pearson's correlation was calculated to assess the linear relationship between variables.
Before calculation, the input data was transformed with **QuantileTransformer** (X_qt). 
The target variable **oximetry** was temporarily added to the dataset to calculate the correlation coefficients between it and the other attributes. 
The attributes were then sorted by absolute correlation value in descending order, identifying the most significant variables.

In [None]:
# One-column heatmap of the sorted feature/target correlations.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(
    corr_sorted.to_frame(name="Pearson r"),
    annot=True,
    cmap="coolwarm",
    ax=ax,
)
plt.show()

### ANOVA F-test

In [None]:
# Labelled copy of the transformed training features, plus the training target.
X = pd.DataFrame(X_qt, columns=x_1.columns)
y = y_train

# Univariate F-statistic and p-value for each feature against the target.
scores, pvals = f_regression(X, y)

# One row per feature, strongest F-value first.
anova_table = pd.DataFrame(
    {'Feature': X.columns, 'F_value': scores, 'p_value': pvals}
)
anova_results = anova_table.sort_values(
    'F_value', ascending=False, ignore_index=True
)

print(anova_results)

#### Selecting attributes using the ANOVA F-test
The **f_regression** test was used to assess the linear relationship between individual attributes and the target variable **oximetry**.  
For each attribute, the value of the **F-statistic** and the corresponding **p-values** are calculated separately, which show how strongly the attribute is related to the target variable.  
Higher F values and lower p-values mean that the attribute has a greater influence on the target variable.

In [None]:
# Horizontal bars of the F-values; the log x-axis keeps small values visible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=anova_results, x='F_value', y='Feature', ax=ax)
ax.set_xscale('log')
ax.set_title('ANOVA F-test (log)')
plt.show()

### Mutual Information (non-linear dependence)

In [None]:
from functools import partial

# Labelled copy of the transformed training features, plus the training target.
X = pd.DataFrame(X_qt, columns=x_1.columns)
y = y_train

# mutual_info_regression is a randomized estimator; without a fixed seed the
# scores (and therefore the feature ranking) change from run to run. Pin the
# seed to the same value used elsewhere in this notebook.
seeded_mutual_info = partial(mutual_info_regression, random_state=42)

# k='all' keeps every feature: the selector is used only to compute the scores.
selector = SelectKBest(score_func=seeded_mutual_info, k='all')
selector.fit(X, y)

# Ascending order so the strongest feature ends up at the top of the barh plot.
scores = pd.Series(abs(selector.scores_), index=X.columns).sort_values()

plt.figure(figsize=(10,5))
scores.plot(kind='barh')
plt.show()

print(scores.sort_values(ascending=False))

#### Mutual Information Calculation
The **mutual_info_regression** method was used to identify nonlinear relationships between attributes and the target variable **oximetry**. A higher MI value indicates a stronger (not necessarily linear) relationship.  

In [None]:
# Collect the three relevance measures side by side, aligned on feature name.
abs_pearson = corr_sorted.drop('oximetry', errors='ignore').abs().rename('|r|')
anova_f = anova_results.set_index('Feature')['F_value'].rename('ANOVA_F')
mutual_info = scores.rename('Mutual_Info')
cmp_simple = pd.concat([abs_pearson, anova_f, mutual_info], axis=1)

# Min-max scale every column so the measures share a common range.
column_min = cmp_simple.min()
column_max = cmp_simple.max()
norm = (cmp_simple - column_min) / (column_max - column_min)

# Average the normalised measures per feature and rank by that average.
norm_with_mean = norm.assign(Mean_Score=norm.mean(axis=1))
final_display = norm_with_mean.sort_values('Mean_Score', ascending=False)

display(cmp_simple.sort_values('|r|', ascending=False))
display(final_display.style.format('{:.6f}'))

#### Normalisation of values
Since individual metrics (Pearson, ANOVA, MI) have different ranges of values,
they were normalised using **minimum-maximum scaling** to the interval ⟨0, 1⟩.

## 2.2 B

### Ranking of identified attributes by importance

All analysed attributes were ranked by their combined **Mean_Score**,
which was obtained as the average of the normalised values of three different methods:
Pearson |r|, ANOVA F, and Mutual Information.
This made it possible to determine the relative importance of each attribute
in relation to the target variable **oximetry**.

The table above shows the complete ranking of all attributes,
where a higher **Mean_Score** value indicates greater attribute significance.
For better visualisation, the 5 attributes with the highest scores are shown separately below.

In [None]:
# Five highest-ranked attributes, shown as a formatted table.
ranked_f = final_display.sort_values('Mean_Score', ascending=False).iloc[:5].copy()
display(ranked_f.style.format('{:.6f}'))

# reset_index() moves the feature labels into a regular column; the plot below
# addresses that column as 'index'.
topk = final_display.reset_index().iloc[:5]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=topk, x='Mean_Score', y='index', ax=ax)
plt.show()

## 2.2 C
### Justification of decisions during implementation

When solving the problem, we used three approaches to assess the importance of attributes:
**Pearson's correlation**, **ANOVA F-test**, and **Mutual Information**.  
These three methods provide different insights into how the input variables are related to the target variable *oximetry*:

**Pearson's correlation** reflects the linear relationship between the attribute and the target.  

**ANOVA F-test** (here scikit-learn's `f_regression`) tests, for each attribute separately, whether its linear relationship with the target variable is statistically significant.

**Mutual information** shows how much information one attribute provides about another, i.e. it also reveals non-linear relationships.

By combining these three methods, I was able to compare linear and nonlinear relationships
and get a better idea of which attributes are most important for the task.

I then **normalised the results using min–max transformation** to the interval ⟨0, 1⟩,
so that values from different methods with different ranges could be compared.  
From these normalised values, I calculated the **Mean Score**,
which shows the overall importance of each attribute.

This approach helped me compare the results from several points of view,
eliminate differences in scales, and obtain a clear order of attributes depending on their influence on the *oximetry* variable.

## 2.3 A 

In [None]:
# Fit the median imputer on the training data only, then apply it to both splits.
num_imputer = SimpleImputer(strategy="median").fit(X_train)
X_train_imp, X_test_imp = (
    num_imputer.transform(split) for split in (X_train, X_test)
)

# Same pattern for the quantile transformer: fit on the imputed training data,
# then transform both splits with the fitted object.
power = QuantileTransformer(output_distribution='normal', random_state=42).fit(X_train_imp)
X_train_pt, X_test_pt = (
    power.transform(split) for split in (X_train_imp, X_test_imp)
)

## 2.3 B


In [None]:
# Chain imputation, quantile transformation and Ridge regression so that all
# preprocessing is fitted together with the model on the training data.
model = make_pipeline(
    SimpleImputer(strategy="median"),
    QuantileTransformer(output_distribution='normal', random_state=42),
    Ridge(random_state=42),
)
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split.
r2 = model.score(X_test, y_test)
print(f"R^2 test: {r2:.4f}")

# Absolute and squared error on the same test predictions.
y_pred_all = model.predict(X_test)
for label, metric in (("MAE:", mean_absolute_error), ("MSE:", mean_squared_error)):
    print(label, metric(y_test, y_pred_all))


The model achieved R² = 0.4719, which means that it explains about 47% of the variability in the target variable of oximetry.
It is not perfect, but it shows that the model reflects a significant portion of the relationships in the data.
The average error MAE = 0.288 means that the predictions differ from the actual values by an average of approximately 0.29,
which is acceptable for health data.
MSE = 0.126 confirms that most predictions are relatively close to the correct values.
Overall, the model predicts fairly well, and the result can be considered a moderately good basis for further improvement.