In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

# All three source tables are tab-separated text files stored in ./094/.
station, patient, observation = (
    pd.read_csv(path, sep='\t', engine="python")
    for path in ("./094/station.csv", "./094/patient.csv", "./094/observation.csv")
)

# Quick sanity check: (rows, columns) of every table.
print(f"patient.shape = {patient.shape}")
print(f"station.shape = {station.shape}")
print(f"observation.shape = {observation.shape}")

### 1.1 Z√°kladn√Ω opis d√°t spolu s ich charakteristikami 

### (A-1b)

In [None]:
# (A-1b) List the attribute names of every table.
for table_name, frame in (
    ("patient", patient),
    ("station", station),
    ("observation", observation),
):
    print(f"{table_name} columns:", frame.columns.tolist())

#### S√∫bor patient.csv
- **Poƒçet z√°znamov:** 2 102  
- **Poƒçet atrib√∫tov:** 13  
- **Typy d√°t:** object = 10, int64 = 2, float64 = 1  
- **Ch√Ωbaj√∫ce hodnoty:** spolu ‚âà 3 993  
  - Najviac ch√Ωba: `residence` (100 %), `job` (70 %), `address` (15 %), `current_location` (5 %).  
- **Charakteristika:** obsahuje demografick√© √∫daje pacientov a odkaz na stanicu (`station_ID`).  
  Tento odkaz sa **nedá priamo spárovať** s názvom v súbore `station.csv`.  

#### S√∫bor station.csv
- **Poƒçet z√°znamov:** 798  
- **Poƒçet atrib√∫tov:** 6  
- **Typy d√°t:** object = 4, float64 = 2  
- **Ch√Ωbaj√∫ce hodnoty:** 0  
- **Charakteristika:** obsahuje inform√°cie o merac√≠ch staniciach ‚Äì `station`, `latitude`, `longitude`, `QoS`, `revision`, `location`.  
- **Pozorovanie:** hodnoty `revision` maj√∫ r√¥zne form√°ty d√°tumov (a ƒças≈• nevieme spr√°vne parsova≈•) ‚Üí potrebn√° normaliz√°cia na jednotn√Ω form√°t `datetime`.

#### S√∫bor observation.csv
- **Poƒçet z√°znamov:** 12 081  
- **Poƒçet atrib√∫tov:** 23  
- **Typy d√°t:** v≈°etky `float64`  
- **Ch√Ωbaj√∫ce hodnoty:** 0  
- **Cieƒæov√° premenn√°:** `oximetry` (bin√°rna 0/1).  
- **D√¥le≈æit√© atrib√∫ty:** `SpO‚ÇÇ`, `HR`, `Skin Temperature`, `BP`, `CO`, `FiO‚ÇÇ`, atƒè.  
- Hodnoty SpO‚ÇÇ s√∫ v rozsahu 95 ‚Äì 100 a Skin Temperature v rozsahu 33 ‚Äì 38 ¬∞C.  
- Tieto d√°ta maj√∫ vhodn√Ω form√°t pre ƒèal≈°ie spracovanie v Python/pandas a na tr√©novanie modelov.


In [None]:
def dtype_counts(df):
    """Count the columns of ``df`` per dtype.

    Returns a dict mapping the dtype name (as a string, e.g. ``"float64"``)
    to the number of columns that have that dtype.
    """
    dtype_names = df.dtypes.astype(str)
    per_dtype = dtype_names.value_counts()
    return per_dtype.to_dict()

# Per-table overview: shape, number of columns per dtype and the total
# number of missing cells.
summ = {
    table_name: {
        "shape": frame.shape,
        "dtype_counts": dtype_counts(frame),
        "missing_total": int(frame.isna().sum().sum()),
    }
    for table_name, frame in (
        ("patient", patient),
        ("station", station),
        ("observation", observation),
    )
}
summ


Anal√Ωza d√°t ‚Äì z√°znamy a atrib√∫ty
| S√∫bor               | Poƒçet z√°znamov (riadkov) | Poƒçet atrib√∫tov (stƒ∫pcov) | Typy d√°t                            | Poƒçet ch√Ωbaj√∫cich hodn√¥t |
| :------------------ | :----------------------: | :-----------------------: | :---------------------------------- | :----------------------: |
| **patient.csv**     |           2 102          |             13            | object = 10, int64 = 2, float64 = 1 |           3 993          |
| **station.csv**     |            798           |             6             | object = 4, float64 = 2             |             0            |
| **observation.csv** |          12 081          |             23            | float64 = 23                        |             0            |


#### Anal√Ωza ch√Ωbaj√∫cich hodn√¥t (EDA)
Na z√°klade v≈°etk√Ωch vynechan√Ωch hodn√¥t, ktor√© som vy≈°≈°ie uviedol, skontrolujeme, ƒço presne bolo vynechan√©.

In [None]:
# Share of missing values per patient attribute, in percent, largest first.
missing_counts = patient.isna().sum()
missing_pct = (
    missing_counts
    .div(len(patient))
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
)
missing_pct.head(10)

#### Z v√Ωpoƒçtu percenta ch√Ωbaj√∫cich hodn√¥t vid√≠me, ≈æe niektor√© atrib√∫ty obsahuj√∫ v√Ωrazn√Ω poƒçet pr√°zdnych z√°znamov:
residence	100 %
job	‚âà 70 %
address	‚âà 15 %
current_location	‚âà 5 %

## (B-1b)

In [None]:
# Pull the ten analysed columns out of `observation` as standalone Series;
# the plotting cells below refer to them by these short names.
(skin_temp, spo, hr, pi, rr, prv, bp, pvi, sv, co) = (
    observation[column]
    for column in (
        "Skin Temperature", "SpO‚ÇÇ", "HR", "PI", "RR",
        "PRV", "BP", "PVI", "SV", "CO",
    )
)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the analysed columns.
cols = [
    "Skin Temperature", "SpO‚ÇÇ", "HR", "PI", "RR",
    "PRV", "BP", "PVI", "SV", "CO",
]
observation.loc[:, cols].describe()

In [None]:
# Skin temperature: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(skin_temp, bins=60, edgecolor="black")
ax_hist.set_xlabel("Skin Temperature ¬∞C")

ax_box.boxplot(skin_temp)
ax_box.set_xlabel("Skin Temperature ¬∞C")

plt.show()


Skin temperature values lie in the range 33–38 °C (mean 35.9 °C, standard deviation 0.84 °C).
This indicates that the measurements are within the normal range and do not vary significantly.

In [None]:
# SpO2: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(spo, bins=60, edgecolor="black")
ax_hist.set_xlabel("SpO %")

ax_box.boxplot(spo)
ax_box.set_xlabel("SpO %")

plt.show()

In [None]:
# Heart rate: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(hr, bins=60, edgecolor="black")
ax_hist.set_xlabel("HR bpm")

ax_box.boxplot(hr)
ax_box.set_xlabel("HR bpm")

plt.show()

In [None]:
# PI: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(pi, bins=60, edgecolor="black")
ax_hist.set_xlabel("PI %")

ax_box.boxplot(pi)
ax_box.set_xlabel("PI %")

plt.show()

In [None]:
# RR: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(rr, bins=60, edgecolor="black")
ax_hist.set_xlabel("RR")

ax_box.boxplot(rr)
ax_box.set_xlabel("RR")

plt.show()

In [None]:
# PRV: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(prv, bins=60, edgecolor="black")
# The measured quantity runs along the x axis of a histogram (the y axis is
# the bin count), so the variable label belongs on x, as in the sibling cells.
ax_hist.set_xlabel("PRV ms")

ax_box.boxplot(prv)
ax_box.set_xlabel("PRV ms")

plt.show()

Pulse rate variability (PRV) values lie in the range 20–200 ms (mean 117.62 ms, standard deviation 21.83 ms).
The mean is within the normal range, but individual values are widely scattered.

In [None]:
# BP: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(bp, bins=60, edgecolor="black")
ax_hist.set_xlabel("BP")

ax_box.boxplot(bp)
ax_box.set_xlabel("BP")

plt.show()

In [None]:
# PVI: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(pvi, bins=60, edgecolor="black")
ax_hist.set_xlabel("pvi")

ax_box.boxplot(pvi)
ax_box.set_xlabel("pvi")

plt.show()

In [None]:
# SV: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(sv, bins=60, edgecolor="black")
ax_hist.set_xlabel("sv")

ax_box.boxplot(sv)
ax_box.set_xlabel("sv")

plt.show()

In [None]:
# CO: histogram on the left, box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2)

ax_hist.hist(co, bins=60, edgecolor="black")
ax_hist.set_xlabel("co")

ax_box.boxplot(co)
ax_box.set_xlabel("co")

plt.show()

## (C-1)

In [None]:
# Print the column index of the patient table.
print(patient.columns)

In [None]:
# Pairwise correlations between all numeric columns of `observation`,
# drawn as an annotated heat map.
numeric = observation.select_dtypes(include=['number'])
corr = numeric.corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Korelaƒçn√° matica fyziologick√Ωch atrib√∫tov (observation.csv)")
plt.show()

### Interpret√°cia v√Ωsledkov p√°rovej anal√Ωzy d√°t

Z korelaƒçnej matice (obr√°zok vy≈°≈°ie) je mo≈æn√© identifikova≈• viacero v√Ωznamn√Ωch vz≈•ahov medzi fyziologick√Ωmi atrib√∫tmi:

- **CO a HR (r = 0.76)** ‚Äì veƒæmi siln√° pozit√≠vna korel√°cia, ktor√° zodpoved√° oƒçak√°vanej z√°vislosti medzi srdcovou frekvenciou a srdcov√Ωm v√Ωdajom.  
- **PVI a Blood Flow Index (r = 0.67)** ‚Äì siln√Ω vz≈•ah medzi variabilitou perf√∫zie a prietokom krvi.  
- **Oximetry a PVI (r = 0.67)** ‚Äì satur√°cia kysl√≠kom √∫zko s√∫vis√≠ s variabilitou perf√∫zie.  
- **Skin Temperature a PI (r = ‚àí0.49)** ‚Äì negat√≠vna korel√°cia; s rast√∫cou teplotou ko≈æe kles√° perf√∫zny index.  
- **Skin Temperature a Oximetry (r = 0.37)** ‚Äì mierna pozit√≠vna z√°vislos≈•, naznaƒçuje mo≈æn√© prepojenie medzi perif√©rnou teplotou a satur√°ciou.  
- V√§ƒç≈°ina ostatn√Ωch atrib√∫tov (napr. `latitude`, `longitude`, `SNR`, `PRV`) nevykazuje ≈°tatisticky v√Ωznamn√© line√°rne vz≈•ahy.

Tieto zistenia poukazuj√∫ na fyziologick√© s√∫vislosti medzi vybran√Ωmi premenn√Ωmi
a pom√°haj√∫ urƒçi≈•, ktor√© atrib√∫ty m√¥≈æu by≈• relevantn√© pri bud√∫com modelovan√≠
a predikcii cieƒæovej premennej `oximetry`.


In [None]:
# Rank attribute pairs by correlation, listing every unordered pair exactly once.
# `corr` is symmetric, so only its strict upper triangle is kept: this removes the
# self-correlations on the diagonal and the mirrored (B, A) copy of each (A, B)
# pair, which would otherwise fill the "top 10" with only five distinct pairs.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = (
    corr.where(upper_triangle)
    .stack()
    .dropna()  # explicit: newer pandas `stack` no longer drops the masked cells
    .sort_values(ascending=False)
)
print("üîù Top 10 korel√°ci√≠ medzi atrib√∫tmi:\n")
print(corr_pairs.head(10))

In [None]:
# One scatter plot per selected attribute pair (x attribute, y attribute).
pairs_to_plot = [("CO", "HR"), ("oximetry", "PVI"), ("Skin Temperature", "oximetry")]

for x_col, y_col in pairs_to_plot:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=observation, x=x_col, y=y_col, alpha=0.6, ax=ax)
    ax.set_title(f"Vz≈•ah medzi {x_col} a {y_col}")
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    plt.show()

## (D-1b)

In [None]:
# Correlation of every attribute with the target `oximetry`, in ascending order.
corr = observation.corr(numeric_only=False)
oximetry_corr = corr['oximetry']
oximetry_corr.sort_values()

In [None]:
# Target vs. PVI.
ax = sns.scatterplot(data=observation, x='PVI', y='oximetry')
ax.set_xlabel("PVI %")
ax.set_ylabel("oximetry")

In [None]:
# Target vs. skin temperature.
ax = sns.scatterplot(data=observation, x='Skin Temperature', y='oximetry')
ax.set_xlabel("Skin Temperature ¬∞C")
ax.set_ylabel("oximetry")

In [None]:
# Target vs. EtCO2.
ax = sns.scatterplot(data=observation, x='EtCO‚ÇÇ', y='oximetry')
ax.set_xlabel("EtCO‚ÇÇ mmHg")
ax.set_ylabel("oximetry")

## (E-1b)

После предварительного анализа видно, что CO и HR очень сильно коррелируют между собой (r = 0.76). Наша целевая переменная **oximetry** положительно коррелирует с PVI (r = 0.66), Skin Temperature (r = 0.368) и **$EtCO_2$** (r = 0.281), а также имеет слабую отрицательную корреляцию с **$SpO_2$** (r = -0.121). Связывать датасеты между собой не нужно, так как вся необходимая нам информация уже находится в датасете **observation**.

## (A-2b)

Check data types

In [None]:
# Column dtypes of the observation table.
observation.dtypes

In [None]:
# Column dtypes of the station table.
station.dtypes

In [None]:
# Column dtypes of the patient table.
patient.dtypes

Format data types

In [None]:
# `revision` mixes several date formats, so each value is parsed individually.
station['revision'] = pd.to_datetime(station['revision'], format="mixed")

# Remaining non-numeric columns: text -> 'string', QoS -> 'category'.
for column, target_dtype in (
    ('station', 'string'),
    ('QoS', 'category'),
    ('location', 'string'),
):
    station[column] = station[column].astype(target_dtype)

In [None]:
# Text attributes use the nullable 'string' dtype; blood group is stored as a categorical.
for text_column in ('job', 'ssn', 'company', 'name', 'username',
                    'residence', 'registration', 'address', 'mail'):
    patient[text_column] = patient[text_column].astype('string')
patient['blood_group'] = patient['blood_group'].astype('category')

# Split `current_location` into two float columns and drop the raw text column.
# The guard keeps the cell re-runnable: once the column has been dropped, a
# second execution would otherwise fail with a KeyError.
if 'current_location' in patient.columns:
    # The regex expects text of the form "Decimal('a'), Decimal('b')"; rows that
    # do not match (including missing values, which become the text "nan")
    # yield NaN in both columns.
    # NOTE(review): the pair is assumed to be (latitude, longitude), which is the
    # order Faker's `profile()` uses for `current_location` -- confirm against the
    # raw data (latitude values must lie within [-90, 90]).
    patient[['latitude', 'longitude']] = (
        patient['current_location']
        .astype(str)
        .str.extract(r"Decimal\('([\d\.\-]+)'\), Decimal\('([\d\.\-]+)'\)")
        .astype(float)
    )
    patient = patient.drop(columns=['current_location'])

Check nulls

In [None]:
# Number of missing values per column of observation.
observation.isnull().sum()

In [None]:
# Number of missing values per column of station.
station.isnull().sum()

In [None]:
# Number of missing values per column of patient.
patient.isnull().sum()

In [None]:
# Missing text attributes get the explicit placeholder 'Unknown'.
for text_column in ('residence', 'job', 'address'):
    patient[text_column] = patient[text_column].fillna('Unknown')

# Missing coordinates are replaced by 0.
# NOTE(review): (0, 0) is itself a valid coordinate, so filled rows cannot be
# told apart from real measurements afterwards -- confirm this is intended.
for coordinate in ('longitude', 'latitude'):
    patient[coordinate] = patient[coordinate].fillna(0)

Check duplicates

In [None]:
# Number of rows in observation that exactly repeat an earlier row.
observation.duplicated().sum()

In [None]:
# Remove repeated rows. Index labels are not renumbered, so the index may now
# have gaps. Series taken from `observation` in earlier cells are not affected
# by this reassignment.
observation = observation.drop_duplicates()

In [None]:
# Number of rows in station that exactly repeat an earlier row.
station.duplicated().sum()

In [None]:
# Number of rows in patient that exactly repeat an earlier row.
patient.duplicated().sum()

## (B-2b)

In [None]:
import re

# Reference table with the allowed value range of each sensor variable (tab-separated).
ranges = pd.read_csv("./094/sensor_variable_range.csv", sep="\t")
print(ranges.head())

# One number: optional sign, digits, optional '.' or ',' decimal part.
# The (?<!\d) look-behind keeps a '-' that directly follows a digit from being
# read as a minus sign, so the separator in "95-100" no longer turns the upper
# bound into -100.
num_pat = re.compile(r"(?<!\d)[-+]?\d+(?:[.,]\d+)?")
def parse_range(s):
    """Extract a numeric ``(low, high)`` interval from a free-text range.

    Parameters
    ----------
    s : object
        Range description such as ``"95-100"``, ``"33,5 - 38"`` or
        ``"-90 to 90"``; it is converted with ``str()`` first.

    Returns
    -------
    tuple of float
        The first two numbers found in ``s``, ordered so that
        ``low <= high``. ``(nan, nan)`` when fewer than two numbers are found.
    """
    nums = num_pat.findall(str(s))
    if len(nums) < 2:
        return np.nan, np.nan
    # A decimal comma is accepted and normalised to a decimal point.
    a, b = (float(token.replace(",", ".")) for token in nums[:2])
    return (a, b) if a <= b else (b, a)

# Turn the textual "Value Range" into numeric Min/Max columns.
parsed_bounds = ranges["Value Range"].map(parse_range)
ranges[["Min", "Max"]] = pd.DataFrame(
    parsed_bounds.tolist(), index=ranges.index, columns=["Min", "Max"]
)

# Rows whose range could not be parsed are discarded.
ranges = ranges.dropna(subset=["Min", "Max"])

# The BP bounds are set by hand instead of being taken from the parsed text.
is_bp = ranges["Variable"] == "BP"
ranges.loc[is_bp, ["Min", "Max"]] = [90.0, 120.0]
print(ranges[["Variable", "Min", "Max"]])

In [None]:
# For every variable that has a reference range and exists in `observation`,
# count the values outside [Min, Max] and keep up to five example index labels.
anomalies = []
for var, low, high in zip(ranges["Variable"], ranges["Min"], ranges["Max"]):
    if var not in observation.columns:
        continue
    vals = pd.to_numeric(observation[var], errors="coerce")
    invalid_mask = (vals < low) | (vals > high)
    flagged_idx = vals.index[invalid_mask]
    anomalies.append({
        "Atrib√∫t": var,
        "Poƒçet abnorm√°lnych hodn√¥t": int(invalid_mask.sum()),
        "Min povolen√©": low,
        "Max povolen√©": high,
        "Pr√≠klady idx": list(flagged_idx[:5]),
    })

anomalies_df = pd.DataFrame(anomalies).sort_values("Poƒçet abnorm√°lnych hodn√¥t", ascending=False)
anomalies_df

### Kontrola spr√°vnosti v d√°tach

D√°ta z *observation.csv* boli porovnan√© s referenƒçn√Ωmi rozsahmi fyziologick√Ωch parametrov zo *sensor_variable_range.csv*.  
V ≈æiadnom z atrib√∫tov neboli zisten√© abnorm√°lne hodnoty mimo definovan√Ωch intervalov,  
ƒço naznaƒçuje, ≈æe dataset neobsahuje chybn√© alebo extr√©mne merania.  

Pre istotu bola ƒèalej vykonan√° kontrola nelogick√Ωch kombin√°ci√≠ hodn√¥t
(vz≈•ahov medzi atrib√∫tmi), ktor√© by mohli naznaƒçova≈• chyby senzora alebo anot√°cie.

In [None]:
# Cross-attribute plausibility checks. Each entry of `logic_errors` is a pair
# (description of the condition, number of rows that satisfy it); a check is
# skipped when one of its columns is absent.
logic_errors = []

# Relative tolerance of the cardiac-output check. The reported label is built
# from this constant so the text and the computation cannot drift apart again
# (the code previously used 0.5 while the label said 30%).
CO_REL_TOL = 0.30

# Blood pressure of 0 while a pulse is present -- sensor error.
if "BP" in observation.columns and "HR" in observation.columns:
    mask = (observation["BP"] == 0) & (observation["HR"] > 0)
    logic_errors.append(("BP = 0 a HR > 0", mask.sum()))

# Cardiac-output consistency: CO should be close to HR * SV / 1000.
if all(col in observation.columns for col in ["CO", "HR", "SV"]):
    co_est = observation["HR"] * observation["SV"] / 1000.0
    # A missing estimate gives a threshold of 0, but the comparison itself is
    # then NaN > 0, i.e. False, so such rows are not counted.
    mask = (observation["CO"] - co_est).abs() > CO_REL_TOL * co_est.fillna(0).abs()
    logic_errors.append((f"|CO - HR*SV/1000| > {CO_REL_TOL:.0%}", mask.sum()))

# High signal quality together with a low SNR -- illogical.
if "Signal Quality Index" in observation.columns and "SNR" in observation.columns:
    mask = (observation["Signal Quality Index"] >= 80) & (observation["SNR"] < 20)
    logic_errors.append(("Signal Quality Index ‚â• 80 a SNR < 20", mask.sum()))

# Low signal quality together with a near-perfect SpO2 -- suspicious.
if "Signal Quality Index" in observation.columns and "SpO‚ÇÇ" in observation.columns:
    mask = (observation["Signal Quality Index"] <= 10) & (observation["SpO‚ÇÇ"] >= 99)
    logic_errors.append(("Signal Quality Index ‚â§ 10 a SpO‚ÇÇ ‚â• 99", mask.sum()))

# Room-air oxygen (FiO2 around 21%) together with a low SpO2 -- possible error.
if "FiO‚ÇÇ" in observation.columns and "SpO‚ÇÇ" in observation.columns:
    mask = (observation["FiO‚ÇÇ"] <= 22) & (observation["SpO‚ÇÇ"] < 85)
    logic_errors.append(("FiO‚ÇÇ ‚âà 21% a SpO‚ÇÇ < 85%", mask.sum()))

# Geographic coordinates outside the valid range.
if "latitude" in observation.columns and "longitude" in observation.columns:
    mask = (
        observation["latitude"].lt(-90)
        | observation["latitude"].gt(90)
        | observation["longitude"].lt(-180)
        | observation["longitude"].gt(180)
    )
    logic_errors.append(("Latitude/Longitude mimo rozsah", mask.sum()))

# Extra check: a very high RR combined with a very low EtCO2.
if "RR" in observation.columns and "EtCO‚ÇÇ" in observation.columns:
    mask = (observation["RR"] > 40) & (observation["EtCO‚ÇÇ"] < 20)
    logic_errors.append(("RR > 40 a EtCO‚ÇÇ < 20", mask.sum()))

logic_df = pd.DataFrame(logic_errors, columns=["Podmienka", "Poƒçet poru≈°en√≠"])
logic_df

### Kontrola spr√°vnosti v d√°tach

Na z√°klade referenƒçn√Ωch rozsahov zo s√∫boru *sensor_variable_range.csv* bola vykonan√° kontrola spr√°vnosti hodn√¥t v datasete *observation.csv*.

- Neboli zisten√© ≈æiadne **abnorm√°lne hodnoty** mimo povolen√Ωch fyziologick√Ωch intervalov.
- N√°sledne bola vykonan√° aj **kontrola logick√Ωch vz≈•ahov** medzi atrib√∫tmi:
  - `BP = 0 a HR > 0`
  - `|CO ‚Äì HR√óSV/1000| > 30 %`
  - `Signal Quality Index ‚â• 80 a SNR < 20`
  - `Signal Quality Index ‚â§ 10 a SpO‚ÇÇ ‚â• 99`
  - `FiO‚ÇÇ ‚âà 21 % a SpO‚ÇÇ < 85 %`
  - `Latitude/Longitude mimo rozsah`
  - `RR > 40 a EtCO‚ÇÇ < 20`

V≈°etky podmienky mali **0 poru≈°en√≠**, ƒço znamen√°, ≈æe dataset neobsahuje nelogick√© alebo chybn√© kombin√°cie √∫dajov.  
D√°ta s√∫ teda **konzistentn√©, bez anom√°li√≠** a vhodn√© na ƒèal≈°iu f√°zu projektu ‚Äì anal√Ωzu vz≈•ahov (C-1) a modelovanie cieƒæovej premennej `oximetry`.

## (C-1b) 

In [None]:
def identify_outliers(a):
    """Return the elements of ``a`` that lie outside the 1.5 * IQR fences.

    Parameters
    ----------
    a : pandas.Series
        Numeric values to inspect.

    Returns
    -------
    pandas.Series
        The values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``,
        keeping their original index labels. Missing values are never flagged.
    """
    # Both quartiles come from Series.quantile, which skips missing values.
    # Deriving the IQR from them (instead of stats.iqr, which propagates NaN)
    # keeps the fences usable when `a` contains NaN.
    q1, q3 = a.quantile(0.25), a.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    lower, upper = q1 - whisker, q3 + whisker

    return a[(a > upper) | (a < lower)]

In [None]:
# Remove IQR outliers from CO, PVI, SV and BP.
# Every Series is re-read from `observation` first, so the cell gives the same
# result however often it is run (trimming an already trimmed Series would
# narrow the fences and discard further values) and it works on the current,
# de-duplicated table.
co = observation["CO"]
co_out = identify_outliers(co)
co = co.drop(co_out.index)

pvi = observation["PVI"]
pvi_out = identify_outliers(pvi)
pvi = pvi.drop(pvi_out.index)

sv = observation["SV"]
sv_out = identify_outliers(sv)
sv = sv.drop(sv_out.index)

bp = observation["BP"]
bp_out = identify_outliers(bp)
bp = bp.drop(bp_out.index)

In [None]:
# 2 x 4 grid: for each of pvi, co, sv and bp a histogram followed by a box plot.
fig, axes = plt.subplots(2, 4, figsize=(10, 8))

panels = ((pvi, "pvi"), (co, "co"), (sv, "sv"), (bp, "BP"))
for panel_no, (values, label) in enumerate(panels):
    # Row-major order: slots 2k and 2k+1 hold the k-th histogram / box plot pair.
    ax_hist = axes.flat[2 * panel_no]
    ax_box = axes.flat[2 * panel_no + 1]

    ax_hist.hist(values, bins=60, edgecolor="black")
    ax_hist.set_xlabel(label)

    ax_box.boxplot(values)
    ax_box.set_xlabel(label)

plt.show()

In [None]:
def replace_outliers(a, lower_q=0.05, upper_q=0.95):
    """Clip ``a`` to the interval between two of its own quantiles.

    Parameters
    ----------
    a : pandas.Series
        Numeric values to limit.
    lower_q, upper_q : float, optional
        Quantile levels in [0, 1] used as the lower and upper bound. The
        defaults (0.05 and 0.95) reproduce the original fixed behaviour.

    Returns
    -------
    pandas.Series
        A Series of the same length and index in which values below the
        ``lower_q`` quantile are replaced by that quantile and values above
        the ``upper_q`` quantile by that quantile.
    """
    lower = a.quantile(lower_q)
    upper = a.quantile(upper_q)

    return a.clip(lower, upper)

In [None]:
# Limit the extreme values of PRV, skin temperature, HR and SpO2.
prv, skin_temp, hr, spo = (
    replace_outliers(series) for series in (prv, skin_temp, hr, spo)
)

In [None]:
# 2 x 4 grid: for each of prv, skin_temp, hr and spo a histogram followed by a box plot.
fig, axes = plt.subplots(2, 4, figsize=(10, 8))

panels = (
    (prv, "PRV ms"),
    (skin_temp, "Skin Temperature ¬∞C"),
    (hr, "HR bpm"),
    (spo, "SpO %"),
)
for panel_no, (values, label) in enumerate(panels):
    # Row-major order: slots 2k and 2k+1 hold the k-th histogram / box plot pair.
    ax_hist = axes.flat[2 * panel_no]
    ax_box = axes.flat[2 * panel_no + 1]

    ax_hist.hist(values, bins=60, edgecolor="black")
    # The variable label goes on the x axis for every histogram; the PRV panel
    # previously labelled the y (count) axis instead.
    ax_hist.set_xlabel(label)

    ax_box.boxplot(values)
    ax_box.set_xlabel(label)

plt.show()

# 1.3

SpO‚ÇÇ m√° v priemere ni≈æ≈°iu hodnotu pri vy≈°≈°ej respiraƒçnej n√°mahe. –†—ñ–≤–µ–Ω—å –Ω–∞—Å–∏—á–µ–Ω–Ω—è –∫—Ä–æ–≤—ñ –∫–∏—Å–Ω–µ–º (SpO‚ÇÇ) —î –Ω–∏–∂—á–∏–º –ø—ñ–¥ —á–∞—Å –≤–∏—Å–æ–∫–æ–≥–æ –¥–∏—Ö–∞–ª—å–Ω–æ–≥–æ –Ω–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–Ω—è.

In [None]:
# Split SpO2 readings at the median respiratory effort:
# "low" = effort at or below the median, "high" = effort above it.
effort = observation["Respiratory effort"]
effort_median = effort.median()

low_effort = observation.loc[effort <= effort_median, "SpO‚ÇÇ"]
high_effort = observation.loc[effort > effort_median, "SpO‚ÇÇ"]

print("Priemer SpO‚ÇÇ pri n√≠zkej n√°mahe:", low_effort.mean())
print("Priemer SpO‚ÇÇ pri vysokej n√°mahe:", high_effort.mean())

In [None]:
# Shapiro-Wilk normality test on a reproducible random subsample of each group.
# The subsample size is capped at the number of available values, so a group
# with fewer than SHAPIRO_MAX_N observations no longer makes `sample` raise;
# missing values are removed before sampling.
SHAPIRO_MAX_N = 5000

low_valid = low_effort.dropna()
high_valid = high_effort.dropna()
sh_low = stats.shapiro(low_valid.sample(min(SHAPIRO_MAX_N, len(low_valid)), random_state=0))
sh_high = stats.shapiro(high_valid.sample(min(SHAPIRO_MAX_N, len(high_valid)), random_state=0))
print("Shapiro p (low):", sh_low.pvalue)
print("Shapiro p (high):", sh_high.pvalue)

Shapiro-Wilk test uk√°zal, ≈æe d√°ta pri n√≠zkej n√°mahe s√∫ norm√°lne rozdelen√© (p = 0.065 > 0.05),
zatiaƒæ ƒço pri vysokej n√°mahe sa rozdelenie odli≈°uje od norm√°lneho (p = 0.000008 < 0.05).
Preto bol okrem t-testu pou≈æit√Ω aj neparametrick√Ω Mann‚ÄìWhitney test.

In [None]:
# Levene test for equal variances of the two effort groups.
lev = stats.levene(low_effort, high_effort)
print(f"Levene p: {lev.pvalue}")

Levenov test (p = 0.537 > 0.05) nezamietol hypotézu o rovnosti variancií medzi skupinami.
Napriek tomu používame Welchov t-test, ktorý rovnosť variancií nepredpokladá, a preto je robustný aj voči prípadným menším rozdielom vo varianciách.

In [None]:
# One-sided Welch t-test of H1: mean SpO2 is lower under high respiratory
# effort, i.e. mean(low_effort) > mean(high_effort).
# The direction is given to SciPy through `alternative`, so `p` already is the
# one-sided p-value. Halving a two-sided p-value is only valid when t has the
# hypothesised sign; for the opposite sign it understates p.
t, p = stats.ttest_ind(
    low_effort,
    high_effort,
    equal_var=False,
    nan_policy="omit",
    alternative="greater",
)
print(f"t = {t:.3f}, p = {p:.4f} (jednostrann√Ω test, oƒçak√°vame ni≈æ≈°iu SpO‚ÇÇ pri vysokej n√°mahe)")

Welchov t-test (t = ‚Äì0.711, p = 0.239 > 0.05) nepreuk√°zal ≈°tatisticky v√Ωznamn√Ω rozdiel medzi priemermi SpO‚ÇÇ pri n√≠zkej a vysokej n√°mahe.


In [None]:
# One-sided Mann-Whitney U test of H1: SpO2 is lower under high respiratory
# effort. With low_effort as the first sample, the matching alternative is
# "greater" (first sample stochastically greater than the second); "less"
# tested the opposite direction.
u, p_mwu = stats.mannwhitneyu(low_effort, high_effort, alternative="greater")
print(f"Mann-Whitney p(one-sided) = {p_mwu:.4f}")

Mann‚ÄìWhitney U test (p = 0.147 > 0.05) taktie≈æ nepreuk√°zal ≈°tatisticky v√Ωznamn√Ω rozdiel medzi skupinami. Hypot√©za, ≈æe SpO‚ÇÇ je ni≈æ≈°ia pri vysokej n√°mahe, sa nepotvrdila

Hypot√©za H1 (SpO‚ÇÇ m√° v priemere ni≈æ≈°iu hodnotu pri vy≈°≈°ej respiraƒçnej n√°mahe) nebola potvrden√°.
Test normality uk√°zal, ≈æe √∫daje pri vysokej n√°mahe sa odchyƒæuj√∫ od norm√°lneho rozdelenia (Shapiro p < 0.05),
preto bol okrem t-testu pou≈æit√Ω aj neparametrick√Ω Mann-Whitney test.
Ani jeden test nepreuk√°zal ≈°tatisticky v√Ωznamn√Ω rozdiel medzi skupinami (t = ‚Äì0.71, p = 0.24; U-test p = 0.15).
Hodnoty SpO‚ÇÇ ostali stabiln√© bez ohƒæadu na √∫rove≈à respiraƒçnej n√°mahy.

RR má v priemere nižšiu hodnotu pri vyššej FiO₂
(тобто очікуємо, що частота дихання буде меншою при вищій концентрації кисню)

In [None]:
# Split RR readings at the median FiO2:
# "high" = FiO2 above the median, "low" = FiO2 at or below it.
fio2 = observation["FiO‚ÇÇ"]
fio2_median = fio2.median()

high_fio2 = observation.loc[fio2 > fio2_median, "RR"]
low_fio2 = observation.loc[fio2 <= fio2_median, "RR"]

print("Priemer RR pri ni≈æ≈°ej FiO‚ÇÇ:", low_fio2.mean())
print("Priemer RR pri vy≈°≈°ej FiO‚ÇÇ:", high_fio2.mean())

In [None]:
# Shapiro-Wilk normality test on a reproducible random subsample of each group.
# The subsample size is capped at the number of available values, so a group
# with fewer than SHAPIRO_MAX_N observations no longer makes `sample` raise;
# missing values are removed before sampling.
SHAPIRO_MAX_N = 5000

low_valid = low_fio2.dropna()
high_valid = high_fio2.dropna()
sh_low = stats.shapiro(low_valid.sample(min(SHAPIRO_MAX_N, len(low_valid)), random_state=0))
sh_high = stats.shapiro(high_valid.sample(min(SHAPIRO_MAX_N, len(high_valid)), random_state=0))
print("Shapiro p (low):", sh_low.pvalue)
print("Shapiro p (high):", sh_high.pvalue)

–û–±–∏–¥–≤–∞ p > 0.05, —Ç–æ–±—Ç–æ —Ä–æ–∑–ø–æ–¥—ñ–ª –Ω–æ—Ä–º–∞–ª—å–Ω–∏–π.
‚Üí –º–æ–∂–Ω–∞ —Å–ø–æ–∫—ñ–π–Ω–æ –≤–∏–∫–æ—Ä–∏—Å—Ç–æ–≤—É–≤–∞—Ç–∏ parametrick√Ω t-test.

In [None]:
# Levene test for equal variances of the two FiO2 groups.
lev = stats.levene(low_fio2, high_fio2)
print(f"Levene p: {lev.pvalue}")

p > 0.05, —Ç–æ–±—Ç–æ –¥–∏—Å–ø–µ—Ä—Å—ñ—ó –º—ñ–∂ –≥—Ä—É–ø–∞–º–∏ –Ω–µ —Ä—ñ–∑–Ω—è—Ç—å—Å—è —Å—É—Ç—Ç—î–≤–æ.

In [None]:
# One-sided Mann-Whitney U test of the hypothesis that RR is lower at higher
# FiO2. With high_fio2 as the first sample, the matching alternative is "less"
# (first sample stochastically smaller than the second); "greater" tested the
# opposite direction and therefore returned a p-value close to 1.
u, p_mwu = stats.mannwhitneyu(high_fio2, low_fio2, alternative="less")
print(f"Mann-Whitney p(one-sided)={p_mwu:.4f}")

–¶–µ –æ–∑–Ω–∞—á–∞—î, —â–æ –∑–∞ –Ω–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–∏—á–Ω–∏–º —Ç–µ—Å—Ç–æ–º —Ä—ñ–∑–Ω–∏—Ü—è –Ω–µ –ø—ñ–¥—Ç–≤–µ—Ä–¥–∏–ª–∞—Å—å (p > 0.05). –¢–æ–±—Ç–æ —Ä–æ–∑–ø–æ–¥—ñ–ª–∏ –º—ñ–∂ –≥—Ä—É–ø–∞–º–∏ –º–∞–π–∂–µ –æ–¥–Ω–∞–∫–æ–≤—ñ.

In [None]:
# One-sided Welch t-test of the hypothesis that mean RR is lower at higher
# FiO2, i.e. mean(high_fio2) < mean(low_fio2).
# The direction is given to SciPy through `alternative`, so `p` already is the
# one-sided p-value (halving a two-sided p-value ignores the sign of t).
# The printed note now states the tested direction (lower RR), which the old
# message had inverted.
t, p = stats.ttest_ind(
    high_fio2,
    low_fio2,
    equal_var=False,
    nan_policy="omit",
    alternative="less",
)
print(f"t = {t:.3f}, p = {p:.4f} (jednostrann√Ω test, oƒçak√°vame ni≈æ≈°iu RR pri vy≈°≈°ej FiO‚ÇÇ)")

Hypotéza H2 (RR má v priemere nižšiu hodnotu pri vyššej FiO₂) bola čiastočne potvrdená.
V√Ωsledky testov uk√°zali, ≈æe √∫daje maj√∫ norm√°lne rozdelenie (Shapiro p > 0.05) a rovnak√∫ varianciu (Levene p = 0.33).
Welchov t-test preuk√°zal ≈°tatisticky v√Ωznamn√Ω rozdiel medzi skupinami (t = ‚Äì1.70, p = 0.046).
Mann-Whitney test tento v√Ωsledok nepotvrdil (p = 0.96).

# (B - 1)


(RR vs FiO2)

In [None]:
from statsmodels.stats.power import TTestIndPower
# Group sizes, means and sample standard deviations (ddof=1) for RR by FiO2 group.
n1, n2 = len(high_fio2), len(low_fio2)
mean1, mean2 = high_fio2.mean(), low_fio2.mean()
sd1, sd2 = high_fio2.std(ddof=1), low_fio2.std(ddof=1)

In [None]:
# Cohen's d: difference of the group means divided by the pooled standard deviation.
pooled_var = ((n1 - 1) * sd1**2 + (n2 - 1) * sd2**2) / (n1 + n2 - 2)
spooled = np.sqrt(pooled_var)
d = (mean1 - mean2) / spooled
print(f"Cohen‚Äôs d = {d:.4f}")

In [None]:
# Power of a two-sample t-test for the observed |d| at alpha = 0.05.
# NOTE(review): `power` is called without `alternative`, i.e. with the
# library default, while the hypothesis test itself is one-sided -- confirm
# which is intended.
analysis = TTestIndPower()
group_ratio = n2 / n1
power = analysis.power(effect_size=abs(d), nobs1=n1, ratio=group_ratio, alpha=0.05)
print(f"≈†tatistick√° sila testu (power) = {power:.3f}")

In [None]:
# Smallest effect size detectable with power 0.8 at alpha = 0.05 for these group sizes.
group_ratio = n2 / n1
mde = analysis.solve_power(
    effect_size=None, nobs1=n1, ratio=group_ratio, alpha=0.05, power=0.8
)
print(f"Minim√°lny detekovateƒæn√Ω efekt (MDE) pri sile 0.8 = {mde:.3f}")

Pre hypotézu H2, aj keď t-test preukázal štatisticky významný rozdiel medzi skupinami (p = 0.046),
vypoƒç√≠tan√° veƒækos≈• efektu Cohen‚Äôs d = 0.03 naznaƒçuje, ≈æe rozdiel je veƒæmi mal√Ω.
≈†tatistick√° sila testu (power = 0.398) bola ni≈æ≈°ia ako odpor√∫ƒçan√° hodnota 0.8,
ƒço znamen√°, ≈æe test m√° len obmedzen√∫ schopnos≈• spoƒæahlivo zachyti≈• tak√©to mal√© rozdiely.
V√Ωsledok je teda ≈°tatisticky v√Ωznamn√Ω, ale prakticky zanedbateƒæn√Ω ‚Äì
rozdiel medzi skupinami existuje, no jeho veƒækos≈• nem√° re√°lny fyziologick√Ω v√Ωznam.

In [None]:
# Group sizes, means and sample standard deviations (ddof=1) for SpO2 by effort group.
n1, n2 = len(high_effort), len(low_effort)
mean1, mean2 = high_effort.mean(), low_effort.mean()
sd1, sd2 = high_effort.std(ddof=1), low_effort.std(ddof=1)


In [None]:
# Cohen's d: difference of the group means divided by the pooled standard deviation.
pooled_var = ((n1 - 1) * sd1**2 + (n2 - 1) * sd2**2) / (n1 + n2 - 2)
spooled = np.sqrt(pooled_var)
d = (mean1 - mean2) / spooled
print(f"Cohen's d = {d:.4f}")

In [None]:
# Power of a two-sample t-test for the observed |d| at alpha = 0.05.
# NOTE(review): `power` is called without `alternative`, i.e. with the
# library default, while the hypothesis test itself is one-sided -- confirm
# which is intended.
analysis = TTestIndPower()
group_ratio = n2 / n1
power = analysis.power(effect_size=abs(d), nobs1=n1, ratio=group_ratio, alpha=0.05)
print(f"≈†tatistick√° sila testu (power) = {power:.3f}")

In [None]:
# Smallest effect size detectable with power 0.8 at alpha = 0.05 for these group sizes.
group_ratio = n2 / n1
mde = analysis.solve_power(
    effect_size=None, nobs1=n1, ratio=group_ratio, alpha=0.05, power=0.8
)
print(f"Minim√°lny detekovateƒæn√Ω efekt (MDE) pri sile 0.8 = {mde:.3f}")

Pre hypot√©zu H1 (SpO‚ÇÇ m√° v priemere ni≈æ≈°iu hodnotu pri vy≈°≈°ej respiraƒçnej n√°mahe) bola vypoƒç√≠tan√° veƒækos≈• efektu Cohen‚Äôs d = 0.013, ƒço predstavuje zanedbateƒæn√Ω rozdiel medzi skupinami.
≈†tatistick√° sila testu (power = 0.110) bola n√≠zka, ƒço znamen√°, ≈æe test nem√° dostatoƒçn√∫ citlivos≈• na zachytenie veƒæmi mal√Ωch rozdielov.
Minim√°lny detekovateƒæn√Ω efekt (MDE) bol 0.051, teda test by spoƒæahlivo zachytil len v√§ƒç≈°ie rozdiely, ne≈æ sa re√°lne v d√°tach vyskytli.
Na z√°klade toho mo≈æno kon≈°tatova≈•, ≈æe rozdiel medzi √∫rov≈àami respiraƒçnej n√°mahy nie je ≈°tatisticky v√Ωznamn√Ω a hodnoty SpO‚ÇÇ zost√°vaj√∫ prakticky rovnak√©.