In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the three tab-separated source tables (same paths and parser options for each).
station, patient, observation = (
    pd.read_csv(csv_path, sep='\t', engine="python")
    for csv_path in (
        "./094/station.csv",
        "./094/patient.csv",
        "./094/observation.csv",
    )
)

# Report (rows, columns) of every table.
for label, frame in (("patient", patient), ("station", station), ("observation", observation)):
    print(f"{label}.shape =", frame.shape)

patient.shape = (2102, 13)
station.shape = (798, 6)
observation.shape = (12081, 23)


In [None]:
# (A-1b)
# Show the column names of each loaded table, one line per table.
for label, frame in (("patient", patient), ("station", station), ("observation", observation)):
    print(f"{label} columns:", frame.columns.tolist())

#### Súbor patient.csv
- **Počet záznamov:** 2 102  
- **Počet atribútov:** 13  
- **Typy dát:** object = 10, int64 = 2, float64 = 1  
- **Chýbajúce hodnoty:** spolu ≈ 3 993  
  - Najviac chýba: `residence` (100 %), `job` (70 %), `address` (15 %), `current_location` (5 %).  
- **Charakteristika:** obsahuje demografické údaje pacientov a odkaz na stanicu (`station_ID`).  
  Tento odkaz sa **nedá priamo spárovať** s názvom v súbore `station.csv`.  

#### Súbor station.csv
- **Počet záznamov:** 798  
- **Počet atribútov:** 6  
- **Typy dát:** object = 4, float64 = 2  
- **Chýbajúce hodnoty:** 0  
- **Charakteristika:** obsahuje informácie o meracích staniciach – `station`, `latitude`, `longitude`, `QoS`, `revision`, `location`.  
- **Pozorovanie:** hodnoty `revision` majú rôzne formáty dátumov (a časť nevieme správne parsovať) → potrebná normalizácia na jednotný formát `datetime`.

#### Súbor observation.csv
- **Počet záznamov:** 12 081  
- **Počet atribútov:** 23  
- **Typy dát:** všetky `float64`  
- **Chýbajúce hodnoty:** 0  
- **Cieľová premenná:** `oximetry` (binárna 0/1).  
- **Dôležité atribúty:** `SpO₂`, `HR`, `Skin Temperature`, `BP`, `CO`, `FiO₂`, atď.  
- Hodnoty SpO₂ sú v rozsahu 95 – 100 a Skin Temperature v rozsahu 33 – 38 °C.  
- Tieto dáta majú vhodný formát pre ďalšie spracovanie v Python/pandas a na trénovanie modelov.


In [22]:
def dtype_counts(df):
    """Return a dict mapping each dtype name in ``df`` to the number of columns with that dtype."""
    dtype_names = df.dtypes.astype(str)
    tally = dtype_names.value_counts()
    return tally.to_dict()

# Per-table overview: shape, number of columns per dtype, and total count of missing cells.
# NOTE(review): the stored output of this cell lists converted dtypes (string / Int64 /
# category / datetime), which suggests it was last executed after a later type-conversion
# step — re-run on a fresh kernel to confirm the figures for the raw files.
summ = {
    table_name: {
        "shape": table.shape,
        "dtype_counts": dtype_counts(table),
        "missing_total": int(table.isna().sum().sum()),
    }
    for table_name, table in (
        ("patient", patient),
        ("station", station),
        ("observation", observation),
    )
}
summ


{'patient': {'shape': (2102, 13),
  'dtype_counts': {'string': 8,
   'Int64': 3,
   'category': 1,
   'datetime64[ns, UTC]': 1},
  'missing_total': 3993},
 'station': {'shape': (798, 6),
  'dtype_counts': {'Float64': 2,
   'string': 2,
   'datetime64[ns, UTC]': 1,
   'category': 1},
  'missing_total': 0},
 'observation': {'shape': (12081, 23),
  'dtype_counts': {'Float64': 22, 'Int64': 1},
  'missing_total': 0}}

#### Analýza chýbajúcich hodnôt (EDA)


In [None]:
# Share of missing values per patient column, in percent, largest first.
missing_counts = patient.isna().sum()
missing_pct = (
    (missing_counts / len(patient) * 100)
    .round(2)
    .sort_values(ascending=False)
)
missing_pct.head(10)

#### Z výpočtu percenta chýbajúcich hodnôt vidíme, že niektoré atribúty obsahujú výrazný počet prázdnych záznamov:

| Atribút | Podiel chýbajúcich hodnôt |
|:--|:--:|
| `residence` | 100 % |
| `job` | ≈ 70 % |
| `address` | ≈ 15 % |
| `current_location` | ≈ 5 % |

#### Vzťahy medzi súbormi
| Vzťah | Typ väzby | Popis |
|:--|:--:|:--|
| observation ↔ station | 1 : N | Každé meranie má priradenú stanicu (100 % zhoda cez `latitude`, `longitude`). |
| patient ↔ station | ? | Nepodarilo sa spárovať cez `current_location`, zhoda 0 %. |
| patient ↔ observation | – | Chýba priame prepojenie (pacient ID v meraniach neexistuje). |

In [None]:
# Coordinates are coerced to numbers and rounded to 4 decimals so that station and
# observation positions can be compared as exact (lat, lon) keys.
sta_lat = pd.to_numeric(station["latitude"], errors="coerce").round(4)
sta_lon = pd.to_numeric(station["longitude"], errors="coerce").round(4)
obs_lat = pd.to_numeric(observation["latitude"], errors="coerce").round(4)
obs_lon = pd.to_numeric(observation["longitude"], errors="coerce").round(4)

station_key = set(zip(sta_lat, sta_lon))
obs_key = list(zip(obs_lat, obs_lon))

# Only keys with both coordinates present take part in the comparison; the filter is
# evaluated once and reused for both the denominator and the match count.
obs_key_valid = [k for k in obs_key if not (pd.isna(k[0]) or pd.isna(k[1]))]
obs_valid = len(obs_key_valid)
obs_match = sum(k in station_key for k in obs_key_valid)

# Guard the ratio against an empty denominator, mirroring the patient -> station report.
print(f"observation ‚Üí station match by coords: {obs_match}/{obs_valid} ({(obs_match/obs_valid if obs_valid else 0):.3f})")

# patient ‚Üî station:
import re
def parse_current_location(s):
    """Extract two coordinates from a ``current_location`` text value.

    The pattern looks for ``Decimal('<number>')`` fragments, so the value is presumably
    the text form of a pair such as ``(Decimal('48.1486'), Decimal('17.1077'))`` —
    TODO confirm against patient.csv, including which of the two is latitude.

    Parameters
    ----------
    s : object
        Raw cell value; anything that is not a ``str`` (e.g. NaN) is treated as missing.

    Returns
    -------
    tuple of float
        The two numbers in order of appearance, or ``(nan, nan)`` when the value is not a
        string or does not contain exactly two ``Decimal('...')`` fragments.
    """
    if not isinstance(s, str):
        return np.nan, np.nan
    # Single backslashes inside the raw string: the previous pattern was double-escaped
    # (r"\\(" / r"\\d"), which demands literal backslash characters in the text and so
    # never matched. Match rates reported from earlier runs should be re-checked.
    nums = re.findall(r"Decimal\('([-+]?\d*\.?\d+)'\)", s)
    return (float(nums[0]), float(nums[1])) if len(nums) == 2 else (np.nan, np.nan)

# Parse every patient's location text, then split the pairs into rounded coordinate arrays.
latlon = patient["current_location"].apply(parse_current_location)
pat_lat = pd.to_numeric([pair[0] for pair in latlon], errors="coerce").round(4)
pat_lon = pd.to_numeric([pair[1] for pair in latlon], errors="coerce").round(4)

# Denominator: patients with a finite first coordinate.
pat_valid = np.isfinite(pat_lat).sum()

# Numerator: patients whose complete (lat, lon) key equals some station key.
pat_match = 0
for key in zip(pat_lat, pat_lon):
    if pd.isna(key[0]) or pd.isna(key[1]):
        continue
    pat_match += key in station_key

pat_ratio = pat_match / pat_valid if pat_valid else 0
print(f"patient ‚Üí station match by coords: {pat_match}/{pat_valid} ({pat_ratio:.3f})")


#### Vizualizácia (EDA)
Histogramy rozloženia hodnôt:
- **SpO₂** → zväčša v rozmedzí 97 – 99 %  
- **Skin Temperature** → rozloženie 33 – 38 °C s vrcholom okolo 36 °C  

Tieto vizualizácie potvrdzujú, že dáta majú správny a očakávaný fyziologický rozsah.


In [None]:
# Histograms of two observation columns, drawn in order:
# 1) SpO2
# 2) Skin Temperature
# Each entry is (column to plot, figure title, x-axis label).
for column, title, x_label in (
    ("SpO‚ÇÇ", "Rozlo≈æenie SpO‚ÇÇ", "SpO‚ÇÇ"),
    ("Skin Temperature", "Rozlo≈æenie Skin Temperature", "¬∞C"),
):
    plt.figure()
    observation[column].hist(bins=30, edgecolor="black")
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel("Frekvencia")
    plt.show()

### Zhrnutie zistení
- Dáta majú zrozumiteľnú štruktúru a sú v správnom formáte na ďalšie spracovanie.  
- Najviac problémov má súbor `patient.csv` → mnoho chýbajúcich hodnôt.  
- Súbor `station.csv` má nekonzistentné formáty v atribúte `revision`.  
- Súbor `observation.csv` je čistý a vhodný na modelovanie (strojové učenie).  
- Vzťah `observation` ↔ `station` funguje perfektne, ale prepojenie pacientov chýba.  


# (C-1)

In [None]:
# Keep only the numeric observation columns and compute their pairwise correlation matrix.
numeric = observation.select_dtypes(include="number")
corr = numeric.corr()

# Annotated heatmap of the full matrix; every cell shows the coefficient with two decimals.
plt.figure(figsize=(14, 10))
sns.heatmap(data=corr, cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Korelaƒçn√° matica fyziologick√Ωch atrib√∫tov (observation.csv)")
plt.show()

### Interpretácia výsledkov párovej analýzy dát

Z korelačnej matice (obrázok vyššie) je možné identifikovať viacero významných vzťahov medzi fyziologickými atribútmi:

- **CO a HR (r = 0.76)** – veľmi silná pozitívna korelácia, ktorá zodpovedá očakávanej závislosti medzi srdcovou frekvenciou a srdcovým výdajom.  
- **PVI a Blood Flow Index (r = 0.67)** – silný vzťah medzi variabilitou perfúzie a prietokom krvi.  
- **Oximetry a PVI (r = 0.67)** – saturácia kyslíkom úzko súvisí s variabilitou perfúzie.  
- **Skin Temperature a PI (r = −0.49)** – negatívna korelácia; s rastúcou teplotou kože klesá perfúzny index.  
- **Skin Temperature a Oximetry (r = 0.37)** – mierna pozitívna závislosť, naznačuje možné prepojenie medzi periférnou teplotou a saturáciou.  
- Väčšina ostatných atribútov (napr. `latitude`, `longitude`, `SNR`, `PRV`) nevykazuje výrazné lineárne vzťahy.

Tieto zistenia poukazujú na fyziologické súvislosti medzi vybranými premennými
a pomáhajú určiť, ktoré atribúty môžu byť relevantné pri budúcom modelovaní
a predikcii cieľovej premennej `oximetry`.


In [None]:
# Rank attribute pairs by correlation, listing every unordered pair exactly once.
# The correlation matrix is symmetric, so unstacking it whole yields each pair twice
# ((a, b) and (b, a)) and a "top 10" would then show only 5 distinct pairs.
# Masking everything except the strict upper triangle (k=1) removes those mirror
# entries and the diagonal self-correlations, without needing a +/-0.999 threshold
# that would also hide genuinely collinear attribute pairs.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = (
    corr.where(upper_triangle)   # cells outside the upper triangle become NaN
    .unstack()
    .dropna()                    # drops the masked cells (and undefined correlations)
    .sort_values(ascending=False)
)
print("üîù Top 10 korel√°ci√≠ medzi atrib√∫tmi:\n")
print(corr_pairs.head(10))

In [None]:
# Attribute pairs (x-axis column, y-axis column) to inspect visually.
pairs_to_plot = [("CO", "HR"), ("oximetry", "PVI"), ("Skin Temperature", "oximetry")]

# One scatter plot per pair, each in its own figure.
for x_col, y_col in pairs_to_plot:
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=x_col, y=y_col, data=observation, alpha=0.6)
    plt.title(f"Vz≈•ah medzi {x_col} a {y_col}")
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

### Interpretácia výsledkov

Top 10 korelácií ukazuje, že:

- **CO a HR (r = 0.76)** – silná lineárna závislosť, vyššia srdcová frekvencia znamená väčší srdcový výdaj.  
- **Oximetry a PVI (r = 0.67)** – saturácia kyslíkom úzko súvisí s variabilitou perfúzie.  
- **Skin Temperature a Oximetry (r = 0.37)** – mierna pozitívna korelácia, naznačuje vplyv periférnej teploty na saturáciu.  
- **EtCO₂ a PI (r = 0.31)** – slabší, ale viditeľný pozitívny vzťah.  

Tieto výsledky naznačujú, ktoré premenné môžu mať najväčší význam pri budúcej predikcii
cieľovej premennej `oximetry`.


# (b - 2)

In [15]:
import re

# Reference table of allowed value ranges per sensor variable (tab-separated file).
ranges = pd.read_csv("./094/sensor_variable_range.csv", delimiter="\t")
# Plain-text preview of the first rows.
print(ranges.head())

def parse_range(r):
    """Extract the first two unsigned numbers from a range description.

    Parameters
    ----------
    r : object
        Range text such as ``"0.2–20%"``; it is converted with ``str()`` first, so any
        value is accepted.

    Returns
    -------
    tuple
        ``(low, high)`` as floats, taken in order of appearance, or ``(None, None)``
        when fewer than two numbers are found.
    """
    # Each match must contain at least one digit. The previous character-class pattern
    # ([\d\.]+) also matched a bare "." (e.g. in "approx. 35-45"), which then made
    # float() raise ValueError. Signs are deliberately not matched, because "-" and
    # the en dash act as the separator between the two bounds.
    nums = re.findall(r"\d*\.?\d+", str(r))
    if len(nums) >= 2:
        return float(nums[0]), float(nums[1])
    return None, None
   
# Parse every "Value Range" text into two bounds; wrapping the tuple in pd.Series makes
# apply() return a two-column frame that is stored as the new "Min" / "Max" columns.
ranges[["Min", "Max"]] = ranges["Value Range"].apply(lambda r: pd.Series(parse_range(r)))
# Discard variables for which no pair of numbers could be parsed.
ranges = ranges.dropna(subset=["Min", "Max"])
# Manual override of the parsed BP bounds.
# NOTE(review): presumably the BP range text is not a plain "low–high" pair, so the first
# two parsed numbers are not the intended limits — confirm against sensor_variable_range.csv.
ranges.loc[ranges["Variable"] == "BP", ["Min", "Max"]] = [90.0, 120.0]
print(ranges[["Variable", "Min", "Max"]])

  Variable         Unit                     Description        Value Range
0     SpO‚ÇÇ            %    Peripheral oxygen saturation            95‚Äì100%
1       HR          bpm  Heart rate from pulse oximeter         60‚Äì100 bpm
2       PI            %                 Perfusion index            0.2‚Äì20%
3       RR  breaths/min                Respiratory rate  12‚Äì20 breaths/min
4    EtCO‚ÇÇ         mmHg        End-tidal carbon dioxide         35‚Äì45 mmHg
                Variable   Min    Max
0                   SpO‚ÇÇ  95.0  100.0
1                     HR  60.0  100.0
2                     PI   0.2   20.0
3                     RR  12.0   20.0
4                  EtCO‚ÇÇ  35.0   45.0
5                   FiO‚ÇÇ  21.0  100.0
6                    PRV  20.0  200.0
7                     BP  90.0  120.0
8       Skin Temperature  33.0   38.0
10                   PVI  10.0   20.0
11              Hb level  12.0   18.0
12                    SV  60.0  100.0
13                    CO   4.0    8.

In [16]:
# For every variable with a reference range, count observation values outside [Min, Max]
# and remember up to five example row labels. Variables that are not observation columns
# are skipped.
anomalies = []
for var, low, high in zip(ranges["Variable"], ranges["Min"], ranges["Max"]):
    if var not in observation.columns:
        continue
    vals = pd.to_numeric(observation[var], errors="coerce")
    invalid_mask = (vals < low) | (vals > high)
    flagged = invalid_mask[invalid_mask]
    anomalies.append({
        "Atrib√∫t": var,
        "Poƒçet abnorm√°lnych hodn√¥t": int(invalid_mask.sum()),
        "Min povolen√©": low,
        "Max povolen√©": high,
        "Pr√≠klady idx": list(flagged.index[:5]),
    })

# Most frequently violated attributes first.
anomalies_df = pd.DataFrame(anomalies).sort_values("Poƒçet abnorm√°lnych hodn√¥t", ascending=False)
anomalies_df


Unnamed: 0,Atrib√∫t,Poƒçet abnorm√°lnych hodn√¥t,Min povolen√©,Max povolen√©,Pr√≠klady idx
0,SpO‚ÇÇ,0,95.0,100.0,[]
1,HR,0,60.0,100.0,[]
2,PI,0,0.2,20.0,[]
3,RR,0,12.0,20.0,[]
4,EtCO‚ÇÇ,0,35.0,45.0,[]
5,FiO‚ÇÇ,0,21.0,100.0,[]
6,PRV,0,20.0,200.0,[]
7,BP,0,90.0,120.0,[]
8,Skin Temperature,0,33.0,38.0,[]
9,PVI,0,10.0,20.0,[]


### B-2b Kontrola správnosti v dátach

Dáta z *observation.csv* boli porovnané s referenčnými rozsahmi fyziologických parametrov zo *sensor_variable_range.csv*.  
V žiadnom z atribútov neboli zistené abnormálne hodnoty mimo definovaných intervalov,  
čo naznačuje, že dataset neobsahuje chybné alebo extrémne merania.  

Pre istotu bola ďalej vykonaná kontrola nelogických kombinácií hodnôt
(vzťahov medzi atribútmi), ktoré by mohli naznačovať chyby senzora alebo anotácie.

In [17]:
# Cross-attribute plausibility checks on the observation table. Each check appends a
# (label, number of violating rows) pair; a check is skipped when a column it needs
# is absent.
logic_errors = []

# Blood pressure of 0 while a pulse is present — sensor error.
if "BP" in observation.columns and "HR" in observation.columns:
    mask = (observation["BP"] == 0) & (observation["HR"] > 0)
    logic_errors.append(("BP = 0 a HR > 0", int(mask.sum())))

# Cardiac-output consistency: CO ≈ HR × SV / 1000.
if all(col in observation.columns for col in ["CO", "HR", "SV"]):
    co_est = observation["HR"] * observation["SV"] / 1000.0
    # Relative tolerance of 30 %, matching the label reported for this check
    # (the threshold was previously 0.5, i.e. 50 %, contradicting the label).
    mask = (observation["CO"] - co_est).abs() > 0.3 * co_est.fillna(0).abs()
    logic_errors.append(("|CO - HR*SV/1000| > 30%", int(mask.sum())))

# High signal quality but low SNR — illogical.
if "Signal Quality Index" in observation.columns and "SNR" in observation.columns:
    mask = (observation["Signal Quality Index"] >= 80) & (observation["SNR"] < 20)
    logic_errors.append(("Signal Quality Index ‚â• 80 a SNR < 20", int(mask.sum())))

# Low signal quality but a perfect SpO2 reading — suspicious.
if "Signal Quality Index" in observation.columns and "SpO‚ÇÇ" in observation.columns:
    mask = (observation["Signal Quality Index"] <= 10) & (observation["SpO‚ÇÇ"] >= 99)
    logic_errors.append(("Signal Quality Index ‚â§ 10 a SpO‚ÇÇ ‚â• 99", int(mask.sum())))

# Room-air oxygen (FiO2 ≈ 21 %) but low SpO2 — possible error.
if "FiO‚ÇÇ" in observation.columns and "SpO‚ÇÇ" in observation.columns:
    mask = (observation["FiO‚ÇÇ"] <= 22) & (observation["SpO‚ÇÇ"] < 85)
    logic_errors.append(("FiO‚ÇÇ ‚âà 21% a SpO‚ÇÇ < 85%", int(mask.sum())))

# Geographic coordinates outside the valid range.
if "latitude" in observation.columns and "longitude" in observation.columns:
    mask = (
        observation["latitude"].lt(-90)
        | observation["latitude"].gt(90)
        | observation["longitude"].lt(-180)
        | observation["longitude"].gt(180)
    )
    logic_errors.append(("Latitude/Longitude mimo rozsah", int(mask.sum())))

# Bonus: a very high respiratory rate combined with a very low EtCO2.
if "RR" in observation.columns and "EtCO‚ÇÇ" in observation.columns:
    mask = (observation["RR"] > 40) & (observation["EtCO‚ÇÇ"] < 20)
    logic_errors.append(("RR > 40 a EtCO‚ÇÇ < 20", int(mask.sum())))

logic_df = pd.DataFrame(logic_errors, columns=["Podmienka", "Poƒçet poru≈°en√≠"])
logic_df

Unnamed: 0,Podmienka,Poƒçet poru≈°en√≠
0,BP = 0 a HR > 0,0
1,|CO - HR*SV/1000| > 30%,0
2,Signal Quality Index ‚â• 80 a SNR < 20,0
3,Signal Quality Index ‚â§ 10 a SpO‚ÇÇ ‚â• 99,0
4,FiO‚ÇÇ ‚âà 21% a SpO‚ÇÇ < 85%,0
5,Latitude/Longitude mimo rozsah,0
6,RR > 40 a EtCO‚ÇÇ < 20,0


### B-2b Kontrola správnosti v dátach

Na základe referenčných rozsahov zo súboru *sensor_variable_range.csv* bola vykonaná kontrola správnosti hodnôt v datasete *observation.csv*.

- Neboli zistené žiadne **abnormálne hodnoty** mimo povolených fyziologických intervalov.
- Následne bola vykonaná aj **kontrola logických vzťahov** medzi atribútmi:
  - `BP = 0 a HR > 0`
  - `|CO – HR×SV/1000| > 30 %`
  - `Signal Quality Index ≥ 80 a SNR < 20`
  - `Signal Quality Index ≤ 10 a SpO₂ ≥ 99`
  - `FiO₂ ≈ 21 % a SpO₂ < 85 %`
  - `Latitude/Longitude mimo rozsah`
  - `RR > 40 a EtCO₂ < 20`

Všetky podmienky mali **0 porušení**, čo znamená, že dataset neobsahuje nelogické alebo chybné kombinácie údajov.  
Dáta sú teda **konzistentné, bez anomálií** a vhodné na ďalšiu fázu projektu – analýzu vzťahov (C-1) a modelovanie cieľovej premennej `oximetry`.


## (4b)