In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def _read_tsv(path):
    """Read a tab-separated file into a DataFrame using pandas' python parser engine."""
    return pd.read_csv(path, sep='\t', engine="python")


# Load the three source tables used throughout the notebook.
station = _read_tsv("./094/station.csv")
patient = _read_tsv("./094/patient.csv")
observation = _read_tsv("./094/observation.csv")

# Report (rows, columns) for each table as a quick sanity check.
for label, frame in (
    ("patient.shape =", patient),
    ("station.shape =", station),
    ("observation.shape =", observation),
):
    print(label, frame.shape)

### 1.1 Základný opis dát spolu s ich charakteristikami

### (A-1b)

In [None]:
# (A-1b)
# List the column names of every table, one line per table.
for caption, frame in (
    ("patient columns:", patient),
    ("station columns:", station),
    ("observation columns:", observation),
):
    print(caption, frame.columns.tolist())

#### 🩺 Súbor patient.csv
- **Počet záznamov:** 2 102  
- **Počet atribútov:** 13  
- **Typy dát:** object = 10, int64 = 2, float64 = 1  
- **Chýbajúce hodnoty:** spolu ≈ 3 993  
  - Najviac chýba: `residence` (100 %), `job` (70 %), `address` (15 %), `current_location` (5 %).  
- **Charakteristika:** obsahuje demografické údaje pacientov a odkaz na stanicu (`station_ID`).  
  Tento odkaz sa **nedá priamo spárovať** s názvom v súbore `station.csv`.  

#### ⚙️ Súbor station.csv
- **Počet záznamov:** 798  
- **Počet atribútov:** 6  
- **Typy dát:** object = 4, float64 = 2  
- **Chýbajúce hodnoty:** 0  
- **Charakteristika:** obsahuje informácie o meracích staniciach – `station`, `latitude`, `longitude`, `QoS`, `revision`, `location`.  
- **Pozorovanie:** hodnoty `revision` majú rôzne formáty dátumov (a časť nevieme správne parsovať) → potrebná normalizácia na jednotný formát `datetime`.

#### 📊 Súbor observation.csv
- **Počet záznamov:** 12 081  
- **Počet atribútov:** 23  
- **Typy dát:** všetky `float64`  
- **Chýbajúce hodnoty:** 0  
- **Cieľová premenná:** `oximetry` (binárna 0/1).  
- **Dôležité atribúty:** `SpO₂`, `HR`, `Skin Temperature`, `BP`, `CO`, `FiO₂`, atď.  
- Hodnoty SpO₂ sú v rozsahu 95 – 100 a Skin Temperature v rozsahu 33 – 38 °C.  
- Tieto dáta majú vhodný formát pre ďalšie spracovanie v Python/pandas a na trénovanie modelov.


In [None]:
def dtype_counts(df):
    """Count how many columns of ``df`` have each dtype.

    Returns a dict mapping the dtype name (as a string, e.g. ``"float64"``)
    to the number of columns with that dtype.
    """
    dtype_names = df.dtypes.astype(str)
    tally = dtype_names.value_counts()
    return tally.to_dict()

# Per-table overview: shape, number of columns per dtype, and the total
# count of missing cells. Displayed as the cell's output.
summ = {
    label: {
        "shape": frame.shape,
        "dtype_counts": dtype_counts(frame),
        "missing_total": int(frame.isna().sum().sum()),
    }
    for label, frame in (
        ("patient", patient),
        ("station", station),
        ("observation", observation),
    )
}
summ


#### Analýza chýbajúcich hodnôt (EDA)


In [None]:
# Percentage of missing values per patient column, largest first.
missing_count = patient.isna().sum()
missing_share = missing_count / len(patient) * 100
missing_pct = missing_share.round(2).sort_values(ascending=False)
missing_pct.head(10)

#### Z výpočtu percenta chýbajúcich hodnôt vidíme, že niektoré atribúty obsahujú výrazný počet prázdnych záznamov:

| Atribút | Podiel chýbajúcich hodnôt |
|:--|:--:|
| `residence` | 100 % |
| `job` | ≈ 70 % |
| `address` | ≈ 15 % |
| `current_location` | ≈ 5 % |

#### 🔗 Vzťahy medzi súbormi
| Vzťah | Typ väzby | Popis |
|:--|:--:|:--|
| observation ↔ station | 1 : N | Každé meranie má priradenú stanicu (100 % zhoda cez `latitude`, `longitude`). |
| patient ↔ station | ? | Nepodarilo sa spárovať cez `current_location`, zhoda 0 %. |
| patient ↔ observation | – | Chýba priame prepojenie (pacient ID v meraniach neexistuje). |

In [None]:
# Match observations to stations by coordinates. Both sides are coerced to
# numbers (unparseable values become NaN) and rounded to 4 decimals so that
# tiny representation differences do not prevent an exact key match.
sta_lat = pd.to_numeric(station["latitude"], errors="coerce").round(4)
sta_lon = pd.to_numeric(station["longitude"], errors="coerce").round(4)
obs_lat = pd.to_numeric(observation["latitude"], errors="coerce").round(4)
obs_lon = pd.to_numeric(observation["longitude"], errors="coerce").round(4)

# Set of station (lat, lon) keys for O(1) membership tests.
station_key = set(zip(sta_lat, sta_lon))
obs_key = list(zip(obs_lat, obs_lon))

# Only observations with both coordinates present can be matched; compute
# that subset once and reuse it for both counters.
obs_valid_keys = [k for k in obs_key if not (pd.isna(k[0]) or pd.isna(k[1]))]
obs_valid = len(obs_valid_keys)
obs_match = sum(k in station_key for k in obs_valid_keys)

# Guard the ratio so an observation table without usable coordinates reports
# 0 instead of raising ZeroDivisionError (same convention as the patient check).
obs_ratio = obs_match / obs_valid if obs_valid else 0
print(f"observation ‚Üí station match by coords: {obs_match}/{obs_valid} ({obs_ratio:.3f})")

# patient ↔ station:
import re
def parse_current_location(s):
    """Extract a pair of floats from a ``current_location`` string.

    The string is expected to contain exactly two ``Decimal('<number>')``
    tokens, e.g. ``"(Decimal('48.1486'), Decimal('17.1077'))"``. The first
    number is returned first and the second one second (presumably latitude
    then longitude - TODO confirm against the data).

    Parameters
    ----------
    s : object
        Raw cell value. Anything that is not a ``str`` (such as NaN for a
        missing cell) is treated as missing.

    Returns
    -------
    tuple of (float, float)
        The two parsed numbers, or ``(nan, nan)`` when ``s`` is not a string
        or does not contain exactly two ``Decimal('...')`` tokens.
    """
    if not isinstance(s, str):
        return (np.nan, np.nan)
    # The pattern must use single backslashes: inside a raw string a doubled
    # backslash makes the regex look for a literal backslash character, so
    # the previous pattern never matched and every row came back as NaN.
    nums = re.findall(r"Decimal\('([-+]?\d*\.?\d+)'\)", s)
    if len(nums) != 2:
        return (np.nan, np.nan)
    return (float(nums[0]), float(nums[1]))

# Parse each patient's current_location into a coordinate pair, then split
# the pairs into two arrays rounded to 4 decimals (same precision as the
# station keys).
latlon = patient["current_location"].apply(parse_current_location)
raw_lat, raw_lon = [], []
for lat_value, lon_value in latlon:
    raw_lat.append(lat_value)
    raw_lon.append(lon_value)
pat_lat = pd.to_numeric(raw_lat, errors="coerce").round(4)
pat_lon = pd.to_numeric(raw_lon, errors="coerce").round(4)

# Patients with a usable (finite) first coordinate.
pat_valid = np.isfinite(pat_lat).sum()

# Count patients whose complete coordinate pair equals a station's pair.
pat_match = 0
for key in zip(pat_lat, pat_lon):
    if pd.isna(key[0]) or pd.isna(key[1]):
        continue
    if key in station_key:
        pat_match += 1

pat_ratio = pat_match / pat_valid if pat_valid else 0
print(f"patient ‚Üí station match by coords: {pat_match}/{pat_valid} ({pat_ratio:.3f})")


#### 📈 Vizualizácia (EDA)
Histogramy rozloženia hodnôt:
- **SpO₂** → zväčša v rozmedzí 97 – 99 %  
- **Skin Temperature** → rozloženie 33 – 38 °C s vrcholom okolo 36 °C  

Tieto vizualizácie potvrdzujú, že dáta majú správny a očakávaný fyziologický rozsah.


In [None]:
import matplotlib.pyplot as plt

# One 30-bin histogram per column: (column key, figure title, x-axis label).
# 1) SpO2, 2) Skin Temperature
histogram_specs = (
    ("SpO‚ÇÇ", "Rozlo≈æenie SpO‚ÇÇ", "SpO‚ÇÇ"),
    ("Skin Temperature", "Rozlo≈æenie Skin Temperature", "¬∞C"),
)
for column, title, x_label in histogram_specs:
    fig, ax = plt.subplots()
    observation[column].hist(bins=30, edgecolor="black", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Frekvencia")
    plt.show()


### 🧩 Zhrnutie zistení
- Dáta majú zrozumiteľnú štruktúru a sú v správnom formáte na ďalšie spracovanie.  
- Najviac problémov má súbor `patient.csv` → mnoho chýbajúcich hodnôt.  
- Súbor `station.csv` má nekonzistentné formáty v atribúte `revision`.  
- Súbor `observation.csv` je čistý a vhodný na modelovanie (strojové učenie).  
- Vzťah `observation` ↔ `station` funguje perfektne, ale prepojenie pacientov chýba.  


In [None]:
# Columns inspected in this cell and in the histogram cell that follows.
rev, skin_temp, spo = (
    station["revision"],
    observation["Skin Temperature"],
    observation["SpO‚ÇÇ"],
)

# Summary statistics for the station revision column and for SpO2.
for heading, series in (("REV \n", rev), ("SPO \n", spo)):
    print(heading, series.describe(include="all"))

# NOTE(review): earlier experiments in this cell (value-count / line plots of
# skin_temp and duplicate-row checks on station and patient) were commented
# out and never executed; the station check referenced an undefined `dups`.
# Re-introduce them as working code in their own cell if they are needed.

In [None]:
# 60-bin histograms of skin temperature and SpO2, one figure each.
for values in (skin_temp, spo):
    fig, ax = plt.subplots()
    ax.hist(values, bins=60, edgecolor="black")
    plt.show()

## (B-1b)

In [None]:
# Short aliases for the observation columns plotted in the cells below.
# The order of the names on the left matches the column keys on the right.
(skin_temp, spo, hr, pi, rr, prv, bp, pvi, sv, co) = (
    observation[column]
    for column in (
        "Skin Temperature",
        "SpO‚ÇÇ",
        "HR",
        "PI",
        "RR",
        "PRV",
        "BP",
        "PVI",
        "SV",
        "CO",
    )
)

In [None]:
# Descriptive statistics for the attributes analysed in this section.
cols = [
    "Skin Temperature",
    "SpO‚ÇÇ",
    "HR",
    "PI",
    "RR",
    "PRV",
    "BP",
    "PVI",
    "SV",
    "CO",
]
observation.loc[:, cols].describe()

In [None]:
# Distribution of the Skin Temperature column (60 bins).
fig, ax = plt.subplots()
ax.hist(skin_temp, bins=60, edgecolor="black")
ax.set_xlabel("Skin Temperature ¬∞C")
plt.show()


The temperature values lie in the range 33–38 °C (mean 35.9 °C, std 0.84).
This indicates that the measurements are within the normal range and do not vary significantly.

In [None]:
# Distribution of the SpO2 column (60 bins).
fig, ax = plt.subplots()
ax.hist(spo, bins=60, edgecolor="black")
ax.set_xlabel("SpO‚ÇÇ %")
plt.show()

In [None]:
# Distribution of the HR column (60 bins).
fig, ax = plt.subplots()
ax.hist(hr, bins=60, edgecolor="black")
ax.set_xlabel("HR bpm")
plt.show()

In [None]:
# Distribution of the PI column (60 bins).
fig, ax = plt.subplots()
ax.hist(pi, bins=60, edgecolor="black")
ax.set_xlabel("PI %")
plt.show()

In [None]:
# Distribution of the RR column (60 bins).
fig, ax = plt.subplots()
ax.hist(rr, bins=60, edgecolor="black")
ax.set_xlabel("RR")
plt.show()

In [None]:
# Box plot of the PRV column to expose its spread and outliers.
fig, ax = plt.subplots()
ax.boxplot(prv)
ax.set_ylabel("PRV ms")
plt.show()

Pulse rate variability (PRV) values lie in the range 20–200 ms (mean 117.62, std 21.83).
This indicates that the average value is within the normal range, but some values are widely scattered.

In [None]:
# Distribution of the BP column (60 bins).
fig, ax = plt.subplots()
ax.hist(bp, bins=60, edgecolor="black")
ax.set_xlabel("BP")
plt.show()

In [None]:
# Distribution of the PVI column (60 bins).
fig, ax = plt.subplots()
ax.hist(pvi, bins=60, edgecolor="black")
ax.set_xlabel("pvi")
plt.show()

In [None]:
# Distribution of the SV column (60 bins).
fig, ax = plt.subplots()
ax.hist(sv, bins=60, edgecolor="black")
ax.set_xlabel("sv")
plt.show()

In [None]:
# Distribution of the CO column (60 bins).
fig, ax = plt.subplots()
ax.hist(co, bins=60, edgecolor="black")
ax.set_xlabel("co")
plt.show()

## (C-1)

In [None]:
# Show the patient table's column index for reference.
patient_columns = patient.columns
print(patient_columns)

In [None]:
# Pairwise correlation matrix over the numeric observation columns,
# rendered as an annotated heatmap.
numeric = observation.select_dtypes(include=['number'])
corr = numeric.corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Korelaƒçn√° matica fyziologick√Ωch atrib√∫tov (observation.csv)")
plt.show()

### Interpretácia výsledkov párovej analýzy dát

Z korelačnej matice (obrázok vyššie) je možné identifikovať viacero významných vzťahov medzi fyziologickými atribútmi:

- **CO a HR (r = 0.76)** – veľmi silná pozitívna korelácia, ktorá zodpovedá očakávanej závislosti medzi srdcovou frekvenciou a srdcovým výdajom.  
- **PVI a Blood Flow Index (r = 0.67)** – silný vzťah medzi variabilitou perfúzie a prietokom krvi.  
- **Oximetry a PVI (r = 0.67)** – saturácia kyslíkom úzko súvisí s variabilitou perfúzie.  
- **Skin Temperature a PI (r = −0.49)** – negatívna korelácia; s rastúcou teplotou kože klesá perfúzny index.  
- **Skin Temperature a Oximetry (r = 0.37)** – mierna pozitívna závislosť, naznačuje možné prepojenie medzi periférnou teplotou a saturáciou.  
- Väčšina ostatných atribútov (napr. `latitude`, `longitude`, `SNR`, `PRV`) nevykazuje štatisticky významné lineárne vzťahy.

Tieto zistenia poukazujú na fyziologické súvislosti medzi vybranými premennými
a pomáhajú určiť, ktoré atribúty môžu byť relevantné pri budúcom modelovaní
a predikcii cieľovej premennej `oximetry`.


In [None]:
# Rank attribute pairs by correlation. The correlation matrix is symmetric
# with ones on the diagonal, so only its strict upper triangle (k=1) is kept:
# this removes self-correlations and lists every pair exactly once. Without
# it each pair appears twice and the "top 10" holds only 5 distinct pairs.
# Masking by position (rather than by a +/-0.999 value cutoff) also keeps
# genuinely near-perfect correlations between two different attributes.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = (
    corr.where(upper_triangle)   # everything outside the upper triangle -> NaN
    .unstack()                   # Series indexed by the pair of attribute names
    .dropna()                    # drop masked cells and undefined correlations
    .sort_values(ascending=False)
)
print("üîù Top 10 korel√°ci√≠ medzi atrib√∫tmi:\n")
print(corr_pairs.head(10))

In [None]:
# Attribute pairs (x column, y column) to visualise as scatter plots.
pairs_to_plot = [
    ("CO", "HR"),
    ("oximetry", "PVI"),
    ("Skin Temperature", "oximetry"),
]

# One 6x4 figure per pair; points are semi-transparent to show overlap.
for x_col, y_col in pairs_to_plot:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=observation, x=x_col, y=y_col, alpha=0.6, ax=ax)
    ax.set_title(f"Vz≈•ah medzi {x_col} a {y_col}")
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    plt.show()

### Interpretácia výsledkov

Top 10 korelácií ukazuje, že:

- **CO a HR (r = 0.76)** – silná lineárna závislosť, vyššia srdcová frekvencia znamená väčší srdcový výdaj.  
- **Oximetry a PVI (r = 0.67)** – saturácia kyslíkom úzko súvisí s variabilitou perfúzie.  
- **Skin Temperature a Oximetry (r = 0.37)** – mierna pozitívna korelácia, naznačuje vplyv periférnej teploty na saturáciu.  
- **EtCO₂ a PI (r = 0.31)** – slabší, ale viditeľný pozitívny vzťah.  

Tieto výsledky naznačujú, ktoré premenné môžu mať najväčší význam pri budúcej predikcii
cieľovej premennej `oximetry`.


## (D-1b)

In [None]:
# Correlation of every observation column with the target `oximetry`,
# shown in ascending order as the cell's output.
corr = observation.corr(numeric_only=False)
target_corr = corr.loc[:, 'oximetry']
target_corr.sort_values()

In [None]:
# Scatter plot of PVI against the target column.
ax = sns.scatterplot(data=observation, x='PVI', y='oximetry')
ax.set_xlabel("PVI %")
ax.set_ylabel("oximetry")

In [None]:
# Scatter plot of Skin Temperature against the target column.
ax = sns.scatterplot(data=observation, x='Skin Temperature', y='oximetry')
ax.set_xlabel("Skin Temperature ¬∞C")
ax.set_ylabel("oximetry")

In [None]:
# Scatter plot of EtCO2 against the target column.
ax = sns.scatterplot(data=observation, x='EtCO‚ÇÇ', y='oximetry')
ax.set_xlabel("EtCO‚ÇÇ mmHg")
ax.set_ylabel("oximetry")

## (A-2b)