In [1]:
import numpy as np
from pathlib import Path
import polars as pl
import sys

# current working directory
cwd = Path.cwd()

# parent directory (where you expect 'src/' might live)
parent_dir = cwd.parent

# Convert to string before appending to sys.path
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))


from src.models.bayes_model.bayesian_data import compute_frequency_means, compute_frequency_ear_means, get_covariance_matrix, get_covariance_ear_matrix

In [2]:
data_path = Path("../data/anonymized_cleaned_data.parquet")

df = pl.read_parquet(
    data_path,  
    )
print(df.shape)

print(df.head(10))


(1021671, 6)
shape: (10, 6)
┌────────────────┬───────┬────────┬───────────┬─────┬───────┐
│ Audiogram_Date ┆ Value ┆ Region ┆ Frequency ┆ Ear ┆ pt_ID │
│ ---            ┆ ---   ┆ ---    ┆ ---       ┆ --- ┆ ---   │
│ date           ┆ i32   ┆ str    ┆ i32       ┆ str ┆ str   │
╞════════════════╪═══════╪════════╪═══════════╪═════╪═══════╡
│ 2019-01-18     ┆ 65    ┆ AUL    ┆ 250       ┆ L   ┆ 00001 │
│ 2019-01-18     ┆ 60    ┆ AUL    ┆ 500       ┆ L   ┆ 00001 │
│ 2019-01-18     ┆ 65    ┆ AUL    ┆ 1000      ┆ L   ┆ 00001 │
│ 2019-01-18     ┆ 55    ┆ AUL    ┆ 2000      ┆ L   ┆ 00001 │
│ 2019-01-18     ┆ 65    ┆ AUL    ┆ 4000      ┆ L   ┆ 00001 │
│ 2019-01-18     ┆ 65    ┆ AUL    ┆ 8000      ┆ L   ┆ 00001 │
│ 2019-01-18     ┆ 60    ┆ AUR    ┆ 250       ┆ R   ┆ 00001 │
│ 2019-01-18     ┆ 60    ┆ AUR    ┆ 500       ┆ R   ┆ 00001 │
│ 2019-01-18     ┆ 60    ┆ AUR    ┆ 1000      ┆ R   ┆ 00001 │
│ 2019-01-18     ┆ 50    ┆ AUR    ┆ 2000      ┆ R   ┆ 00001 │
└────────────────┴───────┴────────┴───────

In [3]:
# min, max, mean, std, median, 25%, 50%, 75% for Value by Frequency
summary = (
    df.group_by(["Frequency"])
    .agg(
        pl.min("Value").alias("min"),
        pl.max("Value").alias("max"),
        pl.mean("Value").alias("mean"),
        pl.std("Value").alias("std"),
        pl.median("Value").alias("median"),
    ).sort("std")
)

summary

Frequency,min,max,mean,std,median
i32,i32,i32,f64,f64,f64
250,-10,115,17.229425,17.740259,10.0
500,-10,120,22.412403,19.092871,15.0
750,-10,120,27.960675,20.007998,25.0
1000,-10,120,23.428288,21.205845,15.0
1500,-10,120,32.177011,21.918067,30.0
…,…,…,…,…,…
3000,-10,120,36.134339,24.190959,35.0
12000,-5,100,36.75876,24.388052,30.0
4000,-10,120,35.707011,25.820671,35.0
6000,-10,120,47.509009,27.917762,45.0


In [4]:
# df of the means of the frequencies
df_means = (
    df.group_by("Frequency")
    .agg(
        pl.col("Value").mean().alias("Mean"),
    )
    .sort("Frequency")
)

df_means.head(10)

Frequency,Mean
i32,f64
250,17.229425
500,22.412403
750,27.960675
1000,23.428288
1500,32.177011
2000,29.765317
3000,36.134339
4000,35.707011
6000,47.509009
8000,47.485036


In [5]:
means = compute_frequency_means(df)

print(means)

[17.22942487 22.41240264 23.4282879  29.76531678 35.70701131 47.48503616]


In [6]:
# df of the means of the frequencies by ear
df_means = (
    df.group_by(["Frequency", "Ear"])
    .agg(
        pl.col("Value").mean().alias("Mean"),
    )
    .sort("Frequency")
)

df_means.head(10)

Frequency,Ear,Mean
i32,str,f64
250,"""L""",17.423064
250,"""R""",17.03572
500,"""R""",22.207113
500,"""L""",22.617829
750,"""L""",28.208511
750,"""R""",27.708976
1000,"""R""",23.184485
1000,"""L""",23.672023
1500,"""R""",31.752519
1500,"""L""",32.591616


In [7]:
means_ear = compute_frequency_ear_means(df)

print(means_ear)

[17.42306374 17.03571961 22.61782892 22.20711297 23.67202301 23.18448536
 30.33091908 29.19593821 36.46905178 34.93554267 48.15445128 46.81536024]


In [8]:
cov_matrix = get_covariance_matrix(df)

In [9]:
cov_matrix

array([[650.25072456, 294.19446915, 227.16872779, 673.60512091,
        371.39169988, 509.77115608],
       [294.19446915, 319.56538467, 258.663569  , 326.02268479,
        327.31899996, 321.71942754],
       [227.16872779, 258.663569  , 251.57386271, 246.99178732,
        261.54783011, 249.16179364],
       [673.60512091, 326.02268479, 246.99178732, 940.35360749,
        399.03256286, 539.38661648],
       [371.39169988, 327.31899996, 261.54783011, 399.03256286,
        402.67815564, 405.18339948],
       [509.77115608, 321.71942754, 249.16179364, 539.38661648,
        405.18339948, 533.69703649]])

In [10]:
cov_ear_matrix = get_covariance_ear_matrix(df)
cov_ear_matrix

array([[247.79255253, 135.93165188, 255.29611162, 143.33840997,
        257.61238518, 149.68553262, 244.45669881, 148.66874321,
        221.73673292, 142.82137388, 241.92502998, 159.63394045],
       [135.93165188, 245.73380136, 143.46382652, 251.87582018,
        150.15060514, 254.23214294, 148.34296727, 243.06245539,
        139.61681162, 222.24777055, 153.0947855 , 243.78270035],
       [255.29611162, 143.46382652, 316.53634154, 186.09202244,
        324.2051147 , 199.33804325, 317.4506557 , 207.38749405,
        288.90872998, 198.92857034, 320.88131799, 227.21472316],
       [143.33840997, 251.87582018, 186.09202244, 311.33405643,
        200.51773173, 317.59936662, 207.16526396, 313.64398321,
        195.70721676, 287.79426441, 220.54180292, 322.17889044],
       [257.61238518, 150.15060514, 324.2051147 , 200.51773173,
        401.07029851, 257.01307254, 402.03762624, 277.08718007,
        366.8168312 , 266.08316877, 394.47993436, 292.97529785],
       [149.68553262, 254.23214294,

In [11]:
is_symmetric = np.allclose(cov_matrix, cov_matrix.T)
print("Matrix Symmetric:", is_symmetric)

Matrix Symmetric: True


In [15]:
# check if a number is even including 0
0 % 2 == 0

True