In [None]:
import json
import pickle as pkl
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Volume Analysis
Find mean volume values across sexes and age groups

## Fit and save the regression model
Predicted variable: **Volume**
Independent variables:
+ BMI
+ Sex
+ Age
+ Weight

In [None]:
def remove_outliers_zscore(df, column="volume", threshold=3, stats=None):
    """
    Splits a dataframe into inliers, high outliers and low outliers using the Z-score method.

    The input dataframe is left untouched: Z-scores are computed in a separate
    Series instead of being written into `df`.

    Parameters:
    df (pd.DataFrame): The input dataframe.
    column (str): The name of the column to check for outliers.
    threshold (float): The Z-score threshold to identify outliers (default is 3).
    stats (dict, optional): Reference statistics with keys 'mean' and 'std'.
        When omitted (or empty), the mean and standard deviation of
        `df[column]` are used instead.

    Returns:
    pd.DataFrame: Rows with |Z-score| <= threshold (outliers removed).
    pd.DataFrame: Rows with Z-score > threshold (high outliers).
    pd.DataFrame: Rows with Z-score < -threshold (low outliers).

    Rows whose Z-score is NaN (for example a missing value in `column`) satisfy
    none of the three comparisons and therefore appear in none of the outputs.
    """

    # Pick the reference statistics: caller-supplied ones (e.g. saved from the
    # dataset a model was fitted on) or the statistics of this dataframe.
    if stats:
        mean_value = stats['mean']
        std_value = stats['std']
    else:
        mean_value = df[column].mean()
        std_value = df[column].std()

    # Keep the Z-scores outside of `df` so the caller's dataframe is not
    # mutated (and no SettingWithCopyWarning is raised for derived frames).
    z_score = (df[column] - mean_value) / std_value

    # .copy() hands back independent frames that callers can modify freely.
    df_cleaned = df[z_score.abs() <= threshold].copy()
    df_large = df[z_score > threshold].copy()
    df_small = df[z_score < -threshold].copy()

    return df_cleaned, df_large, df_small

In [None]:
# tmp file for this job: metadata.csv from run_4_v2: data/dummy_regression.csv
df = pd.read_csv('../data/metadata/dummy_regression.csv')

# Drop every row that has a missing value in any column. The explicit .copy()
# gives df_clean its own data, so the column assignments below are plain
# writes rather than chained assignments on a derived frame.
df_clean = df.dropna().copy()
# NOTE(review): the formula presumes Weight in kg and Height in m — confirm units.
df_clean["BMI"] = df_clean["Weight"] / df_clean["Height"] ** 2

# Fit the regression model: volume ~ const + Age + Sex + Weight + BMI
X = df_clean[['Age', 'Sex', 'Weight', 'BMI']]
X = sm.add_constant(X)  # Add intercept
y = df_clean['volume']
model = sm.OLS(y, X).fit()
print(model.summary())

# Save regression model
with open('../data/regression_models/lm_volume.pkl', 'wb') as f:
    pkl.dump(model, f)

# In-sample predictions and residuals (observed - predicted)
df_clean['predicted_volume'] = model.predict(X)
df_clean['residuals'] = df_clean["volume"] - df_clean['predicted_volume']

# Save the residual mean/std so other datasets can be z-scored against the
# same reference distribution; float() keeps the JSON values plain numbers.
stats = {
    'mean': float(df_clean['residuals'].mean()),
    'std': float(df_clean['residuals'].std())
}
with open('../data/regression_models/lm_residual_stats.json', 'w') as f:
    json.dump(stats, f)

## Load the saved linear model and filter data based on its residuals

In [None]:
# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that were produced by this project.
with open('../data/regression_models/lm_volume.pkl', 'rb') as f:
    model = pkl.load(f)
print(model.summary())

# Reference mean/std of the residuals, saved when the model was fitted
with open('../data/regression_models/lm_residual_stats.json', 'r') as f:
    stats = json.load(f)
# === Read data based on dataset_id
# df_clean = pd.read_csv(...)
# NOTE(review): until the read above is filled in, df_clean is whatever the
# fit cell left in the kernel; it must provide Age, Sex, Weight, BMI and volume.

# has_constant='add' always appends the intercept column. The default ('skip')
# would omit it whenever a predictor is constant in this dataset (e.g. a
# single-sex subset), leaving X with fewer columns than the model expects.
X = df_clean[['Age', 'Sex', 'Weight', 'BMI']]
X = sm.add_constant(X, has_constant='add')

# filter based on volume
# .assign builds a new frame instead of writing columns into df_clean in place.
predicted_volume = model.predict(X)
df_clean = df_clean.assign(
    predicted_volume=predicted_volume,
    residuals=df_clean["volume"] - predicted_volume,
)

# Z-score the residuals against the saved reference stats and split the rows
df_clean, df_large, df_small = remove_outliers_zscore(df_clean, column="residuals", threshold=3, stats=stats)

In [None]:
# High outliers: rows whose residual Z-score is above the threshold
df_large

In [None]:
# Low outliers: rows whose residual Z-score is below the negative threshold
df_small