This notebook builds the X and Y files by applying the same steps as `3_model_data.py`, but without removing any columns. The resulting files are intended for cross-dataset testing, e.g. evaluating the politics model on the crypto data.

In [31]:
import pandas as pd
import numpy as np
import os

In [32]:
# Name of the target column; it is (optionally log-transformed and) split out
# of each DataFrame into the Y files written further down.
Y_COL = "thread_size"

In [33]:

def log_vals(y: pd.Series):
    """
    Log-transform a target series, picking the variant from its minimum.

    Parameters
    ----------
    y : pd.Series
        Target variable values.

    Returns
    -------
    pd.Series or None
        ``None`` when the series contains a negative value,
        ``log(y + 1)`` when the minimum lies in ``[0, 1)``,
        otherwise ``log(y)``.
    """
    lowest = y.min()

    # Negative values cannot be log-transformed; signal that to the caller.
    if lowest < 0:
        print("[INFO] Values are negative, not taking log.")
        return None

    # Shift by one so that zeros (and other values below 1) stay non-negative.
    if lowest < 1:
        print("[INFO] Min value < 1, taking log(y+1)")
        return np.log(y + 1)

    print("[INFO] Taking log(y)")
    return np.log(y)


In [34]:
# Directory that receives the generated X/Y parquet files.
outdir = "../../Outputs/3_trial_models/0_preprocessing"

# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(outdir, exist_ok=True)


In [40]:
# Build and save the X / Y parquet files for every dataset.
# The test-split feature frames are also kept in `x_test_dfs` for inspection.
x_test_dfs = {}
for sub in ['conspiracy', 'crypto', 'politics']:
    preprocessing_dir = f"../../Outputs/0_preprocessing/{sub}"

    # Load both splits first so the feature engineering below runs exactly once
    # and never depends on the order in which the splits were read.
    data = {
        split: pd.read_parquet(
            f"{preprocessing_dir}/tf-idf/{sub}_svd_enriched_{split}_data.parquet"
        )
        for split in ["train", "test"]
    }

    # Frequency-encode the categorical columns. Frequencies are estimated on the
    # training split only; values unseen in train map to 0.0 in either split.
    for col in ["author", "domain"]:
        freq = data["train"][col].astype(str).value_counts(normalize=True)
        for df in data.values():
            df[f"{col}_freq"] = (
                df[col].astype(str).map(freq).fillna(0.0).astype("float32")
            )

    # Decide on the target transform ONCE, from the values of all splits, so
    # that train and test always receive the same transform (log(y), log(y+1)
    # or none). Calling log_vals per split could pick different transforms, or
    # leave one split without the log column and fail when selecting it.
    y_all = pd.concat([df[Y_COL] for df in data.values()], ignore_index=True)
    log_y_all = log_vals(y_all)

    y_col = Y_COL
    if log_y_all is not None:
        y_col = f"log_{Y_COL}"
        log_values = log_y_all.to_numpy()
        # Hand the transformed values back to each split positionally, in the
        # same order they were concatenated above.
        start = 0
        for df in data.values():
            stop = start + len(df)
            df[y_col] = log_values[start:stop]
            start = stop

    # Keep every feature column, but drop the target in both its raw and
    # log-transformed form so the X files cannot leak the target.
    target_cols = {Y_COL, y_col}
    x_dfs = {}
    y_dfs = {}
    for split, df in data.items():
        x_dfs[split] = df[[c for c in df.columns if c not in target_cols]]
        y_dfs[split] = df[[y_col]]

    x_test_dfs[sub] = x_dfs['test']
    for split, x_df in x_dfs.items():
        x_out = f"{outdir}/{sub}_{split}_X.parquet"
        y_out = f"{outdir}/{sub}_{split}_Y.parquet"
        print(f"[INFO] Saving {split} dfs to\n{x_out}\n{y_out}")
        x_df.to_parquet(x_out)
        y_dfs[split].to_parquet(y_out)

[INFO] Taking log(y)
[INFO] Taking log(y)
[INFO] Saving train dfs to
../../Outputs/3_trial_models/0_preprocessing/conspiracy_train_X.parquet
../../Outputs/3_trial_models/0_preprocessing/conspiracy_train_Y.parquet
[INFO] Saving test dfs to
../../Outputs/3_trial_models/0_preprocessing/conspiracy_test_X.parquet
../../Outputs/3_trial_models/0_preprocessing/conspiracy_test_Y.parquet
[INFO] Taking log(y)
[INFO] Taking log(y)
[INFO] Saving train dfs to
../../Outputs/3_trial_models/0_preprocessing/crypto_train_X.parquet
../../Outputs/3_trial_models/0_preprocessing/crypto_train_Y.parquet
[INFO] Saving test dfs to
../../Outputs/3_trial_models/0_preprocessing/crypto_test_X.parquet
../../Outputs/3_trial_models/0_preprocessing/crypto_test_Y.parquet
[INFO] Taking log(y)
[INFO] Taking log(y)
[INFO] Saving train dfs to
../../Outputs/3_trial_models/0_preprocessing/politics_train_X.parquet
../../Outputs/3_trial_models/0_preprocessing/politics_train_Y.parquet
[INFO] Saving test dfs to
../../Outputs/3_tri

In [41]:
# List the crypto test-feature columns whose name contains "author".
[name for name in x_test_dfs['crypto'].columns if "author" in name]

['authors', 'author', 'author_freq']