In [0]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import yaml
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# Custom funcs
from data_pull.loaders import (
    load_user_table,
    load_task_complete_table,
    load_respondent_info_table,
    load_task_table,
    load_all_wonky_studies
)
from data_pull.joiners import (
    join_user_task_respondent,
    join_wonky_balance_with_task,
    merge_wonky_data_with_user_info
)
from data_pull.aggregators import (
    enrich_user_info_with_task_counts,
    union_wonky_study_dataframes,
    aggregate_wonky_respondent_summary,
    create_wonky_respondent_summary,
    calculate_wonky_task_ratio
)

# Configs
# Both YAML files are opened with an explicit UTF-8 encoding so that parsing
# does not depend on the platform's default locale encoding. The paths are
# relative to the notebook's current working directory.
with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

with open('../configs/wonky_studies.yaml', 'r', encoding='utf-8') as f:
    wonky_config = yaml.safe_load(f)

print("Imports and configs loaded successfully")


In [0]:
# Bind the session explicitly instead of relying on a `spark` global injected
# by the notebook runtime; getOrCreate() hands back the already-active session
# when one exists, so this is a no-op on a managed cluster.
spark = SparkSession.builder.getOrCreate()

# Start from a clean slate: remove all cached tables from the in-memory cache.
spark.catalog.clearCache()
print("Cache cleared")

In [0]:
# Pull the shared config values once so the loader calls and the log lines
# below all read from the same place.
silver_root = paths_config['silver_path']
cfg_filters = paths_config['filters']
country_filter = cfg_filters['country']
min_date_filter = cfg_filters['min_date']
task_origin_filter = cfg_filters['task_origin']

# Users, restricted to the configured country.
s_u = load_user_table(spark, silver_root, country=country_filter)

# Completed tasks, restricted by minimum date and task origin.
s_tc_odr = load_task_complete_table(
    spark,
    silver_root,
    min_date=min_date_filter,
    task_origin=task_origin_filter,
)

# Respondent info, restricted to the configured country.
s_ri = load_respondent_info_table(spark, silver_root, country=country_filter)

# Each .count() below is a Spark action and triggers a job.
print(f"s_u (users {country_filter}) count: {s_u.count():,}")
print(f"s_tc_odr (tasks completed >= {min_date_filter}, origin={task_origin_filter}) count: {s_tc_odr.count():,}")
print(f"s_ri (respondent_info {country_filter}) count: {s_ri.count():,}")


In [0]:
# Combine the user, completed-task and respondent-info frames into one frame.
user_info = join_user_task_respondent(s_u, s_tc_odr, s_ri)

# Row count of the joined frame.
joined_row_count = user_info.count()
print(f"\nAfter INNER joins count: {joined_row_count:,}")

# Number of distinct respondentPk values that survived the join.
joined_respondent_count = user_info.select('respondentPk').distinct().count()
print(f"Unique respondents in joined data: {joined_respondent_count:,}")

In [0]:
# Enrich the joined frame and mark it for caching; cache() returns the same
# DataFrame, so chaining it keeps the original binding semantics.
user_info_enriched = enrich_user_info_with_task_counts(user_info).cache()

enriched_row_count = user_info_enriched.count()
print(f"Task-level records (rows): {enriched_row_count:,}")

enriched_respondent_count = user_info_enriched.select('respondentPk').distinct().count()
print(f"Unique respondents: {enriched_respondent_count:,}")

In [0]:
# Collect the enriched Spark frame to the driver as a pandas DataFrame.
user_info_df = user_info_enriched.toPandas()

# Compute the validation figures once instead of re-deriving them per print.
n_task_rows = len(user_info_df)
n_respondents = user_info_df['respondentPk'].nunique()

# Guard the ratio: with no (non-null) respondents the original division raised
# ZeroDivisionError; report NaN instead so the cell still completes.
avg_tasks_per_respondent = (
    n_task_rows / n_respondents if n_respondents else float("nan")
)

print("\nHierarchical Structure Validation:")
print(f"  - Task-level rows: {n_task_rows:,}")
print(f"  - Unique respondents: {n_respondents:,}")
print(f"  - Avg tasks per respondent: {avg_tasks_per_respondent:.2f}")

In [0]:
# Study UUIDs to load, as listed in the wonky-studies config.
wonky_study_uuids = wonky_config['wonky_study_uuids']

# Returns the frames that loaded plus the UUIDs that did not.
balance_dfs, failed_uuids = load_all_wonky_studies(
    spark,
    wonky_study_uuids,
    base_path=paths_config['project_repository_path'],
    cols_to_include=wonky_config['cols_to_include_subset'],
    verbose=True,
)

loaded_count = len(balance_dfs)
requested_count = len(wonky_study_uuids)
print(f"\nSuccessfully loaded {loaded_count} out of {requested_count} studies")

# Only report failures when there are any.
if failed_uuids:
    failed_count = len(failed_uuids)
    print(f"Failed UUIDs ({failed_count}): {failed_uuids}")


In [0]:
# Task table used to attach task information to every wonky-study frame.
task = load_task_table(spark, paths_config['silver_path'])

# Join each study's balance frame with the task table, preserving input order.
wonky_dfs_joined = [
    join_wonky_balance_with_task(study_balance_df, task)
    for study_balance_df in balance_dfs
]

# Stack the per-study frames, then collect the result to pandas.
wonky_spark = union_wonky_study_dataframes(wonky_dfs_joined)
wonky_map = wonky_spark.toPandas()

wonky_respondent_count = wonky_map['respondent_pk'].nunique()
wonky_task_count = wonky_map['task_pk'].nunique()

print(f"Wonky map shape: {wonky_map.shape}")
print(f"Unique respondents in wonky studies: {wonky_respondent_count:,}")
print(f"Unique tasks in wonky studies: {wonky_task_count:,}")


In [0]:
# Summarise the wonky-study map, keyed on the respondent_pk column.
wonky_respondent_summary = create_wonky_respondent_summary(
    wonky_map,
    respondent_id_col="respondent_pk",
)

# Non-null taskPk count per respondentPk, exposed as `task_completed`.
task_completed = (
    user_info_df
    .groupby('respondentPk')['taskPk']
    .count()
    .rename('task_completed')
    .reset_index()
)

# Combine the per-respondent totals with the wonky summary.
wonky_counts = calculate_wonky_task_ratio(task_completed, wonky_respondent_summary)

# Column means reported in the summary lines below.
mean_total_tasks = wonky_counts['task_completed'].mean()
mean_wonky_tasks = wonky_counts['wonky_task_instances'].mean()
mean_wonky_ratio = wonky_counts['wonky_task_ratio'].mean()

print(f"Wonky counts final shape: {wonky_counts.shape}")
print(f"Respondents in wonky_counts: {len(wonky_counts):,}")
print(f"Avg total tasks per respondent: {mean_total_tasks:.2f}")
print(f"Avg wonky tasks per respondent: {mean_wonky_tasks:.2f}")
print(f"Avg wonky task ratio: {mean_wonky_ratio:.2%}")


In [0]:
# Attach the wonky summary to the task-level user frame.
# NOTE(review): the summary was built with respondent_id_col="respondent_pk",
# yet the right-hand keys here are "balance_respondentPk"/"task_pk" — confirm
# these columns exist on wonky_respondent_summary.
user_info_df_vstudy = merge_wonky_data_with_user_info(
    user_info_df,
    wonky_respondent_summary,
    left_on=["respondentPk", "taskPk"],
    right_on=["balance_respondentPk", "task_pk"],
)

# Flag rows whose respondent appears in wonky_counts (1) versus not (0).
wonky_respondent_list = wonky_counts['respondentPk'].unique().tolist()
in_wonky_counts = user_info_df_vstudy["respondentPk"].isin(wonky_respondent_list)
user_info_df_vstudy["wonky_study_flag"] = np.where(in_wonky_counts, 1, 0)

flagged_row_count = user_info_df_vstudy['wonky_study_flag'].eq(1).sum()
print(f"Final user_info_df_vstudy shape: {user_info_df_vstudy.shape}")
print(f"Records with wonky flag: {flagged_row_count:,}")


In [0]:
# Preview the local (/dbfs) form of the configured user-info output path.
# Derived straight from the config so this cell also works on a fresh kernel;
# the bare `output_local` name is only assigned by a later cell.
paths_config['output_files']['user_info_df'].replace("dbfs:", "/dbfs")

In [0]:
# Configured destinations for the two result files.
output_path = paths_config['output_files']['user_info_df']
wonky_path = paths_config['output_files']['wonky_user_counts']

# pandas writes through the local filesystem, so translate the `dbfs:` scheme
# to the `/dbfs` mount prefix and write to those local paths.
output_local = output_path.replace("dbfs:", "/dbfs")
wonky_local = wonky_path.replace("dbfs:", "/dbfs")

# Make sure the parent directory of *each* target exists; skip an empty
# dirname (bare filename), which os.makedirs would reject.
for local_target in (output_local, wonky_local):
    parent_dir = os.path.dirname(local_target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

wonky_counts.to_parquet(wonky_local, index=False)

# Keep only the first occurrence of each column name for the write, as the
# misc/ export cell does, without modifying user_info_df_vstudy itself.
user_info_to_save = user_info_df_vstudy.loc[:, ~user_info_df_vstudy.columns.duplicated()]
user_info_to_save.to_parquet(output_local, index=False)

# "\u2713" is the check mark; written as an escape so the message cannot be
# garbled by a file-encoding round trip.
print("\u2713 Files saved successfully:")
print(f"  - {output_path}")
print(f"  - {wonky_path}")

In [0]:
# Resolve <parent of working directory>/misc and create it if needed.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")
os.makedirs(misc_dir, exist_ok=True)

# Reuse the configured file names, but place the files inside misc/.
configured_outputs = paths_config['output_files']
output_path = os.path.join(misc_dir, os.path.basename(configured_outputs['user_info_df']))
wonky_path = os.path.join(misc_dir, os.path.basename(configured_outputs['wonky_user_counts']))

wonky_counts.to_parquet(wonky_path, index=False)

# Keep only the first occurrence of every column name before writing.
first_occurrence_mask = ~user_info_df_vstudy.columns.duplicated()
user_info_df_vstudy = user_info_df_vstudy.loc[:, first_occurrence_mask]
user_info_df_vstudy.to_parquet(output_path, index=False)

print("Files saved successfully:")
for saved_path in (output_path, wonky_path):
    print(f"  - {saved_path}")