In [0]:
# IMPORTS
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import yaml
import pandas as pd
import numpy as np

from typing import List
from pyspark.sql import SparkSession

import plotly.express as px

# Custom funcs
from data_pull.loaders import (
    load_user_table,
    load_task_complete_table,
    load_respondent_info_table,
    load_task_table,
    load_all_wonky_studies
)
from data_pull.joiners import (
    join_user_task_respondent,
    join_wonky_balance_with_task,
    merge_wonky_data_with_user_info
)
from data_pull.aggregators import (
    enrich_user_info_with_task_counts,
    union_wonky_study_dataframes,
    aggregate_map_to_wonky_respondent_df,
    create_wonky_respondent_summary,
    calculate_wonky_task_ratio
)

# Configs
# paths_config: silver/project paths and the pull filters (country, min_date, task_origin).
with open('../configs/data_paths.yaml', 'r') as paths_file:
    paths_config = yaml.safe_load(paths_file)

# wonky_config: wonky study UUID flags plus the column lists used for loading/grouping.
with open('../configs/wonky_studies.yaml', 'r') as wonky_file:
    wonky_config = yaml.safe_load(wonky_file)

print("Imports and configs loaded successfully")

In [0]:
# Minimum completion date of the pull; reused below in print labels and chart axis titles.
pull_date = paths_config['filters']['min_date']

In [0]:
# needed on each re-run
# Best-effort clear of the Spark cache; a failure is reported but does not stop the notebook.
try:
    spark.catalog.clearCache()
except Exception as e:
    print(f"Could not clear cache: {e}")
else:
    # Only claim success when clearCache() actually returned without raising.
    print("Cache cleared")

In [0]:
# Load the three silver tables, filtered by the settings under paths_config['filters'].
silver_path = paths_config['silver_path']
filters = paths_config['filters']

s_u = load_user_table(spark, silver_path, country=filters['country'])

s_tc_odr = load_task_complete_table(
    spark,
    silver_path,
    min_date=filters['min_date'],
    task_origin=filters['task_origin'],
)

s_ri = load_respondent_info_table(spark, silver_path, country=filters['country'])

# Row counts per table, labelled with the filters that were applied.
print(f"s_u (users {filters['country']}) count: {s_u.count():,}")
print(f"s_tc_odr (tasks completed >= {filters['min_date']}, origin={filters['task_origin']}) count: {s_tc_odr.count():,}")
print(f"s_ri (respondent_info {filters['country']}) count: {s_ri.count():,}")

In [0]:
# Convert the loaded tables to pandas.
su = s_u.toPandas()
tc = s_tc_odr.toPandas()
ri = s_ri.toPandas()

# Labels come from the config so they stay correct when the filters change
# (they were previously hard-coded as "GB" / "odr").
country = paths_config['filters']['country']
task_origin = paths_config['filters']['task_origin']

print(f"\ns_u unique respondent_pk: {s_u.select('respondent_pk').distinct().count():,}")
print(f"s_ri unique respondent_pk: {s_ri.select('respondent_pk').distinct().count():,}")

print(f"s_u (users {country}) count: {s_u.count():,}")
print(f"s_tc_odr (tasks completed >= {pull_date}, origin={task_origin}) count: {s_tc_odr.count():,}")
print(f"s_ri (respondent_info {country}) count: {s_ri.count():,}")

In [0]:
# Join users, completed tasks and respondent info, then report the size of the result.
user_info = join_user_task_respondent(s_u, s_tc_odr, s_ri)

joined_row_count = user_info.count()
print(f"\nAfter INNER joins count: {joined_row_count:,}")

joined_respondent_count = user_info.select('respondentPk').distinct().count()
print(f"Unique respondents in joined data: {joined_respondent_count:,}")

In [0]:
# Enrich the joined data with task counts and bring it into pandas as user_info_df.
user_info_enriched = enrich_user_info_with_task_counts(user_info)
user_info_df = user_info_enriched.toPandas()

# Counts taken from the Spark frame.
print(f"Task-level records (rows): {user_info_enriched.count():,}")
print(f"Unique respondents: {user_info_enriched.select('respondentPk').distinct().count():,}")

# The same counts taken from the pandas copy, plus tasks per respondent.
n_task_rows = len(user_info_df)
n_respondents = user_info_df['respondentPk'].nunique()

print("\nHierarchical Structure Validation:")
print(f"  - Task-level rows: {n_task_rows:,}")
print(f"  - Unique respondents: {n_respondents:,}")
print(f"  - Avg tasks per respondent: {n_task_rows / n_respondents:.2f}")

In [0]:
# Load the task table (reused later for the wonky-study joins) and attach
# task_length_of_task to user_info_df.
# NOTE(review): user_info_df is overwritten by its own merge, so re-running this cell
# without re-running the one above merges the task columns in a second time.
task = load_task_table(spark, paths_config['silver_path'])
tasks = task.toPandas()

task_lengths = tasks[['task_pk', 'task_length_of_task']]
user_info_df = user_info_df.merge(
    task_lengths,
    how='left',
    left_on='taskPk',
    right_on='task_pk',
)

In [0]:
def process_wonky_studies(spark, uuids, task, base_path, cols_to_include, wonky_study_count=0):
    """Build one pandas task-level map for a group of study UUIDs.

    Loads the studies with ``load_all_wonky_studies``, joins each loaded frame
    with ``task`` via ``join_wonky_balance_with_task``, unions the joined frames,
    converts the union to pandas and stamps every row with ``wonky_study_count``.
    Progress and summary counts are printed along the way.

    Args:
        spark: Spark session forwarded to the loader.
        uuids: Study UUIDs to load.
        task: Task table each study frame is joined with.
        base_path: Forwarded to the loader as ``base_path``.
        cols_to_include: Forwarded to the loader as ``cols_to_include``.
        wonky_study_count: Constant written to the ``wonky_study_count`` column
            of every returned row (default 0).

    Returns:
        The pandas frame produced by ``toPandas()`` on the unioned studies,
        with the extra ``wonky_study_count`` column.
    """
    # Loader returns the per-study frames and the UUIDs it actually processed.
    study_frames, loaded_uuids = load_all_wonky_studies(
        spark,
        uuids,
        base_path=base_path,
        cols_to_include=cols_to_include,
        verbose=True,
    )

    print(f"\nSuccessfully loaded {len(study_frames)} out of {len(uuids)} studies")
    if loaded_uuids:
        print(f"UUIDs ({len(loaded_uuids)}): {loaded_uuids}")

    # Join every study with the task table, then stack them into one frame.
    joined_frames = []
    for study_frame in study_frames:
        joined_frames.append(join_wonky_balance_with_task(study_frame, task))

    study_map = union_wonky_study_dataframes(joined_frames).toPandas()
    study_map['wonky_study_count'] = wonky_study_count

    print(f"Wonky map shape: {study_map.shape}")
    print(f"Unique respondents: {study_map['respondent_pk'].nunique():,}")
    print(f"Unique tasks: {study_map['task_pk'].nunique():,}")

    return study_map


# Split the configured study UUIDs by their truthy/falsy flag.
uuids_true, uuids_false = [], []
for study_uuid, is_wonky in wonky_config["wonky_study_uuids"].items():
    if is_wonky:
        uuids_true.append(study_uuid)
    else:
        uuids_false.append(study_uuid)

# Flagged studies are stamped with wonky_study_count=1 ...
wonky_map_T = process_wonky_studies(
    spark,
    uuids_true,
    task,
    paths_config['project_repository_path'],
    wonky_config['cols_to_include_subset'],
    wonky_study_count=1,
)

# ... and the remaining studies with wonky_study_count=0.
wonky_map_F = process_wonky_studies(
    spark,
    uuids_false,
    task,
    paths_config['project_repository_path'],
    wonky_config['cols_to_include_subset'],
    wonky_study_count=0,
)

# Stack both groups into one task-level map.
wonky_map = pd.concat([wonky_map_T, wonky_map_F], ignore_index=True)

In [0]:

# Inspect every column of user_info_df whose name contains 'date'.
user_info_df[[col for col in user_info_df.columns if 'date' in col]]

### Aggregation

In [0]:
# Collapse the task-level wonky_map to one row per wonky_config['cols_to_group'] combination:
#   balance_study_count = number of non-null uuid values in the group
#   wonky_study_count   = sum of the per-row wonky_study_count values
wonky_respondent_df = (
    wonky_map
    .groupby(wonky_config['cols_to_group'])
    .agg(
        balance_study_count=('uuid', 'count'),
        wonky_study_count=('wonky_study_count', 'sum'),
    )
    .reset_index()
    .rename(columns={"respondent_pk": "balance_respondentPk"})
)

# Remove the columns listed under wonky_config['cols_to_drop'].
keep_mask = ~wonky_respondent_df.columns.isin(wonky_config['cols_to_drop'])
wonky_respondent_df = wonky_respondent_df.loc[:, keep_mask]

wonky_respondent_df.display()

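**Note:** the respondent-level DataFrame (`wonky_respondent_df`) already has the counts — use that in … *(TODO: this note was left unfinished)*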

In [0]:
# Low priority: this summary frame is only used in a couple of places further down.
categorical_cols = [
    'survey_pk',
    'platform_name',
    'hardware_version',
    'yob',
    'survey_locale',
    'exposure_band',
]

# Summarise wonky_respondent_df per respondent (keyed on balance_respondentPk).
wonky_respondent_summary = create_wonky_respondent_summary(
    wonky_respondent_df,
    categorical_cols=categorical_cols,
    respondent_id_col="balance_respondentPk",
)

wonky_respondent_summary.display()

In [0]:
# Attach the respondent-level wonky data to the task-level user_info_df.
user_info_df_vstudy = merge_wonky_data_with_user_info(
    user_info_df,
    wonky_respondent_df,
)

# Alphabetical column order makes the wide frame easier to scan.
user_info_df_vstudy = user_info_df_vstudy.sort_index(axis=1)

# Not final yet: the '_merge' indicator column is still present at this point.
print(f"Merged user_info_df_vstudy shape (incl. _merge): {user_info_df_vstudy.shape}")

# Show the merge indicator counts (previously computed but never displayed).
merge_stats = user_info_df_vstudy['_merge'].value_counts()
print("Merge indicator counts:")
print(merge_stats.to_string())

print(f"Total task-level records: {len(user_info_df_vstudy):,}")
print(f"Unique respondents: {user_info_df_vstudy['respondentPk'].nunique():,}")
print(f"Unique tasks: {user_info_df_vstudy['taskPk'].nunique():,}")
print(f"UserxTask that have wonky task: {(user_info_df_vstudy['wonky_study_count']>0).sum():,}")

# The indicator is only needed for the check above.
user_info_df_vstudy = user_info_df_vstudy.drop(columns=['_merge'])

print(f"Final user_info_df_vstudy shape: {user_info_df_vstudy.shape}")
print("="*80)

In [0]:
# Rows that received a (non-null) wonky_study_count from the merge.
has_wonky_count = user_info_df_vstudy['wonky_study_count'].notna()
user_info_df_vstudy[has_wonky_count]

**FIXME:** the `wonky_study_flag` column is broken; it should be `balance_study_flag`.

In [0]:
# Alphabetical list of the columns in user_info_df_vstudy.
sorted(user_info_df_vstudy.columns)

In [0]:
# Column index of user_info_df_vstudy, in its current order.
user_info_df_vstudy.columns

In [0]:
# Sanity-check task_time_taken_s on user_info_df (the frame before the wonky merge).
task_seconds = user_info_df['task_time_taken_s']

print("\nTask time validation:")
print(f"  - Null task_time_taken_s: {task_seconds.isnull().sum()}")
print(f"  - Avg task time: {task_seconds.mean():.2f} seconds")

In [0]:
# Size of the combined wonky_map.
wonky_respondent_count = wonky_map['respondent_pk'].nunique()
wonky_task_count = wonky_map['task_pk'].nunique()

print(f"Wonky map shape: {wonky_map.shape}")
print(f"Unique respondents in wonky studies: {wonky_respondent_count:,}")
print(f"Unique tasks in wonky studies: {wonky_task_count:,}")

In [0]:
# List the columns of user_info_df_vstudy whose name contains 'Pk'.
[col for col in user_info_df_vstudy.columns if 'Pk' in col]

In [0]:
# user_info_df_vstudy[user_info_df_vstudy['wonky_study_flag'] == 1].to_csv('export.csv')

#### Print checks

In [0]:
# Min / max / median of task_time_taken_s in the merged frame.
vstudy_task_seconds = user_info_df_vstudy['task_time_taken_s']

print(f"Min time: {vstudy_task_seconds.min():.2f}s")
print(f"Max time: {vstudy_task_seconds.max():.2f}s")
print(f"Median time: {vstudy_task_seconds.median():.2f}s")

In [0]:
# Headline numbers for user_info_df_vstudy.
banner = "=" * 80
print(banner)
print("FINAL DATA EXPORT SUMMARY")
print(banner)

wonky_record_count = (user_info_df_vstudy['wonky_study_count'] > 0).sum()

print(f"Shape: {user_info_df_vstudy.shape}")
print(f"Task-level records: {len(user_info_df_vstudy):,}")
print(f"Unique respondents: {user_info_df_vstudy['respondentPk'].nunique():,}")
print(f"Unique tasks: {user_info_df_vstudy['taskPk'].nunique():,}")
print(f"Records with wonky details: {wonky_record_count:,}")


In [0]:
# Add task_completed from the merged frame to the respondent summary.
# NOTE(review): the summary is overwritten by its own merge, so re-running this cell
# alone merges the same columns in a second time.
respondent_task_totals = (
    user_info_df_vstudy[["respondentPk", "task_completed"]].drop_duplicates()
)

wonky_respondent_summary = wonky_respondent_summary.merge(
    respondent_task_totals,
    how="left",
    left_on="balance_respondentPk",
    right_on="respondentPk",
)

### Tables and Charting

In [0]:
# Share of wonky_respondent_df rows per (task_targeting_type, exposure_band), largest first.
# The value is a 0-1 fraction (count / number of rows), despite the "(%)" column name.
(
    wonky_respondent_df[['balance_respondentPk', 'task_targeting_type', 'exposure_band']]
    .groupby(['task_targeting_type', 'exposure_band'])
    .count()
    .div(len(wonky_respondent_df))
    .reset_index()
    .rename(columns={'balance_respondentPk': 'share_of_users(%)'})
    .sort_values('share_of_users(%)', ascending=False)
)

In [0]:
# Display the respondent summary after task_completed was merged in.
wonky_respondent_summary

In [0]:
# Totals per exposure band, followed by three derived ratios (all 0-1 fractions).
band_totals = (
    wonky_respondent_summary[['exposure_band', 'task_completed', 'total_wonky_studies']]
    .groupby(['exposure_band'])
    .sum()
    .reset_index()
)

summary1 = band_totals.assign(**{
    # wonky studies per completed task within the band
    'wonky_task_ratio': band_totals['total_wonky_studies'].div(band_totals['task_completed']),
    # wonky studies relative to the number of rows in wonky_respondent_df
    'wonky_task_share(%)': band_totals['total_wonky_studies'].div(len(wonky_respondent_df)),
    # the band's share of all completed tasks
    'task_completed(%)': band_totals['task_completed'].div(band_totals['task_completed'].sum()),
})

summary1

In [0]:
# Table: sum of task_completed per respondent, ascending.
(
    user_info_df
    .groupby('respondentPk')[['task_completed']]
    .sum()
    .sort_values(by='task_completed')
)

In [0]:
# Per-respondent totals: number of task rows and summed wonky_study_count.
task_completed = user_info_df[['respondentPk', 'taskPk']]

# Bring wonky_study_count onto each (respondent, task) pair found in wonky_map.
task_completed = task_completed.merge(
    wonky_map[['respondent_pk', 'task_pk', 'wonky_study_count']],
    how='left',
    left_on=['respondentPk', 'taskPk'],
    right_on=['respondent_pk', 'task_pk'],
)

# Drop the duplicate key columns; pairs with no wonky_map match get 0 instead of NaN.
task_completed = task_completed.drop(['respondent_pk', 'task_pk'], axis=1).fillna(0)

# One row per respondent. A single reset_index() is enough: the previous second
# call added a meaningless 'index' column to the result.
task_completed = (
    task_completed
    .groupby(["respondentPk"])
    .agg({'taskPk': 'count', 'wonky_study_count': 'sum'})
    .reset_index()
    .rename({'taskPk': 'tasks_completed', 'respondentPk': 'respondentPk_tc'}, axis=1)
)

# Distribution of tasks completed per respondent, as % of respondents.
fig = px.histogram(task_completed, x="tasks_completed", nbins=120, histnorm='percent')
fig.update_layout(
    xaxis_title=f"Tasks completed by users since {pull_date}",
    yaxis_title="% of users completing tasks",
    bargap=0.05
)
fig.show()


task_completed.display()

In [0]:
# Share of each distinct row of task_completed (value_counts runs over all columns).
# NOTE(review): the rename only applies when the value_counts column is labelled 0;
# if it is named differently (e.g. 'count' on recent pandas) it is a no-op — confirm.
(
    (task_completed.value_counts() / len(task_completed))
    .to_frame()
    .rename(columns={0: 'percent'})
)

In [0]:
# Cumulative row share, sorted by tasks_completed (descending), first 500 rows.
# Fixes the `.renambe` typo, which raised AttributeError before anything was shown.
(
    pd.DataFrame(task_completed.value_counts() / len(task_completed))
    .rename({0: 'percent'}, axis=1)
    .cumsum()
    .sort_values('tasks_completed', ascending=False)
    .head(500)
    .reset_index()
    .display()
)

In [0]:
# Stop guard for "Run All". The previous bare `break` is a SyntaxError outside a loop;
# an explicit exception halts execution at the same point with a readable message.
# NOTE(review): presumably the halt is intentional — confirm, or delete this cell.
raise RuntimeError("Stop guard reached: run the remaining cells manually if needed.")

In [0]:
# Row shares of task_completed, accumulated and sorted by tasks_completed.
# NOTE(review): cumsum() is applied twice (before and after the sort), so the result
# is a running sum of running sums — confirm this is intended.
pd.DataFrame(task_completed.value_counts()/len(task_completed)).rename({0: 'percent'}, axis=1).cumsum().sort_values('tasks_completed').cumsum()

In [0]:
# Bar chart: share of respondents per tasks_completed bin, coloured by whether any
# respondent in the bin has a wonky study.
df = task_completed.copy()

# nbins equal-width bin edges from 0 to the largest tasks_completed value
nbins = 120
max_x = df["tasks_completed"].max()
bins = np.linspace(0, max_x, nbins + 1)

df["bin"] = pd.cut(df["tasks_completed"], bins=bins, include_lowest=True)

# observed=False keeps empty bins in the result (count 0, wonky_any NaN).
agg = (
    df.groupby("bin", observed=False)
      .agg(
          count=("tasks_completed", "size"),
          wonky_any=("wonky_study_count", "max"),
      )
      .reset_index()
)

total = agg["count"].sum()
agg["percent"] = agg["count"] / total * 100
# Numeric bin midpoints for the x axis.
agg["bin_center"] = agg["bin"].apply(lambda x: x.mid).astype(float)

# plotly express applies color_discrete_map only to non-numeric colour columns, so
# turn the per-bin max into "0"/"1" labels (empty bins count as "0"). Previously the
# numeric column was drawn on a continuous scale and the colour map was ignored.
agg["wonky_any"] = (agg["wonky_any"].fillna(0) > 0).astype(int).astype(str)

fig = px.bar(
    agg,
    x="bin_center",
    y="percent",
    color="wonky_any",
    color_discrete_map={"0": "#c4b96f", "1": "#ff4b4b"},
)

fig.update_layout(
    xaxis_title=f"Tasks completed by users since {pull_date}",
    yaxis_title="% of users completing tasks",
    bargap=0.05,
    showlegend=True,
    legend_title_text="Contains wonky study"
)

fig.show()

In [0]:
# Examples
example_pk = 'c9f6f409-ec40-498c-a029-2a4510c953fb'

# Only the last expression of a cell is rendered, so the first frame is shown explicitly
# (it was silently discarded before).
wonky_map[wonky_map['respondent_pk'] == example_pk].display()

user_info_df[(user_info_df['respondentPk'] == example_pk)]

# 1 respondent 108 tasks 6 wonky (all exposed)

In [0]:
# Combine per-respondent task totals with the wonky summary to get a wonky task ratio.
wonky_counts = calculate_wonky_task_ratio(
    task_completed,
    wonky_respondent_summary,
    task_completed_col='tasks_completed',
    wonky_instances_col='total_wonky_studies',
)

# Capped copy of the ratio: values above 1 are limited to 1.
wonky_counts['wonky_task_ratio_cap'] = wonky_counts['wonky_task_ratio'].clip(upper=1)

avg_tasks = wonky_counts['tasks_completed'].mean()
avg_wonky = wonky_counts['wonky_study_count'].mean()
avg_ratio = wonky_counts['wonky_task_ratio'].mean()

print(f"Aggregated (user level) final shape: {wonky_counts.shape} from balance")
print(f"Respondents in wonky_counts: {len(wonky_counts):,} from balance")
print(f"Avg total tasks per respondent: {avg_tasks:.2f} in balance")
print(f"Avg wonky tasks per respondent: {avg_wonky:.2f} in balance")
print(f"Avg wonky task ratio: {avg_ratio:.2%} in balance")


In [0]:
# Quick overview of wonky_counts.
wonky_counts_columns = list(wonky_counts.columns)

print(f"Shape: {wonky_counts.shape}")
print(f"Unique respondents: {len(wonky_counts):,}")
print(f"Columns: {wonky_counts_columns}")

In [0]:
# special_user1 = 
# Show the columns available on user_info_df.
user_info_df.columns

In [0]:
# respondent & task deep dive
# wonky_map[wonky_map['respondent_pk']  == '65e2c4dc-097c-4088-a43d-0578ebab6359'].display()

# user_info_df_vstudy[(user_info_df_vstudy['taskPk'] == '0e91fff6-05eb-4320-a53b-c260ff053547') & (user_info_df_vstudy['respondentPk']  == '65e2c4dc-097c-4088-a43d-0578ebab6359')][['date_completed', 'wonky_study_count']].display()

# .copy() makes special_user an independent frame, so the column assignment below
# does not write to a slice of user_info_df (avoids SettingWithCopyWarning).
special_user = user_info_df[user_info_df['respondentPk']  == 'c9f6f409-ec40-498c-a029-2a4510c953fb'].copy()
special_user['date_completed'] = pd.to_datetime(special_user['date_completed'], format='%Y-%m-%d').dt.date

# Number of task rows per completion date for this respondent.
special_user_chartdata = special_user[['date_completed', 'taskPk']].groupby('date_completed').count().reset_index()

In [0]:
# Deep dive on one respondent: wonky_map rows, one task lookup, tasks per day.
wonky_map[wonky_map['respondent_pk']  == '361d4dfc-97e8-439a-b969-36163260ea4b'].display()

# Both conditions must come from user_info_df_vstudy: the respondent condition was
# previously built from user_info_df, a different frame than the one being filtered.
# NOTE(review): this lookup uses respondent 65e2c4dc-..., not the 361d4dfc-... respondent
# charted below — confirm which one is intended.
user_info_df_vstudy[(user_info_df_vstudy['taskPk'] == '0e91fff6-05eb-4320-a53b-c260ff053547') & (user_info_df_vstudy['respondentPk']  == '65e2c4dc-097c-4088-a43d-0578ebab6359')][['date_completed', 'wonky_study_count']].display()

# .copy() so the date conversion does not write to a slice of user_info_df.
special_user = user_info_df[user_info_df['respondentPk']  == '361d4dfc-97e8-439a-b969-36163260ea4b'].copy()
special_user['date_completed'] = pd.to_datetime(special_user['date_completed'], format='%Y-%m-%d').dt.date

# Number of task rows per completion date.
special_user_chartdata = special_user[['date_completed', 'taskPk']].groupby('date_completed').count().reset_index()

fig = px.line(
    special_user_chartdata,
    x="date_completed",
    y="taskPk",
    markers=True  # optional: show markers on each date
)

fig.update_layout(
    xaxis_title="Date completed",
    yaxis_title="Number of tasks",
)

fig.show()

In [0]:
# wonky_map rows for this respondent (kept in `a`; not displayed).
a = wonky_map.loc[wonky_map['respondent_pk'] == '361d4dfc-97e8-439a-b969-36163260ea4b']

# Rows of the merged frame for the same respondent with a positive wonky_study_count.
is_respondent = user_info_df_vstudy['respondentPk'] == '361d4dfc-97e8-439a-b969-36163260ea4b'
has_wonky = user_info_df_vstudy['wonky_study_count'] > 0
user_info_df_vstudy[is_respondent & has_wonky]

In [0]:

# Attach each respondent's total tasks_completed to the task-level frame.
# The new column is named 'total_tasks_completed' (was misspelled 'totaal_tasks_completed').
# NOTE(review): the frame is overwritten by its own merge, so re-running this cell
# alone merges the columns in a second time.
user_info_df_vstudy = (
    user_info_df_vstudy
    .merge(
        task_completed[['respondentPk_tc', 'tasks_completed']],
        how='left',
        left_on='respondentPk',
        right_on='respondentPk_tc',
    )
    .rename({'tasks_completed': 'total_tasks_completed'}, axis=1)
)

#### Special User review

In [0]:
# Respondent under review; `pk` is reused by the next cell.
pk = 'c9f6f409-ec40-498c-a029-2a4510c953fb'

# This respondent's rows with a positive wonky_study_count.
pk_wonky_rows = (user_info_df_vstudy['respondentPk'] == pk) & (user_info_df_vstudy['wonky_study_count'] > 0)
user_info_df_vstudy.loc[pk_wonky_rows, ['respondentPk', 'date_completed', 'wonky_study_count']]

In [0]:
# Review of respondent `pk`: wonky_map rows, per-task wonky counts, tasks per day.
wonky_map[wonky_map['respondent_pk']  == pk].display()

# Filter user_info_df_vstudy with a condition built from the same frame (it was
# previously built from user_info_df, a different frame than the one being filtered).
user_info_df_vstudy[(user_info_df_vstudy['respondentPk']  == pk)][['date_completed', 'wonky_study_count']].display()

# .copy() so the date conversion does not write to a slice of user_info_df.
special_user = user_info_df[user_info_df['respondentPk']  == pk].copy()
special_user['date_completed'] = pd.to_datetime(special_user['date_completed'], format='%Y-%m-%d').dt.date

# Number of task rows per completion date.
special_user_chartdata = special_user[['date_completed', 'taskPk']].groupby('date_completed').count().reset_index()

fig = px.line(
    special_user_chartdata,
    x="date_completed",
    y="taskPk",
    markers=True  # optional: show markers on each date
)

fig.update_layout(
    xaxis_title="Date completed",
    yaxis_title="Number of tasks",
)

fig.show()

In [0]:
# import os

# notebook_path = os.getcwd() 
# repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
# misc_dir = os.path.join(repo_root, "misc")
# os.makedirs(misc_dir, exist_ok=True)

# output_path = os.path.join(misc_dir,
#                            os.path.basename(paths_config['output_files']['user_info_df']))
# wonky_respondent_df_path = os.path.join(misc_dir,
#                           os.path.basename(paths_config['output_files']['wonky_respondent_df']))
# wonky_respondent_summary_path = os.path.join(misc_dir,
#                           os.path.basename(paths_config['output_files']['wonky_respondent_summary']))
# wonky_map_path = os.path.join(misc_dir,
#                           os.path.basename(paths_config['output_files']['wonky_map']))                          
                          

# wonky_respondent_df.to_parquet(wonky_respondent_df_path, index=False)
# wonky_respondent_summary.to_parquet(wonky_respondent_summary_path, index=False)
# wonky_map.to_parquet(wonky_map_path, index=False)

# user_info_df_vstudy = user_info_df_vstudy.loc[:, ~user_info_df_vstudy.columns.duplicated()]
# user_info_df_vstudy.to_parquet(output_path, index=False)

# print("Files saved successfully:")
# print(f"  - {output_path}")
# print(f"  - {wonky_respondent_df_path}")
# print(f"  - {wonky_respondent_summary_path}")

In [0]:
# wonky_counts_path = os.path.join(misc_dir,
#                           os.path.basename(paths_config['output_files']['wonky_user_counts']))

# wonky_counts.to_parquet(wonky_counts_path, index=False)