In [0]:
# Import libraries
import sys
import os

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_amount_features,
    create_task_temporal_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score,
    add_rating_delta
)
from eda.statistical_tests import (
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
with open('../configs/feature_engineering.yaml', 'r') as f:
    feature_config = yaml.safe_load(f)

with open('../configs/data_paths.yaml', 'r') as f:
    paths_config = yaml.safe_load(f)

pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### Load

In [0]:
notebook_path = os.getcwd() 
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

output_path = os.path.join(misc_dir,
                           os.path.basename(paths_config['output_files']['user_info_df']))
wonky_counts_path = os.path.join(misc_dir,
                          os.path.basename(paths_config['output_files']['wonky_user_counts']))
wonky_respondent_df_path = os.path.join(misc_dir,
                          os.path.basename(paths_config['output_files']['wonky_respondent_df']))
wonky_respondent_summary_path = os.path.join(misc_dir,
                          os.path.basename(paths_config['output_files']['wonky_respondent_summary']))

user_info_df = pd.read_parquet(output_path) # total user info
wonky_counts = pd.read_parquet(wonky_counts_path) # normal tasks and wonky tasks for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path) # task level info for wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path) # summary of wonky task respondents

In [0]:
# specific user example

user_info_df[(user_info_df['respondentPk'] == '361d4dfc-97e8-439a-b969-36163260ea4b') & (user_info_df['wonky_study_count'] > 0)]['date_completed']

In [0]:
# view respondent & IP address count from wonky users
wonky_respondent_df[['balance_respondentPk', 'request-remote-addr']].groupby('balance_respondentPk').count().sort_values('request-remote-addr', ascending=False).head(10)

In [0]:
# wonky_respondent_summary is a pandas DataFrame (loaded with pd.read_parquet),
# which has no .display() method; use the notebook's display() helper instead.
display(wonky_respondent_summary)

In [0]:
print(user_info_df.head())

print(wonky_respondent_df.head())

df = pd.DataFrame(user_info_df.isnull().sum(), columns=['null_count'])
display(df.reset_index())

print("\nwonky_studies_df - Missing values:")
missing_wonky = wonky_respondent_df.isnull().sum()
print(missing_wonky[missing_wonky > 0])

In [0]:
# Descriptive statistics for the key numeric task columns that are present.
key_numeric_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality', 'task_completed']
available_cols = [col for col in key_numeric_cols if col in user_info_df.columns]
print(user_info_df[available_cols].describe())

if 'wonky_study_flag' in user_info_df.columns:
    print("\n" + "=" * 80)
    print("COMPARISON BY wonky_study_flag (Task Level)")
    print("=" * 80)
    comparison_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality']
    comparison_cols = [col for col in comparison_cols if col in user_info_df.columns]

    if len(comparison_cols) > 0:
        wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 1]
        non_wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 0]

        print("\nWonky Study Tasks (wonky_study_flag=1):")
        print(wonky_study_tasks[comparison_cols].describe())

        print("\nNon-Wonky Study Tasks (wonky_study_flag=0):")
        print(non_wonky_study_tasks[comparison_cols].describe())

        # The counter column is called `wonky_study_count` in every other cell
        # of this notebook; checking for `wonky_studies_count` meant this
        # comparison was silently skipped.
        if 'wonky_study_count' in user_info_df.columns:
            wonky_user_tasks = user_info_df[user_info_df['wonky_study_count'] > 0]
            print("\nTasks from Users with Wonky Studies (wonky_study_count > 0):")
            print(wonky_user_tasks[comparison_cols].describe())

# The frame summarised here is `wonky_counts`; label the output accordingly.
print("\n" + "=" * 80)
print("STATISTICAL SUMMARY: wonky_counts")
print("=" * 80)
print(wonky_counts.describe())


### Feature Engineering & Testing

In [0]:
main_features = []

#### Task amounts - EDA

In [0]:
(
    user_info_df["totaal_tasks_completed"]
    .value_counts()
    .div(len(user_info_df))
    .sort_index()          # sort by the index (days)
    .cumsum()
    .head(10)
)


In [0]:
(
    user_info_df[user_info_df['wonky_study_count'] > 0]['totaal_tasks_completed']
    .value_counts()
    .div(len(user_info_df[user_info_df['wonky_study_count'] > 0]))
    .sort_index()          # sort by the index (days)
    .cumsum()
    .head(10)
)


In [0]:
user_info_df[['totaal_tasks_completed']]

In [0]:
user_info_df, task_labels = create_task_amount_features(user_info_df, 'totaal_tasks_completed')

In [0]:
print(
    create_breakdown_summary(
        user_info_df,
        features=task_labels,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

In [0]:
task_labels

In [0]:
fig = create_breakdown_chart(
    user_info_df,
    features=task_labels,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

%md
#### Task amounts - Testing

In [0]:
chi_square_results_days_task = perform_chi_square_tests(
    user_info_df,
    feature_set=task_labels,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_days_task.reset_index())

In [0]:
ztest_results_days_task = perform_two_proportion_z_tests(
    user_info_df, task_labels, group_var="wonky_study_count"
)
# display(ztest_results_days_active[ztest_results_days_active['significant']].reset_index())
display(ztest_results_days_task.reset_index())

In [0]:
# Attach the task-amount flags and gender to every wonky respondent.
# NOTE(review): user_info_df looks like it holds several rows per respondent,
# so this left merge can fan out; the nunique() below still counts each
# respondent once per (task_amounts, exposure_band, gender) — confirm that is
# the intended unit.
wonky_tasknum = wonky_respondent_summary.merge(
    user_info_df[list(task_labels) + ['respondentPk', 'gender']],
    on='respondentPk',
    how='left',
)

id_columns = ["respondentPk", "exposure_band", "gender"]

# Long format: one row per (respondent row, task-amount flag).
long = (
    wonky_tasknum[id_columns + list(task_labels)]
      .melt(
          id_vars=id_columns,
          value_vars=list(task_labels),
          var_name="task_amounts",
          value_name="flag",
      )
)

# Keep only the flags that are switched on.
long = long[long["flag"] == 1]

# Distinct respondents per task-amount bucket within each exposure band / gender.
counts = (
    long
    .groupby(["task_amounts", "exposure_band", "gender"])
    .agg(n=("respondentPk", "nunique"))
    .reset_index()
)

# Share of each task-amount bucket inside its (exposure_band, gender) group.
counts["group_total"] = (
    counts
    .groupby(["exposure_band", "gender"])["n"]
    .transform("sum")
)
counts["pct"] = counts["n"] / counts["group_total"] * 100

# pandas DataFrames have no .display() method; use the notebook helper.
display(counts)

In [0]:
counts

#### Days active before task - EDA

In [0]:
min_dates = user_info_df[['respondentPk', 'date_completed']].groupby('respondentPk').min().reset_index()
min_dates = min_dates.rename(columns={'date_completed': 'first_task_completed_date'})

In [0]:
# Drop any copy of the column left by a previous run first: merging twice would
# otherwise create first_task_completed_date_x/_y and the subtraction below
# would fail with a KeyError.
user_info_df = (
    user_info_df
    .drop(columns=["first_task_completed_date"], errors="ignore")
    .merge(min_dates, on="respondentPk", how="left")
)

# Whole days between the respondent's first completed task and this task.
user_info_df["days_active_before_task"] = (
    user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
).dt.days

In [0]:
import plotly.express as px

vc = user_info_df["days_active_before_task"].value_counts().sort_index()/len(user_info_df) * 100

vc_df = vc.reset_index()
vc_df.columns = ["days_active_before_task", "count"]

fig = px.line(
    vc_df,
    x="days_active_before_task",
    y="count",
    markers=True,
)

fig.update_layout(
    xaxis_title="Days active before task",
    yaxis_title="Number of users (%)",
    template="plotly_white"
)

fig.show()


In [0]:
import plotly.express as px

df = user_info_df[user_info_df['wonky_study_count'] > 0]

vc = df["days_active_before_task"].value_counts().sort_index()/len(df) * 100

vc_df = vc.reset_index()
vc_df.columns = ["days_active_before_task", "count"]

fig = px.line(
    vc_df,
    x="days_active_before_task",
    y="count",
    markers=True,
)

fig.update_layout(
    xaxis_title="Days active before task",
    yaxis_title="Number of wonky users (%)",
    template="plotly_white"
)

fig.show()

In [0]:
import plotly.express as px
import pandas as pd

# 1) All users
vc_all = (
    user_info_df["days_active_before_task"]
    .value_counts()
    .sort_index() / len(user_info_df) * 100
)

df_all = vc_all.reset_index()
df_all.columns = ["days_active_before_task", "percent"]
df_all["group"] = "All users"

# 2) Wonky users
wonky_df = user_info_df[user_info_df["wonky_study_count"] > 0]

vc_wonky = (
    wonky_df["days_active_before_task"]
    .value_counts()
    .sort_index() / len(wonky_df) * 100
)

df_wonky = vc_wonky.reset_index()
df_wonky.columns = ["days_active_before_task", "percent"]
df_wonky["group"] = "Wonky users"

# 3) Combine
plot_df = pd.concat([df_all, df_wonky], ignore_index=True)

# 4) Single line chart with color
fig = px.line(
    plot_df,
    x="days_active_before_task",
    y="percent",
    color="group",
    markers=True,
)

fig.update_layout(
    xaxis_title="Days active before task",
    yaxis_title="Users (%)",
    template="plotly_white",
    legend_title_text="Group",
)

fig.show()


In [0]:
(user_info_df["days_active_before_task"].value_counts() / len(user_info_df)).sort_values(ascending=False).cumsum().head(10)

In [0]:
(
    user_info_df["days_active_before_task"]
    .value_counts()
    .div(len(user_info_df))
    .sort_index()          # sort by the index (days)
    .cumsum()
    .head(10)
)


In [0]:

user_info_df[user_info_df['wonky_study_count'] > 0]['days_active_before_task'].value_counts()

In [0]:
(
    user_info_df[user_info_df['wonky_study_count'] > 0]['days_active_before_task']
    .value_counts()
    .div(len(user_info_df[user_info_df['wonky_study_count'] > 0]))
    .sort_index()          # sort by the index (days)
    .cumsum()
    .head(10)
)


In [0]:
user_info_df_shortened = user_info_df[['respondentPk', 'days_active_before_task', 'wonky_study_count']]

In [0]:
series = user_info_df_shortened['days_active_before_task']

# One indicator column per observed day count. Going through int keeps the
# labels as plain digits ('7', not '7.0') even when missing values have turned
# the column into floats; rows with a missing value get zeros everywhere.
day_labels = series.map(lambda d: str(int(d)), na_action='ignore')
daysactive_dummies = pd.get_dummies(day_labels, dtype=int)

# Cumulative "fewer than N days" indicators, taken straight from the numeric
# column instead of summing the per-day dummy columns by parsed name.
bins = [2, 7, 14, 21, 28, 31, 50]
for limit in bins:
    print(f'days_active created_<{limit}')
    daysactive_dummies[f'<{limit}'] = (series < limit).astype(int)

daysactive_dummies['>=_50'] = (series >= 50).astype(int)

daysactive_dummies = daysactive_dummies.add_prefix('days_active_')

daysactive_cols = daysactive_dummies.columns

# Remove columns left by a previous run so join() does not raise on overlap.
user_info_df_shortened = (
    user_info_df_shortened
    .drop(columns=daysactive_cols, errors='ignore')
    .join(daysactive_dummies)
)

In [0]:
# The binned columns are appended last: one '<N' column per entry in `bins`
# plus the single '>=_50' column.
daysactive_cols[-(len(bins) + 1):]

In [0]:
user_info_df_shortened

#### Days active before task - Test

In [0]:
chi_square_results_days_active = perform_chi_square_tests(
    user_info_df_shortened,
    feature_set=daysactive_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_days_active.reset_index())

In [0]:
chi_days_active_table = chi_square_results_days_active.reset_index()

In [0]:
# Strip the 'days_active_' prefix instead of splitting on '_': the split turned
# 'days_active_>=_50' into '50', which is indistinguishable from the single-day
# column 'days_active_50' once the label is parsed as a number.
chi_days_active_table['days'] = chi_days_active_table['feature'].str[len('days_active_'):]

In [0]:
chi_days_active_table[['days', 'chi2']]

In [0]:
chi_days_active_table[chi_days_active_table['significant']].sort_values('days')

**TODO:** This result has changed since the previous run — check why.

In [0]:
chi_days_active_table['days'] = pd.to_numeric(chi_days_active_table['days'], errors='coerce')

In [0]:
fig = px.scatter(
    chi_days_active_table[chi_days_active_table['significant']].sort_values('days'),
    x="days",
    y="chi2",
    trendline="ols"
)

fig.update_layout(
    xaxis_title="Days",
    yaxis_title="Chi-square statistic"
)

fig.show()

not directional but shows magnitude of differences
 

Huge spike at 6 and 7 days; low impact at 1 day, with day 62 showing the smallest impact. This suggests danger zones at around 1 week and roughly 2.5 months.

In [0]:
# redundant due to specifically for continuous data. useful for respondent features further down.

# mannwhitney_results_days_active = perform_mannwhitney_tests(
#     user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_days_active.reset_index())

In [0]:
ztest_results_days_active = perform_two_proportion_z_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
display(ztest_results_days_active[ztest_results_days_active['significant']].reset_index())

Sorting by days active (days from first task to task complete) no clear pattern but 6 & 7 days look to be more risky -> users hitting 1 week mark potentially less risky with

also potential cyclical nature of risks wonky behaviour also shown in users hitting the 1 month mark.

safest zone is 1 day mark and 0 day mark.

higher numbers from 50+ tend to reflect safer zones too indicating better behaviour deep into tenure.

potential consideration >> initial engagement is good, then a 1-week risk, followed by a cyclical risk at around 1–1.5 months >> after 2.5 months users are a little safer

deprioritize users exactly at 6, 7 and 48 days. prioritise 0 and 1 dayers and investigate 7 specifically as that stands out.

In [0]:
# Significant z-test rows only, labelled by the part of the feature name after
# the 'days_active_' prefix ('7', '<14', '>=_50', ...).
ztest_days_active_table = ztest_results_days_active[ztest_results_days_active['significant']].reset_index()
ztest_days_active_table['days'] = ztest_days_active_table['feature'].str[len('days_active_'):]

# Order single-day features numerically; binned features are not numeric and
# sort last. The labels stay as strings so each feature keeps its own category
# on the chart axis (no '6.0' / 'nan' labels, no '>=_50' vs '50' collision).
day_number = pd.to_numeric(ztest_days_active_table['days'], errors='coerce')
ztest_days_active_table = ztest_days_active_table.loc[day_number.sort_values().index]

In [0]:
df = ztest_days_active_table[['days', 'z_statistic', 'proportion_diff']]

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=df["days"],
        y=df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.7,
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=df["days"],
        y=df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

fig.update_layout(
    xaxis_title="Days since first task_complete",
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff (%pp)", secondary_y=True)

fig.show()

In [0]:
main_features += list(daysactive_cols)
main_features += ['days_active_before_task']

#### Temporal Feature Analysis & Breakdowns - EDA

Analyzing temporal patterns to identify differences between wonky and non-wonky study tasks.


In [0]:
user_info_df['wonky_study_count'].unique()

In [0]:
# Create time features using modular function
user_info_df = create_task_temporal_features(user_info_df, date_col="date_completed")

print(f"Night tasks: {user_info_df['is_night'].mean()*100:.1f}%")
print(f"Weekend tasks: {user_info_df['is_weekend'].mean()*100:.1f}%")

In [0]:
temporal_features = [
    "is_weekend",
    "is_night",
    "is_business_hour",
    "is_business_hour_weekday",
    "is_business_hour_weekend",
    "is_monday",
    "is_tuesday",
    "is_wednesday",
    "is_thursday",
    "is_friday",
    "is_saturday",
    "is_sunday",
]

print(
    create_breakdown_summary(
        user_info_df,
        features=temporal_features,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

In [0]:
fig = create_breakdown_chart(
    user_info_df,
    features=temporal_features,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
user_info_df[temporal_features]

**Task completion time is definitely a good gauge. The majority of tasks take place during business hours and are spread relatively evenly across the work week. The LARGEST delta where wonky is more prevalent is in business hours, suggesting professional behaviours.**

#### Temporal Feature Analysis & Breakdowns - Testing

In [0]:
sorted(user_info_df.columns)

In [0]:

user_info_df[temporal_features + ['wonky_study_count']]

In [0]:
chi_square_results_temporal_features = perform_chi_square_tests(
    user_info_df,
    feature_set=temporal_features,
    group_var='wonky_study_count',
    significance_level=0.01
)
display(chi_square_results_temporal_features.reset_index())

In [0]:
ztest_results_temporal_feature = perform_two_proportion_z_tests(user_info_df, temporal_features, group_var='wonky_study_count')
display(ztest_results_temporal_feature[ztest_results_temporal_feature['significant']].reset_index())

In [0]:
df = ztest_results_temporal_feature.reset_index()

In [0]:
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=df["feature"],
        y=df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.8,
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=df["feature"],
        y=df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()



going with chi squared and z test due to binary outcome

chi is good for magnitude (just association) and z test give directionality 

chi2 - strong association across all but biggest magnitude is friday and business hours; weekend and mid week less of a signal >> business hours x [friday, monday, thursday] could be indicative 

ztest - Friday (huge effect), is_business_hour and is_business_hour_weekday have the largest impact for wonky studies. Lower-risk periods are Mondays, Thursdays and night.
- no strong signals from weekends and mid week (tuesday wednesday)

all pretty solid (largely due to large sample)

potential end of week rush effect for friday (earning targets, fatigue or rushing for beers)

friday shows 8pp difference pro wonky
business hours show 5-6pp difference pro wonky
- multi tasking with jobs or potentially using work devices (to investigate)
- monday, thursday dampen business hour effects.

business hour effect likely driven by friday effect

night time effect supported by chi2 and ztest 2.3pp difference

takeaway:
users start week with higher focus
mid week is pretty okay (nothing standing out too much)

may be an engagement & rushing problem with friday being main  culprit

things to consider: selection bias -> users complete surveys on these days at varying levels of engagement 
- responsible work focused people might do it on monday as a routine
- casual or rush people tend to do this on fridays
- therefore we would observe Monday as safer and Friday as riskier

In [0]:
from eda.statistical_tests import temporal_analyze_selection_bias

selection_bias_df = temporal_analyze_selection_bias(user_info_df, day_safe_col='is_monday', day_risky_col='is_friday', min_tasks=5)

selection_bias_df

controlling for the selection bias and looking at average wonky rates for friday being almost double monday! [using paired t test] suggesting the day effect might be real

In [0]:
main_features += list(temporal_features)

STRONG SIGNIFICANT READ AT 99% LEVEL LARGEST MAGNITUDE FOUND AT NIGHT. LOWEST MAGNITUDE DURING WEEKEND

**The bars show the level of significance between wonky and non-wonky; the line shows the deltas between wonky and non-wonky in terms of when tasks are completed.**

**A positive delta means wonky participants are more prevalent, and a negative delta means they are less prevalent.**

**Business hours, night time and Saturdays look like the best overall separators between wonky and non-wonky participants in terms of task completion time.**

#### Task speed features - EDA

In [0]:
# Cap extreme task times at the 99.99th percentile so a handful of anomalous
# durations do not throw off the average. clip() keeps missing values missing,
# whereas np.where(x < cap, x, cap) replaces every NaN with the cap value
# because NaN < cap evaluates to False.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

In [0]:
sorted(user_info_df.columns)

In [0]:
[col for col in user_info_df.columns if 'length' in col]

In [0]:
user_info_df['task_time_taken_s_capped'].max()

In [0]:
user_info_df

In [0]:
# user_info_df = create_task_speed_features(
#     user_info_df,
#     task_time_col="task_time_taken_s_capped",
#     use_std_dev=True,
#     group_by_col="task_length_of_task",
#     min_group_size=5    
# )

# mean_time = user_info_df["task_time_taken_s_capped"].mean()
# std_time = user_info_df["task_time_taken_s_capped"].std()
# print(f"Task time statistics:")
# print(f"  Mean: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
# print(f"  Std Dev: {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
# print(f"  Fast threshold (mean - 1σ): {mean_time - std_time:.2f}s")
# print(f"  Suspiciously fast threshold (mean - 2σ): {mean_time - 2*std_time:.2f}s")
# print(f"  Slow threshold (mean + 1σ): {mean_time + std_time:.2f}s")
# print(f"  Suspiciously slow threshold (mean + 2σ): {mean_time + 2*std_time:.2f}s")
# print()

# # Display breakdown with wonky vs non-wonky comparison
# print(create_task_speed_breakdown_summary(
#     user_info_df,
#     group_col='wonky_study_count',
#     group_threshold=0
# ))

**Wonky participants usually range from suspiciously fast to normal, whereas non-wonky participants tend to range from normal to suspiciously slow (in terms of delta).**

In [0]:
# NOTE(review): the speed flags are built with use_std_dev=True, while the
# thresholds printed below are percentiles computed in this cell — confirm they
# describe the cut-offs that create_task_speed_features actually applies.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

task_times = user_info_df["task_time_taken_s"]

# Percentile cut-offs reported for reference.
fast_threshold = task_times.quantile(0.16)
suspiciously_fast_threshold = task_times.quantile(0.025)
slow_threshold = task_times.quantile(0.84)
suspiciously_slow_threshold = task_times.quantile(0.975)

# Reference mean/std with extreme values clipped to the 1st/99th percentiles
# (outliers are capped at the bounds, not removed).
trimmed_data = task_times.clip(
    lower=task_times.quantile(0.01),
    upper=task_times.quantile(0.99)
)
mean_time = trimmed_data.mean()
std_time = trimmed_data.std()

print("Task time statistics (using percentiles, robust to outliers):")
print(f"  Mean (trimmed 1%-99%): {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev (trimmed 1%-99%): {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (16th percentile): {fast_threshold:.2f}s ({fast_threshold/60:.2f} min)")
print(f"  Suspiciously fast threshold (2.5th percentile): {suspiciously_fast_threshold:.2f}s ({suspiciously_fast_threshold/60:.2f} min)")
print(f"  Slow threshold (84th percentile): {slow_threshold:.2f}s ({slow_threshold/60:.2f} min)")
print(f"  Suspiciously slow threshold (97.5th percentile): {suspiciously_slow_threshold:.2f}s ({suspiciously_slow_threshold/60:.2f} min)")
print()

# Display breakdown with wonky vs non-wonky comparison.
# Prefer 'wonky_task_instances' as the grouping column when it exists.
group_col_to_use = 'wonky_task_instances' if 'wonky_task_instances' in user_info_df.columns else 'wonky_study_count'
print(create_task_speed_breakdown_summary(
    user_info_df,
    group_col=group_col_to_use,
    group_threshold=0
))

In [0]:
user_info_df['task_length_of_task'].unique()

#### Task speed features - Tests

In [0]:
speed_features = ['is_suspiciously_fast', 'is_fast', 'is_normal_speed', 'is_slow', 'is_suspiciously_slow']

In [0]:
user_info_df[speed_features + ['wonky_study_count']]

In [0]:
chi_square_results_task_speeds = perform_chi_square_tests(
    user_info_df,
    feature_set=speed_features,
    group_var='wonky_study_count',
    significance_level=0.01
)
display(chi_square_results_task_speeds.reset_index())

In [0]:
ztest_results_task_speeds = perform_two_proportion_z_tests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
display(ztest_results_task_speeds.reset_index())

In [0]:
main_features += speed_features

normal speed has highest magnitude and positive direction in z test -> counter intuitive -> problem could be comprehension rather than rushing/gaming

fast defined as -> 1 standard deviation faster than the average time of the group

fast seems relative safe and slow is safe suggesting might not be a speed thing and more comprehension issue


TODO - calibrate speeds to account for points or survey types (unsure which is best indicator - ask tim or dan)

TODO -> Come up with good Viz across test

#### Device - EDA

In [0]:
series = user_info_df['hardware_version']

# One-hot encode hardware_version. explode() leaves scalar cells untouched and
# splits list-valued cells into rows; groupby(level=0).sum() folds those rows
# back to one row per original index label.
hardware_dummies = (
    series.explode()
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
)
hardware_cols = hardware_dummies.columns

# Drop dummy columns left by a previous run so join() does not raise
# "columns overlap" when this cell is executed again.
user_info_df = user_info_df.drop(columns=hardware_cols, errors='ignore').join(hardware_dummies)

In [0]:
hardware_cols

In [0]:
print(
    create_breakdown_summary(
        user_info_df,
        features=hardware_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

huge delta for desktop, but this might be a sample thing. may converge better with 3-6 months of data.

#### Device - Tests

In [0]:
user_info_df[list(hardware_cols) + ['wonky_study_count']]

In [0]:
chi_square_results_hardware = perform_chi_square_tests(
    user_info_df,
    feature_set=hardware_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_hardware.reset_index())

In [0]:
ztest_results_hardware = perform_two_proportion_z_tests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
display(ztest_results_hardware[ztest_results_hardware['significant']].reset_index())

In [0]:
user_info_df['hardware_version'].unique()

In [0]:
user_info_df[['hardware_version']].fillna('unknown').value_counts()/len(user_info_df)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0][['hardware_version']].fillna('unknown').value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

High signal in desktops and iPhones; the others are more negligible. This is largely driven by the volume of general participants, but it seems these features need to be included in the model.

The desktop process is something to be investigated, especially as iPhone users usually tend to be high quality.

In [0]:
main_features += ['Desktop', 'Iphone']

#### Platform - EDA

In [0]:
series = user_info_df['platform_name']

# One-hot encode platform_name. explode() leaves scalar cells untouched and
# splits list-valued cells into rows; groupby(level=0).sum() folds those rows
# back to one row per original index label.
platform_dummies = (
    series.explode()
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
)

platform_cols = platform_dummies.columns

# Drop dummy columns left by a previous run so join() does not raise
# "columns overlap" when this cell is executed again.
user_info_df = user_info_df.drop(columns=platform_cols, errors='ignore').join(platform_dummies)

In [0]:
platform_cols

In [0]:
print(
    create_breakdown_summary(
        user_info_df,
        features=platform_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

In [0]:
user_info_df[['platform_name']].fillna('unknown').value_counts()/len(user_info_df)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0][['platform_name']].fillna('unknown').value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

#### Platform - Tests

In [0]:
user_info_df[list(platform_cols) + ['wonky_study_count']]

In [0]:
chi_square_results_platform = perform_chi_square_tests(
    user_info_df,
    feature_set=platform_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_platform.reset_index())

In [0]:
ztest_results_platform = perform_two_proportion_z_tests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
display(ztest_results_platform.reset_index())

Similarly to desktop, there is a big magnitude for Linux and iOS, but this could just be due to volumes.

Linux is a platform not used by most people, though >> needs a follow-up question
- it is sometimes used a lot by bot farmers etc.

In [0]:
main_features += ['Linux', 'iOS']

#### Gambling - EDA

In [0]:
series = user_info_df['gambling_participation_mc']

# One-hot encode each gambling mode: explode() puts every selected mode on its
# own row, get_dummies() builds the indicators, and groupby(level=0).sum()
# folds them back to one row per original index label.
gambling_dummies = (
    series.explode()
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
)

gambling_cols = gambling_dummies.columns

# Drop dummy columns left by a previous run so join() does not raise
# "columns overlap" when this cell is executed again.
user_info_df = user_info_df.drop(columns=gambling_cols, errors='ignore').join(gambling_dummies)

In [0]:
gambling_dummies

In [0]:
gambling_cols

In [0]:
print(
    create_breakdown_summary(
        user_info_df,
        features=gambling_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

In [0]:
fig = create_breakdown_chart(
    user_info_df,
    features=gambling_cols,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

#### Gambling - Tests

In [0]:
user_info_df[list(gambling_cols) + ['wonky_study_count']]

In [0]:
chi_square_results_gambling = perform_chi_square_tests(
    user_info_df,
    feature_set=gambling_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_gambling.reset_index())

In [0]:
ztest_results_gambling = perform_two_proportion_z_tests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
display(ztest_results_gambling.reset_index())

In [0]:
df = ztest_results_gambling.reset_index()

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=df["feature"],
        y=df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.8,
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=df["feature"],
        y=df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()


counter to the expected results almost every form of gambling associated with lower risk to wonky studies suggests less of a gaming issue again and more of a comprehension issue.

maybe only gamblers are actually better at reading and understanding what they're doing online (attention to detail etc) resulting in less wonky studies

In [0]:
main_features += list(gambling_cols)

#### Income - EDA

In [0]:
income_map = {
    "A": "Less than £15,000",
    "B": "£15,000 to £19,999",
    "C": "£20,000 to £24,999",
    "D": "£25,000 to £29,999",
    "E": "£30,000 to £34,999",
    "F": "£35,000 to £39,999",
    "G": "£40,000 to £44,999",
    "H": "£45,000 to £49,999",
    "I": "£50,000 to £59,999",
    "J": "£60,000 to £74,999",
    "K": "£75,000 to £84,999",
    "L": "£85,000 to £99,999",
    "M": "£100,000 to £124,999",
    "N": "£125,000 to £149,999",
    "O": "£150,000 to £174,999",
    "P": "£175,000 to £199,999",
    "Q": "£200,000 and above",
    "R": "Prefer not to answer",
}

user_info_df["fulcrum_household_income_mapped"] = (
    user_info_df["fulcrum_household_income"].map(income_map)
)

user_info_df["fulcrum_household_income_mapped"].value_counts()/len(user_info_df)

In [0]:
(user_info_df["fulcrum_household_income_mapped"].value_counts()/len(user_info_df)).T

In [0]:
user_info_df['fulcrum_household_income_mapped']

In [0]:
series = user_info_df['fulcrum_household_income_mapped']

# One-hot encode each mapped household-income bracket.
income_dummies = (
    series.explode()
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
)

income_cols = income_dummies.columns

# Drop dummy columns left by a previous run so join() does not raise
# "columns overlap" when this cell is executed again.
user_info_df = user_info_df.drop(columns=income_cols, errors='ignore').join(income_dummies)

In [0]:
import re

def map_income_to_bucket(income_str):
    """Collapse a detailed household-income label into a coarse income bucket.

    Parameters
    ----------
    income_str : str or missing value
        A bracket label such as "£25,000 to £29,999", "Less than £15,000",
        "£200,000 and above" or "Prefer not to answer". None/NaN is accepted.

    Returns
    -------
    str
        One of 'Less than £35k', '£35k to £49k', '£50k to £79k',
        '£80k to £99k', '£100k or more', 'Prefer not to answer' or 'Unknown'.
    """
    # Missing values share the label used for unparseable input, so downstream
    # one-hot encoding yields a single "Unknown" category instead of two spellings.
    if pd.isna(income_str):
        return 'Unknown'

    if 'Prefer not to answer' in income_str:
        return "Prefer not to answer"

    # The first pound amount in the label decides the bucket. For "X to Y"
    # labels that is the lower bound; for "Less than X" it is X itself.
    match = re.search(r'£([\d,]+)', income_str)
    if match:
        lower_bound = int(match.group(1).replace(',', ''))

        if lower_bound < 35000:
            return 'Less than £35k'
        elif lower_bound < 50000:
            return '£35k to £49k'
        elif lower_bound < 80000:
            return '£50k to £79k'
        elif lower_bound < 100000:
            return '£80k to £99k'
        else:
            return '£100k or more'

    return 'Unknown'

In [0]:
user_info_df['fulcrum_household_income_mapped'].apply(map_income_to_bucket)

In [0]:
# Coarse income bucket per respondent, derived from the detailed bracket label.
income_buckets = user_info_df['fulcrum_household_income_mapped'].apply(map_income_to_bucket)

# One indicator column per bucket, named "income_bucket_<bucket label>".
income_bucket_dummies = pd.get_dummies(income_buckets, prefix='income_bucket')

# NOTE(review): re-running this cell fails in join() because the indicator columns already exist.
user_info_df = user_info_df.join(income_bucket_dummies)

In [0]:
income_buckets.unique() 

In [0]:
# Income-bracket shares among respondents with at least one wonky study;
# the denominator is the number of such respondents.
(
    user_info_df
    .loc[user_info_df['wonky_study_count'] > 0, 'fulcrum_household_income_mapped']
    .pipe(lambda brackets: brackets.value_counts() / len(brackets))
)

In [0]:
user_info_df[list(income_cols) + ['wonky_study_count']]

In [0]:
# Breakdown chart of the income indicator columns, grouped on wonky_study_count
# with a threshold of 0.
# NOTE(review): presumably splits respondents into wonky_study_count > 0 vs the rest —
# confirm against create_breakdown_chart in eda.visualizations.
fig = create_breakdown_chart(
    user_info_df,
    features=income_cols,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

Most wonky studies come from respondents in the lower income groups.

In [0]:
income_cols

#### Income - Test

In [0]:
# Run the project's chi-square helper on every income indicator column,
# grouping on wonky_study_count, at a 1% significance level.
chi_square_results_income = perform_chi_square_tests(
    user_info_df,
    feature_set=income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

# All rows are shown here (no filter on significance).
display(chi_square_results_income.reset_index())

In [0]:
# Two-proportion z-tests for every income indicator column, grouped on wonky_study_count.
# NOTE(review): no significance level is passed, so the helper's default applies —
# confirm it matches the 0.01 used for the chi-square tests.
ztest_results_income = perform_two_proportion_z_tests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
display(ztest_results_income.reset_index())

In [0]:
df = ztest_results_income.reset_index()

In [0]:
def income_lower_bound(s):
    """Return a numeric sort key for an income-bracket label.

    "Less than ..." labels map to 0. "Prefer not ..." labels and anything
    unrecognised map to infinity. "X to Y" and "X and above" labels map to X
    as a float, after the pound sign and thousands separators are removed.
    """
    label = str(s)

    if "Less than" in label:
        return 0
    if "Prefer not" in label:
        return float("inf")

    digits = label.replace("£", "").replace(",", "")

    if "to" in digits:
        # Everything before the first "to" is the lower bound.
        lower, _, _ = digits.partition("to")
        return float(lower)

    if "and above" in digits:
        # Open-ended top bracket: the amount precedes the word "and".
        lower, _, _ = digits.partition("and")
        return float(lower)

    return float("inf")

In [0]:
df['feature'].map(income_lower_bound)

In [0]:
df['new_income_col'] = df['feature'].map(income_lower_bound)

In [0]:
df

In [0]:
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# df = df.sort_values(by='new_income_col')

# fig = make_subplots(specs=[[{"secondary_y": True}]])

# fig.add_trace(
#     go.Bar(
#         x=df["new_income_col"],
#         y=df["z_statistic"],
#         name="z statistic",
#         marker_color="steelblue",
#         opacity=0.8,
#     ),
#     secondary_y=False,
# )

# fig.add_trace(
#     go.Scatter(
#         x=df["new_income_col"],
#         y=df["proportion_diff"],
#         name="proportion diff",
#         mode="lines+markers",
#         line=dict(color="indianred", width=2),
#     ),
#     secondary_y=True,
# )

# fig.update_layout(
#     xaxis_title="Feature",
#     yaxis_title="z statistic",
#     legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
# )

# fig.update_yaxes(title_text="z statistic", secondary_y=False)
# fig.update_yaxes(title_text="proportion diff", secondary_y=True)

# fig.show()


In [0]:
# Numeric x position of each income band (the lower bound parsed into new_income_col).
df["x_num"] = df["new_income_col"]  # from earlier parsing

# Sort by income so the line trace runs left to right; a "lines+markers"
# trace connects points in row order, which otherwise follows the z-test output.
# NOTE(review): "Prefer not to answer" parses to inf — confirm how plotly
# renders a non-finite x value.
income_plot_df = df.sort_values("x_num")

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Bars: z statistic per income band (primary y axis).
fig.add_trace(
    go.Bar(
        x=income_plot_df["x_num"],
        y=income_plot_df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.8,
    ),
    secondary_y=False,
)

# Line: difference in proportions per income band (secondary y axis).
fig.add_trace(
    go.Scatter(
        x=income_plot_df["x_num"],
        y=income_plot_df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

# Label the numeric x positions with the original bracket names.
fig.update_layout(
    bargap=0.05,
    bargroupgap=0.0,
    xaxis=dict(
        title="Income band",
        tickmode="array",
        tickvals=income_plot_df["x_num"],
        ticktext=income_plot_df["feature"],
    ),
)

# Same y-axis titles as the other dual-axis z-test charts in this notebook.
fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.update_xaxes(tickangle=-45)

In [0]:
# Register the income indicator columns as candidate features.
# NOTE(review): `+=` appends on every execution, so re-running this cell duplicates the entries.
main_features += list(income_cols)

In [0]:
len(main_features)

#### Income Gender - EDA

In [0]:
series = user_info_df['gender']

# One-hot encode each gender value
gender_dummies = (
    series.explode()                    # list-like cells become one row per element; scalar cells are unchanged
     .str.strip()                   # trim surrounding whitespace from the labels
     .pipe(pd.get_dummies)          # one indicator column per distinct label
     .groupby(level=0).sum()     # collapse back to one row per original index label
)

# Prefix the columns so they read "gender_<value>" once joined to the main frame.
gender_dummies = gender_dummies.add_prefix('gender_')

gender_cols = gender_dummies.columns

# NOTE(review): re-running this cell fails in join() because the indicator columns already exist.
user_info_df = user_info_df.join(gender_dummies)

In [0]:
# Register the gender indicator columns as candidate features, then show the full list.
# NOTE(review): `+=` appends on every execution, so re-running this cell duplicates the entries.
main_features += list(gender_cols) 
main_features

In [0]:
# Combined "<gender>_<income bracket>" label per respondent.
# NOTE(review): astype(str) turns missing values into the literal text "nan", so a row with a
# missing gender or bracket still gets a label (e.g. ending in "_nan") — confirm that is intended.
user_info_df['gender_fulcrum_household_income_mapped'] = user_info_df['gender'].astype(str) + "_" + user_info_df['fulcrum_household_income_mapped'].astype(str)

In [0]:

series = user_info_df['gender_fulcrum_household_income_mapped']

# One-hot encode each gender x income-bracket combination
gender_income_dummies = (
    series.explode()                    # list-like cells become one row per element; scalar cells are unchanged
     .str.strip()                   # trim surrounding whitespace from the labels
     .pipe(pd.get_dummies)          # one indicator column per distinct label
     .groupby(level=0).sum()     # collapse back to one row per original index label
)

gender_income_cols = gender_income_dummies.columns

# NOTE(review): re-running this cell fails in join() because the indicator columns already exist.
user_info_df = user_info_df.join(gender_income_dummies)

In [0]:
(user_info_df["gender_fulcrum_household_income_mapped"].value_counts() / len(user_info_df))

In [0]:
# Gender x income shares among respondents with at least one wonky study;
# the denominator is the number of such respondents.
(
    user_info_df
    .loc[user_info_df['wonky_study_count'] > 0, 'gender_fulcrum_household_income_mapped']
    .pipe(lambda labels: labels.value_counts() / len(labels))
)

In [0]:
user_info_df[list(gender_income_cols) + ['wonky_study_count']]

In [0]:
# Breakdown chart of the gender x income indicator columns, grouped on
# wonky_study_count with a threshold of 0.
# NOTE(review): presumably splits respondents into wonky_study_count > 0 vs the rest —
# confirm against create_breakdown_chart in eda.visualizations.
fig = create_breakdown_chart(
    user_info_df,
    features=gender_income_cols,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

There is a strong cut-off for the "female, less than £15k" category.

#### Income Gender - Tests

In [0]:
# Run the project's chi-square helper on every gender x income indicator column,
# grouping on wonky_study_count, at a 1% significance level.
chi_square_results_income_gender = perform_chi_square_tests(
    user_info_df,
    feature_set=gender_income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

# Show only the rows whose 'significant' flag is set.
display(chi_square_results_income_gender[chi_square_results_income_gender['significant']].reset_index())

In [0]:
# Two-proportion z-tests for every gender x income indicator column, grouped on wonky_study_count.
# NOTE(review): no significance level is passed, so the helper's default applies —
# confirm it matches the 0.01 used for the chi-square tests.
ztest_results_income_gender = perform_two_proportion_z_tests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
# Show only the rows whose 'significant' flag is set.
display(ztest_results_income_gender[ztest_results_income_gender['significant']].reset_index())

In [0]:
# Preview of the shortened gender x income labels used for plotting.
# Reads the gender x income z-test results directly instead of whatever `df`
# currently holds, and uses literal (regex=False) replacements so the result
# does not depend on the pandas default for `regex`.
(
    ztest_results_income_gender.reset_index()["feature"]
    .str.replace("prefer not to say", "no_answer", regex=False)
    .str.replace("Prefer not to answer", "no_answer", regex=False)
    .str.replace(",000", "k", regex=False)
    .str.replace(",999", "k", regex=False)
    .str.replace(" and above", "+", regex=False)
    .str.replace("Less than ", "<", regex=False)
    # "female_" must be replaced before "male_", which is a substring of it.
    .str.replace("female_", "F_", regex=False)
    .str.replace("male_", "M_", regex=False)
)

In [0]:

import re
import numpy as np

# assume df is your DataFrame and 'feature' is the column name
def extract_max_salary(s: str) -> float:
    """Return the upper salary bound, in pounds, encoded in a shortened label.

    Handles the three label shapes produced by the label-shortening step:
    "£15k to £19k" (range, returns the upper bound), "<£15k" (returns the
    stated bound) and "£200k+" (open-ended, returns the stated bound).

    Parameters
    ----------
    s : str
        Shortened feature label, optionally with a prefix such as "F_" or "M_".

    Returns
    -------
    float
        The bound in pounds, or NaN when no salary is found or `s` is not a string.
    """
    if not isinstance(s, str):
        return np.nan

    # The range pattern must be tried first: the single-amount pattern below
    # also matches the lower bound of a range and would hide this branch.
    m = re.search(r"£(\d+)\s*k\s*to\s*£(\d+)\s*k", s)
    if m:
        return float(m.group(2)) * 1000

    m = re.search(r"<£(\d+)\s*k", s)
    if m:
        return float(m.group(1)) * 1000

    # Single amount, optionally open-ended ("£200k+").
    m = re.search(r"£(\d+)\s*k\+?", s)
    if m:
        return float(m.group(1)) * 1000

    return np.nan

df["max_salary"] = df["feature"].apply(extract_max_salary)

In [0]:
df

In [0]:
# Dual-axis chart of the gender x income z-test results, ordered by salary.
df = ztest_results_income_gender.reset_index()

# Shorten the labels so they fit on the x axis. "female_" is replaced before
# "male_" because "male_" is a substring of "female_".
df["feature"] = (
    df["feature"]
    .str.replace("prefer not to say", "no_answer")
    .str.replace("Prefer not to answer", "no_answer")
    .str.replace(",000", "k")
    .str.replace(",999", "k")
    .str.replace(" and above", "+")
    .str.replace("Less than ", "<")
    .str.replace("female_", "F_")
    .str.replace("male_", "M_")
)

# Numeric sort key parsed from the shortened label; labels with no salary get NaN.
df["max_salary"] = df["feature"].apply(extract_max_salary)

# sort_values places NaN keys last by default, so labels without a salary end up on the right.
df = df.sort_values('max_salary')

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Bars: z statistic per cohort (primary y axis).
fig.add_trace(
    go.Bar(
        x=df["feature"],
        y=df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.8,
    ),
    secondary_y=False,
)

# Line: difference in proportions per cohort (secondary y axis).
fig.add_trace(
    go.Scatter(
        x=df["feature"],
        y=df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()


aim is to determine if there is a socio economic & gender divide in survey quality

Lower incomes seem to carry significantly higher risk (again, possibly a comprehension issue or difficulty understanding the task).

The biggest safe zones are incomes of £100k+ (the £200k+ band has the lowest risk, −5.1pp, with strong significance).

High income may correlate with less stress about money, better-quality answers, higher tech literacy, and motivations other than money for filling in surveys.

highest risk in the 25k-30k category (+2.8pp)

High-earning males are statistically the safest demographic -> a tech-literate, professional demographic

riskiest cohorts:
- females 45k-50k (+1.8pp)

safe cohorts (both genders 100k+)
less safe (males < 100k, females 25k-50k)
high risk (females 25k-50k)

professionals with high income correlate with high quality.

low income shouldn't be blocked but managed >> consider other tests or qualifiers for comprehension ability

OVERALL FROM BASIC STAT TESTS KEY TAKE AWAYS

Days Active: Day 7 is critical risk.

Temporal: Friday & Business Hours are risky; Nights are safe.

Speed: Normal speed is paradoxically risky; Fast is safe.

Device/Platform: Desktop/Linux/iPhone are critical risks.

Gambling: Gamblers are safe/smart users.

Demographics: High earners are safe.

In [0]:
sorted(user_info_df.columns)

In [0]:
# Attach each respondent's income indicators and gender to the wonky summary.
# NOTE(review): assumes wonky_respondent_summary provides exposure_band and has no
# gender column of its own (merge would otherwise suffix both) — confirm.
wonky_inc = wonky_respondent_summary.merge(
    user_info_df[list(income_cols) + ['respondentPk', 'gender']],
    on='respondentPk',
    how='left',
)

# Reshape to one row per (respondent, income bracket) with its 0/1 indicator.
income_long = (
    wonky_inc[["respondentPk", "exposure_band", "gender"] + list(income_cols)]
      .melt(
          id_vars=["respondentPk", "exposure_band", "gender"],
          value_vars=income_cols,
          var_name="income_bracket",
          value_name="flag",
      )
)

# Keep only the bracket each respondent actually belongs to.
income_long = income_long[income_long["flag"] == 1]

# Distinct respondents per income bracket, exposure band and gender.
counts = (
    income_long
    .groupby(["income_bracket", "exposure_band", "gender"])
    .agg(n=("respondentPk", "nunique"))
    .reset_index()
)

# Percentage of each (exposure band, gender) group that falls in each bracket.
counts["group_total"] = (
    counts
    .groupby(["exposure_band", "gender"])["n"]
    .transform("sum")
)
counts["pct"] = counts["n"] / counts["group_total"] * 100

# Use the display() function, as elsewhere in this notebook; pandas frames
# have no .display() method of their own.
display(counts)

In [0]:
# Use the display() function, as elsewhere in this notebook; pandas frames
# have no .display() method of their own.
display(wonky_inc)

#### Risk

In [0]:
# 0/1 indicator columns for the risk score: exactly 100, and below 90 / 80 / 50.
# Rows where the comparison is not true (including missing scores) get 0.
user_info_df['risk=100'] = np.where(user_info_df['risk'].eq(100), 1, 0)
user_info_df['risk<90'] = np.where(user_info_df['risk'].lt(90), 1, 0)
user_info_df['risk<80'] = np.where(user_info_df['risk'].lt(80), 1, 0)
user_info_df['risk<50'] = np.where(user_info_df['risk'].lt(50), 1, 0)

In [0]:
risk_features = ['risk=100', 'risk<90', 'risk<80', 'risk<50']

In [0]:
# Breakdown chart of the risk-band indicator columns, grouped on
# wonky_study_count with a threshold of 0.
fig = create_breakdown_chart(
    user_info_df,
    features=risk_features,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
# Two-proportion z-tests for the risk-band indicators, grouped on wonky_study_count.
ztest_results_risk = perform_two_proportion_z_tests(
    user_info_df, risk_features, group_var="wonky_study_count"
)
# Show only the rows whose 'significant' flag is set.
display(ztest_results_risk[ztest_results_risk["significant"]].reset_index())

In [0]:
##### DELTA TEST

In [0]:
# Add the risk delta via the project helper; the code below reads it from
# the "risk_delta" column.
user_info_df = add_rating_delta(df=user_info_df, feature='risk', delta_period='max')

print(f'new array of unique deltas created: {user_info_df["risk_delta"].unique()}')

# 0/1 indicators for the sign and size of the delta. The "Large" flags are
# subsets of the plain positive/negative flags (|delta| above 50).
user_info_df['risk_delta_LargePostive'] = np.where(user_info_df['risk_delta'].gt(50), 1, 0)
user_info_df['risk_delta_Postive'] = np.where(user_info_df['risk_delta'].gt(0), 1, 0)
user_info_df['risk_delta_Neutral'] = np.where(user_info_df['risk_delta'].eq(0), 1, 0)
user_info_df['risk_delta_LargeNegative'] = np.where(user_info_df['risk_delta'].lt(-50), 1, 0)
user_info_df['risk_delta_Negative'] = np.where(user_info_df['risk_delta'].lt(0), 1, 0)

In [0]:
# Indicator columns for the risk delta. The "Postive" spelling must match the
# column names created in the previous cell.
risk_delta_features = ['risk_delta_LargePostive', 'risk_delta_Postive', 'risk_delta_Neutral', 'risk_delta_LargeNegative', 'risk_delta_Negative']

# Breakdown chart of those indicators, grouped on wonky_study_count with a threshold of 0.
fig = create_breakdown_chart(
    user_info_df,
    features=risk_delta_features,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
# Two-proportion z-tests for the risk-delta indicators, grouped on wonky_study_count.
ztest_results_risk_delta = perform_two_proportion_z_tests(
    user_info_df, risk_delta_features, group_var="wonky_study_count"
)
# All rows are shown here (no filter on significance).
display(ztest_results_risk_delta.reset_index())

#### Quality

In [0]:
# 0/1 indicator columns for the quality score: exactly 100, and below 90 / 75 / 50 / 30.
# Rows where the comparison is not true (including missing scores) get 0.
user_info_df["quality=100"] = np.where(user_info_df["quality"].eq(100), 1, 0)
user_info_df["quality<90"] = np.where(user_info_df["quality"].lt(90), 1, 0)
user_info_df["quality<75"] = np.where(user_info_df["quality"].lt(75), 1, 0)
user_info_df["quality<50"] = np.where(user_info_df["quality"].lt(50), 1, 0)
user_info_df["quality<30"] = np.where(user_info_df["quality"].lt(30), 1, 0)

In [0]:
# Indicator columns for the quality-score bands created in the previous cell.
quality_features = ["quality=100", "quality<90", "quality<75", "quality<50", "quality<30"]

# Breakdown chart of those indicators, grouped on wonky_study_count with a threshold of 0.
fig = create_breakdown_chart(
    user_info_df,
    features=quality_features,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
# Two-proportion z-tests for the quality-band indicators, grouped on wonky_study_count.
ztest_results_quality = perform_two_proportion_z_tests(
    user_info_df, quality_features, group_var="wonky_study_count"
)
# Show only the rows whose 'significant' flag is set.
display(ztest_results_quality[ztest_results_quality["significant"]].reset_index())

In [0]:
### DELTA'S CHECK

In [0]:
# Add the quality delta via the project helper; the code below reads it from
# the "quality_delta" column.
user_info_df = add_rating_delta(df=user_info_df, feature='quality', delta_period='max')

print(f'new array of unique deltas created: {user_info_df["quality_delta"].unique()}')

# 0/1 indicators for the sign and size of the delta. The "Large" flags are
# subsets of the plain positive/negative flags (|delta| above 50).
user_info_df['quality_delta_LargePostive'] = np.where(user_info_df['quality_delta'].gt(50), 1, 0)
user_info_df['quality_delta_Postive'] = np.where(user_info_df['quality_delta'].gt(0), 1, 0)
user_info_df['quality_delta_Neutral'] = np.where(user_info_df['quality_delta'].eq(0), 1, 0)
user_info_df['quality_delta_LargeNegative'] = np.where(user_info_df['quality_delta'].lt(-50), 1, 0)
user_info_df['quality_delta_Negative'] = np.where(user_info_df['quality_delta'].lt(0), 1, 0)

In [0]:
# Indicator columns for the quality delta. The "Postive" spelling must match the
# column names created in the previous cell.
quality_delta_features = ['quality_delta_LargePostive', 'quality_delta_Postive', 'quality_delta_Neutral', 'quality_delta_LargeNegative', 'quality_delta_Negative']

# Breakdown chart of those indicators, grouped on wonky_study_count with a threshold of 0.
fig = create_breakdown_chart(
    user_info_df,
    features=quality_delta_features,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
# Two-proportion z-tests for the quality-delta indicators, grouped on wonky_study_count.
ztest_results_quality_delta = perform_two_proportion_z_tests(
    user_info_df, quality_delta_features, group_var="wonky_study_count"
)
# All rows are shown here (no filter on significance).
display(ztest_results_quality_delta.reset_index())

In [0]:
# user_info_df[~user_info_df['exposure_band'].isna()]['quality'].value_counts() 

# user_info_df[~user_info_df['exposure_band'].isna()]['quality'].value_counts() / len(user_info_df[~user_info_df['exposure_band'].isna()])

# user_info_df['quality'].value_counts() / len(user_info_df)

# (user_info_df['quality'].fillna(0).value_counts() / len(user_info_df)).reset_index().display()

# (user_info_df[user_info_df['wonky_study_count'] > 0]['quality'].value_counts() / len(user_info_df[user_info_df['wonky_study_count'] > 0])).reset_index().display()

#### Exposure bands

In [0]:
user_info_df['exposure_band'].value_counts()/len(user_info_df)

In [0]:
wonky_respondent_df['exposure_band'].value_counts()/len(wonky_respondent_df)

Correlations are weak across the board; a few device-related ones are stronger, but nothing large enough to rely on.

Next stage run analysis on 3 months of data 

In [0]:
# Persist the enriched respondent frame to the misc directory for the next analysis stage.
# The config key is indexed directly (as for the other output paths in this
# notebook), so a missing entry raises a KeyError naming the key rather than
# a TypeError from basename(None).
user_info_2_v2_path = os.path.join(
    misc_dir,
    os.path.basename(paths_config['output_files']['user_info_df_post_eda']),
)

user_info_df.to_parquet(user_info_2_v2_path, index=False)