### create dataset

In [9]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta

def generate_contributor_data(num_sessions=500, num_contributors=30, seed=42):
    """
    Generate a synthetic dataset in which each row is a single labeling
    'session' performed by one contributor.

    Session start times are drawn within 2023. ``cumulative_hours_worked`` is
    the contributor's running total of session hours taken in chronological
    order of ``start_time`` (current session included), so for any contributor
    it never decreases over time. Rows are returned in ``session_id`` order,
    which is the order of generation and not chronological.

    Side effects: reseeds the global ``random`` and ``numpy.random`` generators
    with ``seed``.

    :param num_sessions: Total number of rows (sessions) to generate
    :param num_contributors: Size of the contributor pool sessions are drawn from
    :param seed: Random seed for reproducibility
    :return: A pandas DataFrame with synthetic contributor data; if no sessions
             are generated the frame is empty but still carries every column
    :raises IndexError: if ``num_contributors`` < 1 while ``num_sessions`` >= 1
                        (``random.choice`` has nothing to pick from)
    """

    random.seed(seed)
    # No NumPy draws happen inside this function; seeding it here also makes any
    # np.random use that follows the call reproducible.
    np.random.seed(seed)

    # Output schema, in column order. Passed to the DataFrame constructor so an
    # empty result keeps the same columns as a populated one.
    columns = [
        "session_id", "contributor_id", "start_time", "end_time",
        "tasks_completed", "avg_label_correctness_score",
        "benchmark_tasks_used", "benchmark_accuracy", "time_per_task",
        "cumulative_hours_worked", "task_difficulty_level", "task_type",
        "onboarding_completed", "join_date", "relabeled_count",
        "relabel_cost", "payout", "platform", "region",
    ]

    # -- Basic setup for random data generation --
    contributors = [f"C{i+1}" for i in range(num_contributors)]
    region_choices = ["USA", "APAC", "EMEA", "LATAM"]
    platform_choices = ["Platform A", "Platform B"]
    difficulty_choices = ["easy", "medium", "hard"]
    task_type_choices = ["text_labeling", "image_bounding", "3d_point_cloud", "video_labeling"]

    # Each contributor gets a random 'join_date' between 2022-01-01 and 2023-12-31.
    # NOTE(review): join_date is drawn independently of the session times below,
    # so a session can start before its contributor's join_date.
    start_2022 = datetime(2022, 1, 1)
    end_2023 = datetime(2023, 12, 31)

    def random_date(start, end):
        """Return ``start`` plus a whole number of days, at most ``(end - start).days``."""
        delta = end - start
        rand_days = random.randint(0, delta.days)
        return start + timedelta(days=rand_days)

    contributor_join_dates = {
        c: random_date(start_2022, end_2023) for c in contributors
    }

    data = []
    # Session length in minutes for data[i]; kept outside the row dicts because
    # it is needed for the cumulative-hours pass but is not an output column.
    session_minutes = []

    # Sessions are generated in the year 2023 for simplicity
    start_2023 = datetime(2023, 1, 1)
    end_2023_dt = datetime(2023, 12, 31)
    delta_days = (end_2023_dt - start_2023).days

    for session_id in range(1, num_sessions + 1):
        # 1) Pick a random contributor
        c = random.choice(contributors)

        # 2) Random start time: a day offset within 2023 plus hour and minute
        session_start = start_2023 + timedelta(
            days=random.randint(0, delta_days),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59)
        )

        # 3) Random session length in minutes (20 to 180 minutes)
        session_length = random.randint(20, 180)
        session_end = session_start + timedelta(minutes=session_length)

        # 4) Tasks completed in this session
        tasks_completed = random.randint(20, 100)

        # 5) Average correctness score for all tasks in this session
        avg_label_correctness_score = round(random.uniform(0.4, 1.0), 3)

        # 6) Benchmark tasks used (0-5), but not more than tasks_completed
        benchmark_tasks_used = min(random.randint(0, 5), tasks_completed)

        # 7) Benchmark accuracy: correctness score plus noise in [-0.1, 0.1],
        #    clamped to the 0-1 range
        benchmark_accuracy = round(
            avg_label_correctness_score + random.uniform(-0.1, 0.1), 3
        )
        benchmark_accuracy = max(0.0, min(1.0, benchmark_accuracy))

        # 8) Time per task (seconds)
        time_per_task = (session_length * 60) / tasks_completed

        # 9) Additional metadata (drawn per session, not per contributor)
        region = random.choice(region_choices)
        platform = random.choice(platform_choices)
        difficulty = random.choice(difficulty_choices)
        task_type = random.choice(task_type_choices)
        onboarding_completed = (random.random() < 0.7)  # ~70% chance
        join_date = contributor_join_dates[c]

        # 10) Relabel stats & cost
        relabeled_count = random.randint(0, 5)
        relabel_cost = round(relabeled_count * random.uniform(0.5, 1.5), 2)

        # 11) Payment/payout to contributor
        payout = round(tasks_completed * random.uniform(0.05, 0.2), 2)

        # Construct the row; cumulative_hours_worked is filled in after the loop
        data.append({
            "session_id": session_id,
            "contributor_id": c,
            "start_time": session_start,
            "end_time": session_end,
            "tasks_completed": tasks_completed,
            "avg_label_correctness_score": avg_label_correctness_score,
            "benchmark_tasks_used": benchmark_tasks_used,
            "benchmark_accuracy": benchmark_accuracy,
            "time_per_task": round(time_per_task, 2),
            "cumulative_hours_worked": None,
            "task_difficulty_level": difficulty,
            "task_type": task_type,
            "onboarding_completed": onboarding_completed,
            "join_date": join_date,
            "relabeled_count": relabeled_count,
            "relabel_cost": relabel_cost,
            "payout": payout,
            "platform": platform,
            "region": region
        })
        session_minutes.append(session_length)

    # Running totals have to follow the timeline rather than generation order:
    # start times are drawn independently per row, so a later row can predate
    # an earlier one. Ties on start_time fall back to generation order.
    hours_so_far = {c: 0.0 for c in contributors}
    timeline = sorted(range(len(data)), key=lambda i: (data[i]["start_time"], i))
    for i in timeline:
        row = data[i]
        contributor = row["contributor_id"]
        hours_so_far[contributor] += session_minutes[i] / 60.0
        row["cumulative_hours_worked"] = round(hours_so_far[contributor], 2)

    # Convert to DataFrame
    return pd.DataFrame(data, columns=columns)

# Example usage: generate a dataset with 1000 sessions from a pool of 50 contributors and preview
df_contributors = generate_contributor_data(num_sessions=1000, num_contributors=50, seed=42)
print(df_contributors.head(10))
print("\nDataFrame shape:", df_contributors.shape)
print("\nColumn Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df_contributors.info()


   session_id contributor_id          start_time            end_time  \
0           1            C35 2023-03-05 12:05:00 2023-03-05 14:46:00   
1           2            C15 2023-02-21 12:17:00 2023-02-21 14:33:00   
2           3            C41 2023-12-19 17:14:00 2023-12-19 18:57:00   
3           4            C42 2023-09-13 12:56:00 2023-09-13 15:13:00   
4           5             C4 2023-02-26 04:40:00 2023-02-26 05:40:00   
5           6            C18 2023-11-25 10:07:00 2023-11-25 11:42:00   
6           7            C13 2023-03-20 11:48:00 2023-03-20 12:49:00   
7           8            C37 2023-02-10 02:46:00 2023-02-10 05:10:00   
8           9            C35 2023-12-20 06:45:00 2023-12-20 08:24:00   
9          10            C15 2023-01-04 02:45:00 2023-01-04 03:20:00   

   tasks_completed  avg_label_correctness_score  benchmark_tasks_used  \
0               57                        0.898                     4   
1               66                        0.498              

# data analysis

In [15]:
# Work on an independent copy so the generated frame itself stays untouched.
df = df_contributors.copy()
# Number of non-null contributor_id entries; df.describe() or df.head(10)
# can be swapped in here for a broader look at the data.
df["contributor_id"].count()

1000