### Imports and Reading in Data

In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the raw task table from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="automatedSplit_tasks.csv")

### Sorting data into tasks chronologically

In [None]:
# Create unique task ID by concatenating the decimal digits of seg_id and ID.
# NOTE(review): plain concatenation is ambiguous when digit counts vary
# (seg_id=1, ID=23 and seg_id=12, ID=3 both give 123) -- confirm this cannot
# happen in this dataset before relying on the key being unique.
data['unique_task_id'] = data.apply(lambda row: int(str(row.seg_id) + str(row.ID)), axis=1)

# Put the rows of each unique task ID next to each other
data = data.sort_values("unique_task_id")

# Parse the opened column as datetimes and store the int64 view of those
# values, so the timestamps can be averaged numerically
data.opened = pd.to_datetime(data.opened).values.astype(np.int64)

# Group data by unique task ids
grouped_data = data.groupby("unique_task_id")

# Mean opened time of each task, broadcast back onto every row of that task.
# transform("mean") computes each group's mean once, instead of doing a
# get_group() lookup and a fresh mean for every single row.
data['avg_opened_time'] = grouped_data["opened"].transform("mean")

# Sort the tasks chronologically by their mean opened time. The stable sort
# keeps rows with equal averages in their previous (task-grouped) order, so
# the resulting row order is reproducible between runs.
data = data.sort_values("avg_opened_time", kind="stable")

data

### Filtering data to only include assignees that have completed all tasks

In [None]:
# Grouping the data by unique_task_id
grouped_data = data.groupby("unique_task_id")

# Collect one record per unique task: the task id, how many distinct
# assignees appear on it, and the array of those assignees (in order of
# first appearance). Records are gathered in a list and turned into a
# DataFrame once, rather than enlarging the frame one row at a time.
task_records = []
for task_id, task_rows in grouped_data:
    assignees = pd.unique(task_rows.assignee_masked)
    task_records.append({
        'unique_task_id': task_id,
        'num_assignees': len(assignees),
        'assignees': assignees,
    })

# Passing `columns` keeps the expected layout even when there are no tasks
task_assignees = pd.DataFrame(
    task_records, columns=['unique_task_id', 'num_assignees', 'assignees'])

task_assignees


In [None]:
# Grouping data by each assignee
# Through doing this, we saw that assignee 0 and the expert proofreaders did not complete all 90 tasks. The next step is to filter out assignee 0 and the expert proofreaders.

grouped_by_assignees = data.groupby('assignee_masked')

# Collect one record per assignee and build the DataFrame once at the end,
# rather than enlarging it one row at a time.
assignee_records = []
for assignee, assignee_rows in grouped_by_assignees:
    task_ids = list(assignee_rows.unique_task_id)
    assignee_records.append({
        'assignee_name': assignee,
        # every unique_task_id row belonging to this assignee
        'tasks': task_ids,
        # Count distinct tasks rather than rows, so a repeated row for the
        # same task cannot inflate the count that the "completed all tasks"
        # filter compares against.
        'num_tasks': len(set(task_ids)),
    })

# Passing `columns` keeps the expected layout even when there are no assignees
assignees_tasks = pd.DataFrame(
    assignee_records, columns=['assignee_name', 'tasks', 'num_tasks'])

assignees_tasks

##### From the above cell, we saw that assignee 0 and the expert proofreaders did not complete all 90 tasks. The next step is to filter out assignee 0 and the expert proofreaders.

In [None]:
# Keep only the assignees whose task count equals the largest task count
# found in assignees_tasks; everyone with fewer tasks is dropped.
# Should filter out assignee 0 and expert proofreaders (100,101,102,103,104,105,106)
max_task_count = assignees_tasks.num_tasks.max()
has_max_tasks = assignees_tasks.num_tasks == max_task_count
assignees_to_keep = list(assignees_tasks[has_max_tasks].assignee_name)

# Restrict the task rows to the retained assignees
keep_row = data['assignee_masked'].isin(assignees_to_keep)
data = data[keep_row]

data

In [None]:
# Three-way split of the tasks, taken in the order the task ids first appear
# in `data` (expected to be chronological at this point).
# The first two sets each get floor(num_tasks / 3) tasks; the third set takes
# everything that is left, including any remainder.
ordered_task_ids = list(pd.unique(data.unique_task_id))
num_tasks = len(ordered_task_ids)
third = num_tasks // 3

tasks_for_set_1 = ordered_task_ids[:third]
tasks_for_set_2 = ordered_task_ids[third:2 * third]
tasks_for_set_3 = ordered_task_ids[2 * third:]

# Select the rows belonging to each set of tasks
data_set_1_of_3, data_set_2_of_3, data_set_3_of_3 = (
    data[data['unique_task_id'].isin(task_ids)]
    for task_ids in (tasks_for_set_1, tasks_for_set_2, tasks_for_set_3)
)


In [None]:
# Exporting the full data df and the equi-length chronologically split dfs to CSVs

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there)
os.makedirs('./filtered_data', exist_ok=True)

data.to_csv('./filtered_data/dataset_full_filteredasignees.csv')
data_set_1_of_3.to_csv('./filtered_data/dataset_1_of_3_filteredasignees.csv')
data_set_2_of_3.to_csv('./filtered_data/dataset_2_of_3_filteredasignees.csv')
data_set_3_of_3.to_csv('./filtered_data/dataset_3_of_3_filteredasignees.csv')