In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the tab-separated export of the submissions into a DataFrame.
file_path = 'submissions.tsv'
df = pd.read_csv(file_path, sep="\t")

# Preview the leading rows as a quick sanity check on the load.
df.head()


In [None]:
import json

def clean_json(x):
    """Decode one "Status Durations" cell into a Python object.

    Intended for use with ``Series.apply``. The text is tried, in order, as:

    1. strict JSON (so apostrophes inside keys/values are left intact),
    2. a Python literal such as ``{'Open': 3600, 'Closed': None}``,
    3. JSON after replacing every single quote with a double quote (the
       historical behaviour, kept for inputs that mix single-quoted keys
       with JSON tokens such as ``null``/``true``).

    Args:
        x: The cell text. ``None``, a float NaN (an empty cell as loaded by
            pandas) or a blank string is treated as "no data".

    Returns:
        The decoded object (a dict for well-formed cells); ``{}`` when the
        cell is missing or blank.

    Raises:
        ValueError: (``json.JSONDecodeError``) if none of the strategies can
            decode the text.
    """
    import ast  # local import: only needed for the Python-literal fallback

    # `x != x` is true only for NaN, which is how pandas marks an empty cell.
    if x is None or (isinstance(x, float) and x != x):
        return {}

    text = x.strip()
    if not text:
        return {}

    try:
        return json.loads(text)
    except ValueError:
        pass  # not strict JSON; fall through to the more lenient decoders

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        pass  # not a plain Python literal either

    # Last resort: the original quote-swapping approach.
    return json.loads(text.replace("'", '"'))

# Decode the text of every "Status Durations" cell with clean_json and keep
# the decoded objects in a new column next to the raw text.
raw_status_durations = df['Status Durations']
df['Status Durations Parsed'] = raw_status_durations.apply(clean_json)

In [None]:
# Collect every status name that occurs in any row.  Each row holds key/value
# pairs of status -> total seconds spent in that status, so the union of all
# keys is the complete list of statuses.
#
# The Series is iterated directly rather than through Series.agg(<callable>):
# agg only applied the callable element-wise via a fallback that pandas has
# deprecated.  Sorting (by str, in case a key is not a string) makes the
# column order reproducible across kernel restarts instead of depending on
# set iteration order.
parsed_durations = df['Status Durations Parsed']
statuses = sorted(
    {status for durations in parsed_durations for status in durations},
    key=str,
)
status_cols = ['Status {}'.format(s) for s in statuses]

# One column per status; dict.get returns None for rows that never entered
# that status, so those cells end up as missing values.
for s, col in zip(statuses, status_cols):
    df[col] = parsed_durations.apply(lambda durations: durations.get(s))
df.head()

In [None]:
# Average each status column within every 'Type' group, then convert the
# per-status means from seconds to hours.
SECONDS_PER_HOUR = 3600
per_status_columns = [f'Status {s}' for s in statuses]
mean_seconds_by_type = df.groupby('Type')[per_status_columns].mean()
type_statuses = mean_seconds_by_type / SECONDS_PER_HOUR
type_statuses

In [None]:
# Draw one bar-chart panel per type: after transposing, the statuses form the
# x axis and each type is a column, which subplots=True puts in its own panel.
# Values are the mean durations in hours.
durations_by_status = type_statuses.transpose()
durations_by_status.plot(kind='bar', subplots=True, figsize=(15, 50))

In [None]:
# For each type, chart only the statuses that actually occur (non-missing
# means), ordered from longest to shortest so the slowest stage comes first.
for t, cols in type_statuses.iterrows():
    sorted_cols = cols.dropna().sort_values(ascending=False)
    if sorted_cols.empty:
        continue  # no status data for this type; nothing to draw

    # Use an explicit figure/axes instead of the implicit pyplot state so the
    # labels below are guaranteed to land on this chart.
    fig, ax = plt.subplots(figsize=(15, 10))

    # The former y='all' argument is omitted: a Series has a single set of
    # values to plot, so there is no column to select.  rot=45 tilts the
    # status labels on the x axis.
    sorted_cols.plot(kind='bar', ax=ax, title=t, rot=45)
    ax.set_xlabel("Status")
    ax.set_ylabel("Duration (hours)")
    plt.show()

    # Release the figure explicitly so looping over many types does not
    # accumulate open figures.
    plt.close(fig)