# A Minute of Your Time: Data Analysis

In [None]:
import datetime
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scripts import azure_repos, time_helpers

## Load the data

In [None]:
# Set this to the location of your data file.
# The default is a relative path (presumably a mock data set, judging by the
# file name); the value is passed unchanged to azure_repos.load_data().
data_file_location = '../../data/mock-data.json'

In [None]:
# Load the pull-request records from the configured data file.
# NOTE(review): load_data() is a project helper not shown here. The cells
# below treat its result as a pandas DataFrame with at least the columns
# 'merged_time', 'ttl', 'author', 'num_reviewers' and 'num_iterations'.
pull_requests = azure_repos.load_data(data_file_location)

In [None]:
# Preview the first few rows as a quick sanity check of the load
pull_requests.head()

In [None]:
# Inspect the type of each column.
# NOTE(review): later cells compare 'ttl' against datetime.timedelta values and
# call .date() on 'merged_time' values, so those columns are presumably
# timedelta- and datetime-typed -- confirm in this output.
pull_requests.dtypes

## Exploratory data analysis

### What's in the data?

In [None]:
# What range of data do we have?
# The span is measured from the earliest to the latest merge time.
merged_times = pull_requests['merged_time']
first_merge, last_merge = merged_times.min(), merged_times.max()
print(f"Data goes from {first_merge.date()} to {last_merge.date()}.")

In [None]:
# Is any data missing?
# Yields one boolean per column: True when that column contains at least one
# missing (NA) value.
pull_requests.isna().any()

In [None]:
# Breakdown by author
# Summary statistics of 'ttl' for each author; only the first rows are shown.
ttl_stats_by_author = pull_requests.groupby('author')['ttl'].describe()
ttl_stats_by_author.head()

In [None]:
# Who completed the most PRs?
# Count the rows recorded for each author, then keep the five largest counts.
ttl_by_author = pull_requests.groupby('author')['ttl']
prs_per_author = ttl_by_author.size()
prs_per_author.nlargest(n=5)

### How is the data distributed?

In [None]:
# Breakdown of all PR completion times
# Report the quartiles and the 95th percentile alongside the default statistics.
reported_percentiles = [.25, .5, .75, .95]
pull_requests['ttl'].describe(percentiles=reported_percentiles)

In [None]:
# Breakdown of PRs completed in under an hour
# Select with a boolean mask: keep only completion times strictly below 1 hour.
one_hour = datetime.timedelta(hours=1)
completion_times = pull_requests['ttl']
completion_times[completion_times < one_hour].describe()

In [None]:
# Breakdown of PRs completed in over 5 days
# Select with a boolean mask: keep only completion times strictly above 5 days.
five_days = datetime.timedelta(days=5)
completion_times = pull_requests['ttl']
completion_times[completion_times > five_days].describe()

In [None]:
# Histogram of completion time, zoomed in on the first 10 days
plt.figure()

# Convert each completion time with the project helper before binning; the
# tick spacing below (multiples of 24) treats the converted values as hours.
ttl_hours = pull_requests['ttl'].apply(time_helpers.timedelta_to_hours)
ttl_hours.plot.hist(bins=100)

# The tick labels below are expressed in days, so the axis title carries no
# "(hours)" unit; this also matches the title used by the other plots.
plt.xlabel('Time to complete PR')
plt.xlim([0, 10 * 24])

# One tick per whole day, labelled "0 days", "1 day", "2 days", ...
ticks = np.arange(0, 10 * 24, step=24)
labels = [
    f"{day} day{'' if day == 1 else 's'}"
    for day in ticks // 24
]
plt.xticks(ticks, labels, rotation=90)

plt.show()

In [None]:
# Histogram buckets of 1 hour, up through `days` days (5 by default)
plt.figure()

days = 5
one_hour = datetime.timedelta(hours=1)
cutoff = datetime.timedelta(days=days)

# One-hour buckets starting at zero and extending one bucket past the cutoff.
intervals = pd.interval_range(
    start=datetime.timedelta(0),
    end=cutoff + one_hour,
    freq=one_hour)
# Only PRs completed strictly before the cutoff are binned.
ttl_under_days = pull_requests['ttl'][lambda x: x < cutoff]
ttl_under_days_bins = pd.cut(ttl_under_days, bins=intervals)
# sort=False keeps the counts in bucket order rather than by frequency.
ttl_under_days_hist = ttl_under_days_bins.value_counts(sort=False)
ttl_under_days_hist.plot.bar(width=1)

plt.xlabel('Time to complete PR')
# A tick every 8 buckets (8 hours). The range is derived from `days` so the
# ticks follow the cutoff chosen above instead of a separately hard-coded 5.
ticks = np.arange(0, days * 24, step=8)
labels = [
    f"{i // 24} day{'s' if i // 24 != 1 else ''}"
    if i % 24 == 0
    else f"{i} hours"
    for i in ticks
]
plt.xticks(ticks, labels)

plt.ylabel('# PRs completed')

plt.title('Distribution of PR completion times')

plt.show()

In [None]:
# What's the mode?
# mode() returns every most-frequent one-hour bucket (there can be ties);
# [0] picks the first of them.
ttl_under_days_bins.mode()[0]

In [None]:
# What does this distribution look like?
# Illustrative curve only (no PR data is used here): an exponential decay
# multiplied by a sinusoidal modulation.
plt.figure()

amplitude = 0.5
frequency = 0.4
angular_frequency = 2 * np.pi * frequency
phase_angle = np.pi / 2
decay_constant = 0.5

xs = np.arange(0.01, 11, 0.01)
envelope = np.e ** (-decay_constant * xs)
oscillation = 1 + amplitude * np.sin(angular_frequency * xs + phase_angle)
ys = envelope * oscillation

plt.plot(xs, ys)
# Baseline at y = 0, drawn with black '_' markers
plt.plot(xs, np.zeros_like(xs), '_k')
plt.xlim([0, 10])
plt.ylim([-2, 2])

plt.title('Oscillating decay')

plt.show()

In [None]:
# We can use the distribution to assign crude probabilities to completion time
# Bucket every PR into one-hour intervals covering the whole observed range.
intervals = pd.interval_range(
    start=datetime.timedelta(0),
    end=pull_requests['ttl'].max() + datetime.timedelta(hours=1),
    freq=datetime.timedelta(hours=1))

# Name the counts column 'ttl' explicitly. pandas >= 2.0 names the result of
# value_counts() 'count' (older versions reused the source Series' name), so
# relying on the implicit name makes the ['ttl'] lookup below raise KeyError.
# sort=False keeps the rows in interval order, which the cumulative sum needs.
intervals_to_num_completed = (
    pd.cut(pull_requests['ttl'], bins=intervals)
    .value_counts(sort=False)
    .rename('ttl')
    .to_frame()
)

# Running total of PRs completed by the end of each interval, and that total
# as a fraction of all PRs in the data set.
intervals_to_num_completed['cumulative'] = intervals_to_num_completed['ttl'].cumsum()
intervals_to_num_completed['cumulative_probability'] = (
    intervals_to_num_completed['cumulative'] / pull_requests['ttl'].size
)

In [None]:
# Plot the cumulative probability table, zoomed in on the first 5 days
plt.figure()

# Retained because notebook variables are global and other cells may read it.
max_hours = time_helpers.timedelta_to_hours(pull_requests['ttl'].max())
# One bar per row of the table. Size the x positions from the table itself:
# np.arange(0, max_hours) does not always have as many elements as there are
# hourly intervals (e.g. when the maximum is a whole number of hours), and
# plt.bar() fails when x and height differ in length.
hours = np.arange(len(intervals_to_num_completed))

plt.bar(hours, intervals_to_num_completed['cumulative_probability'], width=1)

plt.xlim([0, 5 * 24])
plt.xlabel('Time to complete PR')
# A tick every 8 hours; whole days are labelled in days, the rest in hours.
ticks = np.arange(0, 5 * 24, step=8)
labels = [
    f"{i // 24} day{'s' if i // 24 != 1 else ''}"
    if i % 24 == 0
    else f"{i} hours"
    for i in ticks
]
plt.xticks(ticks, labels, rotation=90)

plt.ylabel('Probability of completed PR')

plt.title('Cumulative distribution of PR completion times')

plt.show()

### How is the data correlated?

In [None]:
# Plot completion time vs. number of reviewers
plt.figure()

# One blue dot per PR: reviewer count on x, completion time (converted with
# time_helpers.timedelta_to_hours) on y.
xs = pull_requests['num_reviewers']
ys = pull_requests['ttl'].apply(time_helpers.timedelta_to_hours)
plt.plot(xs, ys, 'bo')
plt.xlabel('Number of reviewers')

# Label the y axis in whole days and limit the view to the first 10 days.
ticks = np.arange(0, 10 * 24, step=24)
labels = [
    f"{day} day{'' if day == 1 else 's'}"
    for day in ticks // 24
]
plt.yticks(ticks, labels)
plt.ylim([0, 10 * 24])
plt.ylabel('Time to complete PR')

plt.show()

In [None]:
# Plot completion time vs. number of iterations
plt.figure()

# One red dot per PR: iteration count on x, completion time (converted with
# time_helpers.timedelta_to_hours) on y.
xs = pull_requests['num_iterations']
ys = pull_requests['ttl'].apply(time_helpers.timedelta_to_hours)
plt.plot(xs, ys, 'ro')
plt.xlabel('Number of iterations')

# Label the y axis in whole days and limit the view to the first 10 days.
ticks = np.arange(0, 10 * 24, step=24)
labels = [
    f"{day} day{'' if day == 1 else 's'}"
    for day in ticks // 24
]
plt.yticks(ticks, labels)
plt.ylim([0, 10 * 24])
plt.ylabel('Time to complete PR')

plt.show()