In [None]:
from datetime import datetime
import load_csv
import matplotlib.pyplot as plt  # type: ignore
import numpy as np  # type: ignore
import pandas as pd  # type: ignore
import seaborn as sns  # type: ignore

# Load Data

In [None]:
# Load the 2018 statistical-area-2 table through the project's load_csv helper.
# Later cells read its "region_code", "region_name" and "area" columns.
area_data = load_csv.load_area_data_2018(
    "../../data/area/statistical-area-2-2018-generalised.csv"
)

In [None]:
# Load the telco population extract; it is joined to the area table on "region_code".
# NOTE(review): presumably the source of the "time" and "count" columns used below — confirm in load_csv.
telco_data = load_csv.load_telco_data("../../data/telco/pop_data_2020-04-01.dat")

In [None]:
# Attach each region's name and area to its telco rows (inner join on region_code).
data_merged = area_data[["region_code", "region_name", "area"]].merge(
    telco_data,
    on="region_code",
)

In [None]:
# Derive calendar features from the timestamp: the numeric day of week
# (pandas convention: Monday=0 ... Sunday=6) and the matching day name.
data_merged['day of week'] = data_merged['time'].dt.dayofweek
data_merged['day name'] = data_merged['time'].dt.day_name()

In [None]:
# True for every day except Saturday (5) and Sunday (6).
# A vectorised membership test replaces the row-by-row Python lambda; the
# result is the same boolean column, including True for missing day values.
data_merged['weekday'] = ~data_merged['day of week'].isin([5, 6])

In [None]:
# Population density: count per unit of region area.
data_merged['density'] = data_merged['count'] / data_merged['area']

In [None]:
# Calendar date as a "YYYY-MM-DD" string; used further down as the per-day grouping key.
data_merged['day'] = data_merged['time'].dt.strftime('%Y-%m-%d')

In [None]:
# Inspect the merged frame together with the derived columns.
data_merged

In [None]:
# Check column dtypes; the .dt accessors above require "time" to be datetime-like.
data_merged.dtypes

# Restrict dates

In [None]:
# Analysis window: 16 Feb 2020 (inclusive) up to 1 Mar 2020 (exclusive).
mask_dates = data_merged['time'].ge(datetime(2020, 2, 16)) & data_merged['time'].lt(
    datetime(2020, 3, 1)
)

# Day-of-week codes 0-4 are treated as weekdays, 5-6 as the weekend.
mask_weekday = data_merged['day of week'].isin([0, 1, 2, 3, 4])
mask_weekend = data_merged['day of week'].isin([5, 6])

In [None]:
# Preview the rows that fall inside the analysis window.
data_merged[mask_dates]

In [None]:
# Split the analysis window into its weekday and weekend observations.
data_weekday = data_merged.loc[mask_dates & mask_weekday]
data_weekend = data_merged.loc[mask_dates & mask_weekend]

In [None]:
# Sanity check of the split: histogram the day-of-week codes of each subset
# (weekday rows were selected with codes 0-4, weekend rows with 5-6).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the pinned version.
sns.distplot(data_weekday['day of week'], kde=False)
sns.distplot(data_weekend['day of week'], kde=False)

In [None]:
# Per-region minimum and maximum count inside the analysis window, split by
# the boolean "weekday" column (True = weekday, False = weekend).
# The aggregations are passed by name: handing pandas the builtin min/max
# callables is deprecated and emits a FutureWarning on recent releases, while
# the resulting column labels ("min" / "max") are unchanged.
table = pd.pivot_table(
    data_merged[mask_dates],
    index=["region_code", "region_name"],
    columns=["weekday"],
    values=['count'],
    aggfunc=["min", "max"],
)

table.head(20)

# Ratios of min/max counts

In [None]:
def activity_ratio(df):
    """Summarise each region's daily swing in ``count``.

    For every (region_code, region_name, day) group the minimum and maximum
    ``count`` are taken, giving a daily ``ratio`` (max / min) and ``delta``
    (max - min). Days on which any of those values is NaN or infinite (a zero
    minimum, for instance, makes the ratio infinite) are discarded, and the
    remaining days are averaged per region.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "region_code", "region_name", "day" and
        "count".

    Returns
    -------
    pandas.DataFrame
        Indexed by (region_code, region_name) with the flat columns "delta"
        and "ratio". A region with no usable day is absent from the result.
    """
    day_keys = ["region_code", "region_name", "day"]

    # Aggregations are given by name (builtin callables are deprecated as
    # pandas aggregators) and the result keeps single-level column labels, so
    # downstream merges can suffix and rename "ratio" / "delta" directly.
    daily = df.groupby(day_keys)["count"].agg(["min", "max"])
    daily["ratio"] = daily["max"] / daily["min"]
    daily["delta"] = daily["max"] - daily["min"]

    # Drop infinities and nans, one day at a time.
    # TODO: shouldn't drop all the days for a location
    daily = daily[np.isfinite(daily).all(axis=1)]

    return (
        daily.groupby(level=["region_code", "region_name"])[["delta", "ratio"]]
        .mean()
        .dropna(how="all")
    )





In [None]:
# Mean daily ratio/delta per region, computed separately for the weekday and
# weekend subsets of the analysis window (the same rows as mask_dates & mask_*).
weekday_mean_ratios = activity_ratio(data_weekday)
weekend_mean_ratios = activity_ratio(data_weekend)

In [None]:
# Ten regions with the largest mean weekday delta (ascending, largest last).
weekday_mean_ratios.sort_values('delta').tail(10)

In [None]:
# Ten regions with the largest mean weekend delta (ascending, largest last).
weekend_mean_ratios.sort_values('delta').tail(10)

In [None]:
# Overlay the weekday and weekend distributions of mean daily delta,
# using identical bins and a logarithmic count axis.
bins = np.linspace(0, 30000, 50)
shared_hist_args = {"bins": bins, "kde": False}

sns_plot = sns.distplot(
    weekday_mean_ratios['delta'],
    label="weekday",
    hist_kws={"alpha": 0.5},
    **shared_hist_args,
)
sns.distplot(
    weekend_mean_ratios['delta'],
    label="weekend",
    hist_kws={"alpha": 0.5},
    **shared_hist_args,
)

sns_plot.set_yscale("log")
sns_plot.legend()

In [None]:
# Outer-join the weekday and weekend summaries on their shared index, then
# replace the automatic _x / _y merge suffixes with readable column names.
ratios_merged = weekday_mean_ratios.merge(
    weekend_mean_ratios,
    how="outer",
    left_index=True,
    right_index=True,
).rename(
    columns={
        "ratio_x": "weekday ratio",
        "ratio_y": "weekend ratio",
        "delta_x": "weekday delta",
        "delta_y": "weekend delta",
    }
)

In [None]:
# Count missing values per column after the outer join.
ratios_merged.isnull().sum()

In [None]:
# Compare each region's mean weekday delta with its mean weekend delta.
sns.scatterplot(
    data=ratios_merged,
    x='weekday delta',
    y='weekend delta',
)

In [None]:
# Add the area table's columns for every region that has ratio data.
ratios_areas = ratios_merged.merge(area_data, how="inner", on="region_code")
ratios_areas

In [None]:
# Mean daily max/min ratio against region area, weekday vs weekend, on log-log axes.
# Both layers read from ratios_areas: it is the frame that carries "area"
# next to the renamed "weekday ratio" / "weekend ratio" columns
# (ratios_merged has no "area" column, and there is no "weekend_ratio" column).
sns_plot = sns.scatterplot(
    data=ratios_areas,
    x="area",
    y="weekday ratio",
    label="weekday",
    alpha=0.3,
)

# Draw the weekend layer on the same axes so the two clouds can be compared.
sns.scatterplot(
    data=ratios_areas,
    x="area",
    y="weekend ratio",
    label="weekend",
    alpha=0.3,
    ax=sns_plot,
)

sns_plot.set_xscale("log")
sns_plot.set_yscale("log")

sns_plot.set_ylabel("mean daily ratio")

In [None]:
# Inspect the combined weekday/weekend summary.
ratios_merged

# Diffs

In [None]:
# Change in count between consecutive samples of the same region inside the
# analysis window; the first sample of each region has no predecessor (NaN).
data_diffs = data_merged.loc[mask_dates].sort_values(by=["region_code", "time"])
data_diffs = data_diffs.assign(
    diff=data_diffs.groupby("region_code")["count"].diff()
)
data_diffs

In [None]:
def daily_diff_means(data):
    """Average the daily extremes of the sample-to-sample change per region.

    For every (region_code, region_name, day) group the sum, maximum, minimum
    and root-mean-square of ``diff`` are computed. Days on which any of those
    is NaN or infinite are discarded, and the remaining daily values are
    averaged per region.

    The function only looks at the rows it is given; callers select weekday
    or weekend rows before calling it. No notebook-level mask is applied to
    the per-day table, whose index is unrelated to the original rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "region_code", "region_name", "day" and
        "diff".

    Returns
    -------
    pandas.DataFrame
        Indexed by (region_code, region_name). The columns are the tuple
        labels ('<lambda>', 'diff') (mean daily RMS), ('max', 'diff') and
        ('min', 'diff'), kept so existing lookups by those tuples still work.
    """
    day_keys = ["region_code", "region_name", "day"]

    # Named aggregations passed as strings avoid the deprecated use of the
    # builtin sum/max/min callables as pandas aggregators.
    daily = data.groupby(day_keys)["diff"].agg(
        total="sum",
        largest="max",
        smallest="min",
        rms=lambda x: (x ** 2).mean() ** 0.5,
    )

    # The daily sum is not reported; it only takes part in this finite-value filter.
    daily = daily[np.isfinite(daily).all(axis=1)]

    table_means = (
        daily.groupby(level=["region_code", "region_name"])[
            ["rms", "largest", "smallest"]
        ]
        .mean()
        .dropna(how="all")
    )

    # Restore the flattened tuple labels used by the cells that consume this table.
    table_means.columns = pd.Index(
        [('<lambda>', 'diff'), ('max', 'diff'), ('min', 'diff')],
        tupleize_cols=False,
    )

    return table_means

In [None]:
# mask_weekday / mask_weekend are indexed like data_merged, which has more rows
# than data_diffs. Align them to data_diffs' own index explicitly instead of
# relying on pandas' implicit (and warned-about) boolean-key reindexing.
table_means_weekday = daily_diff_means(
    data_diffs[mask_weekday.reindex(data_diffs.index)]
)
table_means_weekend = daily_diff_means(
    data_diffs[mask_weekend.reindex(data_diffs.index)]
)

In [None]:
# Overlay the weekday and weekend distributions of the mean daily maximum
# step, using identical bins and a logarithmic count axis.
bins = np.linspace(0, 3000, 100)
max_step = ('max', 'diff')

sns_plot = sns.distplot(
    table_means_weekday[max_step], bins=bins, kde=False, label="weekday"
)
sns.distplot(
    table_means_weekend[max_step], bins=bins, kde=False, label="weekend"
)

sns_plot.legend()
sns_plot.set_yscale("log")

In [None]:
# Weekday summary ordered by mean daily maximum step, smallest first.
table_means_weekday.sort_values(('max', 'diff'))

In [None]:
# Spot check one region: rows named "Auckland-University" with a timestamp
# strictly after midnight on 17 Feb 2020 and strictly before midnight on 18 Feb 2020.
data_merged[
    (data_merged['region_name'] == 'Auckland-University') &
    (data_merged['time'] > datetime(2020, 2, 17)) &
    (data_merged['time'] < datetime(2020, 2, 18))
]