In [None]:
import collections
import json
import io
import pathlib
import requests
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Directory that the downloaded archives are unpacked into.
ROOT_DIR = pathlib.Path("..")

DATASETS_URL = "https://github.com/KAUST-Academy/python-for-data-analysis/raw/november-2022/datasets.zip"
EXAMPLES_URL = "https://github.com/KAUST-Academy/python-for-data-analysis/raw/november-2022/examples.zip"


def _download_and_extract(url, dest_dir, timeout=60):
    """Download the zip archive at ``url`` and unpack it into ``dest_dir``.

    Parameters
    ----------
    url : str
        Address of the zip archive.
    dest_dir : path-like
        Directory the archive members are extracted into.
    timeout : float
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be handed to ``zipfile`` and fail with a confusing
        ``BadZipFile``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(dest_dir)


for _url in (DATASETS_URL, EXAMPLES_URL):
    _download_and_extract(_url, ROOT_DIR)

In [None]:
# Sub-directories of ROOT_DIR used by the rest of the notebook.
DATASETS_DIR = ROOT_DIR.joinpath("datasets")
EXAMPLES_DIR = ROOT_DIR.joinpath("examples")

In [None]:
# Directory of the dataset analysed below.
DATASET_DIR = DATASETS_DIR.joinpath("bitly_usagov")

In [None]:
# File that is read line by line in the following cell.
path = DATASET_DIR.joinpath("example.txt")

In [None]:
# Parse the file as one JSON document per line.
# The encoding is pinned so the result does not depend on the platform's
# default locale, and blank lines are skipped because json.loads rejects them.
with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

In [None]:
# Naive extraction of the "tz" field from every record.
# The following cell filters on `"tz" in rec`, so presumably some records lack
# the key; the resulting KeyError is caught and reported here so that
# Restart & Run All is not aborted by this cell.
try:
    time_zones = [rec["tz"] for rec in records]
except KeyError as err:
    print(f"KeyError: {err} (at least one record has no such field)")

In [None]:
# Collect the "tz" value of every record that actually has one.
time_zones = [
    record["tz"]
    for record in records
    if "tz" in record
]
time_zones[:10]

In [None]:
def get_counts(sequence):
    """Return a plain dict mapping each item of ``sequence`` to how often it occurs."""
    tally = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        tally[item] = tally.get(item, 0) + 1
    return tally

In [None]:
from collections import defaultdict

def get_counts2(sequence):
    """Count occurrences of each item in ``sequence``; returns a ``defaultdict(int)``."""
    tally = defaultdict(int)  # a missing key starts out as int() == 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

In [None]:
counts = get_counts(time_zones)
# Only a cell's last expression is rendered, so both values are returned as a
# tuple: (count for "America/New_York", total number of time-zone entries).
counts["America/New_York"], len(time_zones)

In [None]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict`` as ``(count, key)`` pairs.

    Parameters
    ----------
    count_dict : mapping
        Maps each key to its count.
    n : int
        Number of pairs to return. Values ``<= 0`` yield an empty list.

    Returns
    -------
    list of tuple
        ``(count, key)`` pairs in ascending order, so the most frequent key is
        last. Ties on the count are ordered by key.
    """
    # Guard first: a bare ``pairs[-n:]`` with n == 0 is ``pairs[0:]``, i.e. the
    # whole list rather than nothing.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [None]:
# Ten most frequent entries of `counts`, least frequent of them first.
top_counts(counts, n=10)

In [None]:
# Same tally via the standard library; note this rebinds `counts` to a Counter.
counts = collections.Counter(time_zones)
counts.most_common(n=10)

In [None]:
# One row per parsed record; keys become columns.
frame = pd.DataFrame(data=records)

In [None]:
# info() prints its summary itself; the head of the "tz" column is the cell output.
frame.info()
frame["tz"].head(5)

In [None]:
# Frequency of each distinct value in the "tz" column.
tz_counts = frame["tz"].value_counts()
tz_counts.head(5)

In [None]:
# Label absent values "Missing" and empty strings "Unknown", then recount.
clean_tz = frame["tz"].fillna("Missing")
clean_tz.loc[clean_tz.eq("")] = "Unknown"
tz_counts = clean_tz.value_counts()
tz_counts.head()

In [None]:
# Open a new Matplotlib figure sized 10 x 4 inches.
# NOTE(review): with the inline backend a figure created in its own cell is
# presumably rendered and closed when that cell finishes, so plotting calls in
# later cells would not draw onto it -- confirm against the backend in use.
plt.figure(figsize=(10, 4))

In [None]:
subset = tz_counts.head()
#! figure,id=usa_gov_counts,title="Top time zones in the 1.usa.gov sample data"
# Create the figure in the same cell that draws on it, so the requested size
# applies no matter how the backend handles figures between cells.
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(y=subset.index, x=subset.to_numpy(), ax=ax)
ax.set(
    title="Top time zones in the 1.usa.gov sample data",
    xlabel="Number of records",
    ylabel="Time zone",
);

In [None]:
# Only a cell's last expression is rendered, so the three sample values from
# column "a" are returned together as one tuple.
(
    frame["a"][1],
    frame["a"][50],
    frame["a"][51][:50],  # long value: show only its first 50 characters
)

In [None]:
# First whitespace-separated token of every non-null entry in column "a".
results = pd.Series([x.split()[0] for x in frame["a"].dropna()])
# head(5) is not the cell's last expression, so it is printed explicitly;
# otherwise its value would be computed and thrown away.
print(results.head(5))
results.value_counts().head(8)

In [None]:
# Independent copy of the rows whose "a" value is present.
cframe = frame.dropna(subset=["a"]).copy()

In [None]:
# Tag each row by whether its "a" string contains "Windows".
cframe["os"] = np.where(
    cframe["a"].str.contains("Windows"),
    "Windows",
    "Not Windows",
)
cframe["os"].head(5)

In [None]:
# Group rows by the (tz, os) pair.
by_tz_os = cframe.groupby(by=["tz", "os"])

In [None]:
# Group sizes as a table: one row per tz, one column per os value,
# with 0 where a combination never occurs.
agg_counts = (
    by_tz_os.size()
    .unstack()
    .fillna(0)
)
agg_counts.head()

In [None]:
# Row positions that would sort agg_counts by its per-row total, ascending.
indexer = agg_counts.sum(axis="columns").argsort()
indexer.to_numpy()[:10]

In [None]:
# The last ten positions of the ascending sort order are the ten largest rows.
count_subset = agg_counts.take(indexer.iloc[-10:])
count_subset

In [None]:
# Ten largest per-row totals, computed directly.
agg_counts.sum(axis=1).nlargest(n=10)

In [None]:
# Open a new, default-sized Matplotlib figure.
# NOTE(review): depending on the backend, a figure opened in its own cell may
# be shown empty and closed before later cells plot -- confirm it is needed.
plt.figure()

In [None]:
# Reshape count_subset from wide (one column per os value) to long form with
# columns tz / os / total.
# The guard makes the cell safe to re-run: once the frame already has a "total"
# column, stacking it a second time would scramble it.
if "total" not in count_subset.columns:
    count_subset = count_subset.stack().rename("total").reset_index()
# Printed explicitly because only a cell's last expression is rendered.
print(count_subset.head(10))
sns.barplot(x="total", y="tz", hue="os", data=count_subset)

In [None]:
def norm_total(group):
    """Return ``group`` with an added ``normed_total`` column.

    ``normed_total`` is each row's ``total`` divided by the sum of ``total``
    over the whole group.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain a ``total`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``group`` plus
        ``normed_total``. The input is left untouched: changing the object that
        ``groupby(...).apply`` passes in is discouraged by pandas and can give
        surprising results.
    """
    return group.assign(normed_total=group["total"] / group["total"].sum())

# Run norm_total on each "tz" group of count_subset and combine the pieces.
# This rebinds `results`, which an earlier cell bound to a Series built from
# the first token of each frame["a"] entry.
# NOTE(review): the index layout produced by groupby(...).apply(...) has varied
# between pandas releases -- confirm "tz" is still reachable as a column (or
# index level) under the installed version before plotting.
results = count_subset.groupby("tz").apply(norm_total)

In [None]:
# Open a new, default-sized Matplotlib figure.
# NOTE(review): depending on the backend, a figure opened in its own cell may
# be shown empty and closed before later cells plot -- confirm it is needed.
plt.figure()

In [None]:
# Horizontal bars of normed_total per tz, split by os.
sns.barplot(data=results, x="normed_total", y="tz", hue="os")

In [None]:
# Same normalisation without apply: divide each total by its tz group's sum,
# which transform("sum") broadcasts back onto the original rows.
g = count_subset.groupby("tz")
results2 = count_subset["total"].div(g["total"].transform("sum"))