In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Acquisition
We download the list of relevant articles from Google Drive (a Google Sheets CSV export). Then, using that list, we query Wikipedia's web API for each article's page views at monthly granularity. No manual intervention is required. 

In [24]:
# Get the dinosaur CSV
import json
import os

import pandas as pd
from unidecode import unidecode

import utils

def gen_file_name(id, extension='json'): # TODO move to utils
    """Return the file name used for one monthly pageviews data set.

    The name has the form ``dino_monthy_<id>_<start><end>.<extension>``, where
    ``start`` and ``end`` are read from ``utils.ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE``.

    Args:
        id: Label identifying the data set (the notebook passes access types
            such as ``'desktop'``).
        extension: File extension without the leading dot.
    """
    params = utils.ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE
    pieces = ["dino_monthy_", id, "_", params['start'], params['end'], ".", extension]
    return "".join(pieces)

# The article list is a Google Sheets document; this export URL requests it as CSV.
url = 'https://docs.google.com/spreadsheets/d/1zfBNKsuWOFVFTOGK8qnTr2DmHkYK4mAACBKk1sHLt_k/export?format=csv'
# Presumably fetches `url` and saves it to the given path (see utils.download_csv);
# data/dinosaur.csv is the file the acquisition step reads back.
utils.download_csv(url, 'data/dinosaur.csv')

In [25]:
# Get what we need from Wikipedia


# Read the CSV
df = pd.read_csv('data/dinosaur.csv', encoding='utf-8')

#access_types = ['all-access', 'desktop']
access_types = ['mobile-app', 'mobile-web', 'desktop']
for access_type in access_types:
    # Build the request parameters on a copy. Writing 'access' straight into
    # utils.ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE would leave the shared template
    # modified for everything that reads it afterwards. The parameters only
    # depend on the access type, so they are built once per access type.
    template = dict(utils.ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE)
    template['access'] = access_type

    # Every monthly record of every article for this access type.
    json_accum = []
    for article_name in df['name']:
        # Get the pageviews (the article name is transliterated to ASCII first)
        views = utils.request_pageviews_per_article(
            unidecode(article_name),
            request_template=template)

        json_accum.extend(views['items'])

    # One JSON file per access type, holding a flat list of the records.
    file_name = "data/" + gen_file_name(access_type)
    with open(file_name, 'w', encoding='utf8') as f:
        json.dump(json_accum, f)


## Data Processing
Here, we convert the downloaded JSON documents into CSVs that can be loaded into pandas DataFrames. The cleaned CSVs are written to the data_clean/ directory and the JSON files to the data/ directory within this repository. There isn't much to do: according to the web service's documentation, an article has an associated view count for every supported time increment for as long as that article has existed. No manual intervention is required. 

We combine the two mobile data sets, mobile-app and mobile-web into a single dataset, mobile. 

We also render a cumulative dataset composed of all access types. 

In [148]:
from datetime import datetime

def read_json(access_type):
    """Load the downloaded pageviews JSON for one access type.

    The file is located via ``gen_file_name(access_type)`` inside ``data/``.
    Dates are not converted while reading; instead the ``timestamp`` column is
    parsed with the ``%Y%m%d%H`` format into a new ``timestamp_conv`` column.

    Returns:
        The loaded DataFrame with the extra ``timestamp_conv`` column.
    """
    path = "data/" + gen_file_name(access_type)
    frame = pd.read_json(path, convert_dates=False)
    parsed = pd.to_datetime(frame['timestamp'], format='%Y%m%d%H')
    return frame.assign(timestamp_conv=parsed)

# The cleaned CSVs go to data_clean/; create it so the cell also runs on a
# checkout where the directory does not exist yet.
os.makedirs("data_clean", exist_ok=True)  # TODO: create single data dir

# Create desktop csv
# index=False: the positional index carries no information and would come back
# as an "Unnamed: 0" column when the file is read again.
read_json('desktop').to_csv("data_clean/desktop.csv", index=False)

# Create mobile csv
app_df = read_json('mobile-app')
web_df = read_json('mobile-web')

# Stacking the two frames leaves two rows per article and month (one per
# source). Add their views together so that every article has a single mobile
# count per month; the remaining columns are taken from the first row.
mobile_stacked = pd.concat([app_df, web_df], ignore_index=True)
month_keys = ['article', 'timestamp']
how_to_merge = {col: 'first' for col in mobile_stacked.columns if col not in month_keys}
how_to_merge['views'] = 'sum'
mobile_combined = mobile_stacked.groupby(month_keys, as_index=False, sort=False).agg(how_to_merge)
# The combined rows are neither app nor web traffic any more.
mobile_combined['access'] = 'mobile'
mobile_combined.to_csv("data_clean/mobile.csv", index=False)

# Create the cumulative csv
def read_csv(file_name):
    """Read a CSV file and return its rows ordered by article, then timestamp."""
    frame = pd.read_csv(file_name)
    ordered = frame.sort_values(by=['article', 'timestamp'])
    return ordered

# Load the cleaned CSVs, each ordered by article and timestamp.
df_mobile = read_csv("data_clean/mobile.csv")
df_desktop = read_csv("data_clean/desktop.csv")

# Everything in the mobile file is reported under one access type.
df_mobile['access'] = 'mobile'

df_mobile.to_json("data/" + gen_file_name("mobile"), orient='records')

# ignore_index gives the stacked frame a unique index.
df_all = pd.concat([df_mobile, df_desktop], ignore_index=True)

# Running total of views per article and access type. Only the views column is
# accumulated: calling cumsum() on the whole grouped frame drops the article and
# access columns from the result and also sums unrelated columns (e.g. the
# timestamp), so the records could not be attributed to an article any more.
# Both inputs are ordered by article and timestamp, so the total runs forward
# in time within each group.
running_views = df_all.groupby(['article', 'access'])['views'].cumsum()
df_cumulative = df_all.assign(views=running_views)
df_cumulative.to_json("data/" + gen_file_name("cumulative"), orient='records')




### Analysis 1
Maximum Average and Minimum Average - The first graph should contain time series for the articles that have the highest average page requests and the lowest average page requests for desktop access and mobile access. Your graph should have four lines (max desktop, min desktop, max mobile, min mobile).

In [145]:
# Work on copies: the analysis below adds an avg_views column to these frames,
# and copying keeps df_mobile / df_desktop as they were loaded.
df_mob = df_mobile.copy()
df_des = df_desktop.copy()

# Plot maximum and minimum averages
def get_min_max(df):
    """Return ``[lowest, highest]``: the articles with the smallest and the
    largest mean of ``views``.

    Args:
        df: DataFrame with ``article`` and ``views`` columns.
    """
    ranked = df.groupby('article')['views'].mean().sort_values()
    lowest = ranked.index[0]
    highest = ranked.index[len(ranked) - 1]
    return [lowest, highest]

# Articles with the lowest and highest average views, per access type.
mob_min_max = get_min_max(df_mob)
des_min_max = get_min_max(df_des)

# rolling average of views
# Computed per article over a window of 4 rows; dropping the article level of
# the result lines it up with the rows of the frame it came from.
for frame in (df_des, df_mob):
    windowed = frame.groupby(['article'])['views'].rolling(4).mean()
    frame['avg_views'] = windowed.reset_index(0, drop=True)

# Keep only the extreme articles of each access type and stack them.
mob_extremes = df_mob[df_mob['article'].isin(mob_min_max)]
des_extremes = df_des[df_des['article'].isin(des_min_max)]
df_ext = pd.concat([mob_extremes, des_extremes]).reset_index()

# (article, access) pair used to tell the plotted lines apart.
df_ext['gkey'] = df_ext[["article", "access"]].apply(tuple, axis=1)

import plotly.express as px

# One line per (article, access type) pair, on a logarithmic y axis.
# avg_views is computed with rolling(4), so the axis label names a 4-month
# window (it previously said 3-month, which did not match the data).
fig = px.line(
    df_ext, 
    x="timestamp_conv", 
    y="avg_views", log_y=True,
    color="gkey",
    labels=dict(timestamp_conv="Year", avg_views="Number of Article Views (rolling 4-month average)", gkey="(Article, Accesstype)"),
    title='Pageviews for the most and least popular dinosaurs')

fig.show()


### Analysis 2
Top 10 Peak Page Views - The second graph should contain time series for the top 10 article pages by largest (peak) page views over the entire time by access type. You first find the month for each article that contains the highest (peak) page views, and then order the articles by these peak values. Your graph should contain the top 10 for desktop and top 10 for mobile access (20 lines).

In [146]:
# Get top 10 articles by peak view
# Peak = the largest views value of each article.
mob_peaks = df_mob.groupby('article')['views'].max()
des_peaks = df_des.groupby('article')['views'].max()
mob_top10 = mob_peaks.sort_values(ascending=False).head(10).index
pos_top10 = des_peaks.sort_values(ascending=False).head(10).index

# Rows of the selected articles, mobile first and desktop second.
mob_top_rows = df_mob[df_mob['article'].isin(mob_top10)]
des_top_rows = df_des[df_des['article'].isin(pos_top10)]
df_tops = pd.concat([mob_top_rows, des_top_rows]).reset_index()

# (article, access) pair used to tell the plotted lines apart.
df_tops['gkey'] = df_tops[["article", "access"]].apply(tuple, axis=1)

# One line per (article, access type) pair, on a logarithmic y axis.
# The plotted avg_views column is computed with rolling(4), so the axis label
# names a 4-month window (it previously said 3-month).
fig = px.line(
    df_tops, 
    x="timestamp_conv", y="avg_views", log_y=True,
    color="gkey",
    labels=dict(timestamp_conv="Year", avg_views="Number of Article Views (rolling 4-month average)", gkey="(Article, Accesstype)"),
    title='Top 10 Articles by Peak View for Mobile and Desktop')

# Thin, slightly transparent lines keep the many overlapping series readable.
fig.update_traces(line=dict(width=0.9), opacity=0.9)
fig.show()

### Analysis 3

Fewest Months of Data - The third graph should show pages that have the fewest months of available data. These will all be relatively short time series, some may only have one month of data. Your graph should show the 10 articles with the fewest months of data for desktop access and the 10 articles with the fewest months of data for mobile access.

In [147]:
# Number of views records per article; the 10 smallest counts are the
# articles with the least data.
mob_counts = df_mob.groupby('article')['views'].count()
des_counts = df_des.groupby('article')['views'].count()
mob_bot10 = mob_counts.sort_values(ascending=True).head(10).index
pos_bot10 = des_counts.sort_values(ascending=True).head(10).index

# Rows of the selected articles, mobile first and desktop second.
mob_bot_rows = df_mob[df_mob['article'].isin(mob_bot10)]
des_bot_rows = df_des[df_des['article'].isin(pos_bot10)]
df_bots = pd.concat([mob_bot_rows, des_bot_rows]).reset_index()
# (article, access) pair used to tell the plotted lines apart.
df_bots['gkey'] = df_bots[["article", "access"]].apply(tuple, axis=1)

# Raw monthly views for the selected articles, one line per
# (article, access type) pair. Markers are enabled on every data point.
fewest_months_labels = dict(
    timestamp_conv="Year",
    views="Number of Article Views",
    gkey="(Article, Accesstype)")

fig = px.line(
    df_bots,
    title='Articles with Fewest Months Available',
    x="timestamp_conv",
    y="views",
    color="gkey",
    log_y=True,
    markers=True,
    labels=fewest_months_labels)

fig.update_traces(opacity=0.9, line=dict(width=1))
fig.show()
