In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data Acquisition

In [24]:
# Get the dinosaur CSV
import utils

import pandas as pd
import json
from unidecode import unidecode

def gen_file_name(id, extension='json'): # TODO move to utils
    t = utils.ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE
    return "dino_monthy_" + id + "_" + t['start'] + t['end'] + "." + extension

url = 'https://docs.google.com/spreadsheets/d/1zfBNKsuWOFVFTOGK8qnTr2DmHkYK4mAACBKk1sHLt_k/export?format=csv'
utils.download_csv(url, 'data_raw/dinosaur.csv')

In [25]:
# Get what we need from Wikipedia


# Read the CSV
df = pd.read_csv('data_raw/dinosaur.csv', encoding='utf-8')

#access_types = ['all-access', 'desktop']
access_types = ['mobile-app', 'mobile-web', 'desktop']
for access_type in access_types:
    
    json_accum = [] 
    for _, row in df.iterrows():
        template = utils.ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE
        template['access'] = access_type

        # Get the pageviews
        views = utils.request_pageviews_per_article(
            unidecode(row['name']), 
            request_template=template)

        for month in views['items']:
            json_accum.append(month)

    file_name = "data_raw/" + gen_file_name(access_type)
    with open(file_name, 'w', encoding='utf8') as f:
        f.write(json.dumps(json_accum))


In [97]:
from datetime import datetime

def read_json(access_type):
    df = pd.read_json("data_raw/" + gen_file_name(access_type), convert_dates=False)
    df['timestamp_conv'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
    return df

# Create desktop csv
read_json('desktop').to_csv("data_clean/desktop.csv") # TODO: create single data dir

# Create mobile csv
app_df = read_json('mobile-app')
web_df = read_json('mobile-web')
pd.concat([app_df, web_df]).to_csv("data_clean/mobile.csv")

# Create the cumulative csv
def read_csv(file_name):
    return pd.read_csv(file_name).sort_values(by=['article', 'timestamp'])

df_mobile = read_csv("data_clean/mobile.csv")
df_desktop = read_csv("data_clean/desktop.csv")

df_mobile['access'] = 'mobile'

#df_combined = df_desktop.copy()
#df_combined['views'] += df_mobile['views']

#df_combined['cum_views'] = df_combined.groupby('article')['views'].cumsum()
#df_combined.to_csv(gen_file_name('cumulative', 'csv'))

### Analysis 
Maximum Average and Minimum Average - The first graph should contain time series for the articles that have the highest average page requests and the lowest average page requests for desktop access and mobile access. Your graph should have four lines (max desktop, min desktop, max mobile, min mobile).




In [104]:
df_mob = df_mobile.copy()
df_des = df_desktop.copy()

# Plot maximum and minimum averages
def get_min_max(df):
    avgs = df.groupby('article')['views'].mean().sort_values()
    return [avgs.index[0], avgs.index[len(avgs) - 1]]

mob_min_max = get_min_max(df_mob)
des_min_max = get_min_max(df_des)

df_ext = pd.concat([df_mob[df_mob['article'].isin(mob_min_max)], df_des[df_des['article'].isin(des_min_max)]]).reset_index()

print(df_ext[df_ext['article'] == 'Honghesaurus'])

import plotly.express as px

fig = px.line(
    df_ext, 
    x="timestamp_conv", 
    y="views", log_y=True,
    color=df_ext[["article", "access"]].apply(tuple, axis=1),
    title='Pageviews for the most and least popular dinosaurs')

fig.show()


      index  Unnamed: 0       project       article granularity   timestamp  \
0     45172       45172  en.wikipedia  Honghesaurus     monthly  2015070100   
1    157793       45172  en.wikipedia  Honghesaurus     monthly  2015070100   
2     45173       45173  en.wikipedia  Honghesaurus     monthly  2015080100   
3    157794       45173  en.wikipedia  Honghesaurus     monthly  2015080100   
4     45174       45174  en.wikipedia  Honghesaurus     monthly  2015090100   
..      ...         ...           ...           ...         ...         ...   
430   45254       45254  en.wikipedia  Honghesaurus     monthly  2022050100   
431   45255       45255  en.wikipedia  Honghesaurus     monthly  2022060100   
432   45256       45256  en.wikipedia  Honghesaurus     monthly  2022070100   
433   45257       45257  en.wikipedia  Honghesaurus     monthly  2022080100   
434   45258       45258  en.wikipedia  Honghesaurus     monthly  2022090100   

      access agent  views timestamp_conv  
0     mo

### Analysis
Top 10 Peak Page Views - The second graph should contain time series for the top 10 article pages by largest (peak) page views over the entire time by access type. You first find the month for each article that contains the highest (peak) page views, and then order the articles by these peak values. Your graph should contain the top 10 for desktop and top 10 for mobile access (20 lines).

In [112]:
import seaborn as sns

# Get top 10 articles by peak view
mob_top10 = df_mobile.groupby('article')['views'].max().sort_values(ascending=False).head(10).index
pos_top10 = df_desktop.groupby('article')['views'].max().sort_values(ascending=False).head(10).index

df_mobile['access'] = 'mobile'

df_tops = pd.concat([df_mobile[df_mobile['article'].isin(mob_top10)], df_desktop[df_desktop['article'].isin(pos_top10)]]).reset_index()

print(df_tops.head())

fig = px.line(
    df_tops, 
    x="timestamp_conv", y="views", log_y=True,
    color=df_tops[["article", "access"]].apply(tuple, axis=1),
    title='TODO')

fig.show()

    index  Unnamed: 0       project        article granularity   timestamp  \
0     712         712  en.wikipedia  Achelousaurus     monthly  2015070100   
1  113333         712  en.wikipedia  Achelousaurus     monthly  2015070100   
2     713         713  en.wikipedia  Achelousaurus     monthly  2015080100   
3  113334         713  en.wikipedia  Achelousaurus     monthly  2015080100   
4     714         714  en.wikipedia  Achelousaurus     monthly  2015090100   

   access agent  views timestamp_conv  
0  mobile  user      9     2015-07-01  
1  mobile  user    455     2015-07-01  
2  mobile  user      8     2015-08-01  
3  mobile  user    313     2015-08-01  
4  mobile  user      8     2015-09-01  


TypeError: incompatible index of inserted column with frame index

### Analysis

Fewest Months of Data - The third graph should show pages that have the fewest months of available data. These will all be relatively short time series, some may only have one month of data. Your graph should show the 10 articles with the fewest months of data for desktop access and the 10 articles with the fewest months of data for mobile access.