# get pageviews

### intro

#### setup

###### imports

In [1]:
import pandas as pd, numpy as np, os, re, pickle, requests, time, gzip
import mysql.connector as mysql, sqlalchemy
from datetime import datetime as dt, timedelta
import datetime

###### fonts

https://github.com/linzino7/matplotlibChinesefix

In [2]:
# Ask matplotlib which font file it resolves for the default font family.
from matplotlib.font_manager import findfont, FontProperties  
findfont(FontProperties(family=FontProperties().get_family())) 

'/home/bhrdwj/anaconda3/envs/py39/lib/python3.9/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'

In [3]:
# Show the path of the matplotlibrc configuration file currently in use.
import matplotlib 
matplotlib.matplotlib_fname()

'/home/bhrdwj/anaconda3/envs/py39/lib/python3.9/site-packages/matplotlib/mpl-data/matplotlibrc'

In [4]:
import matplotlib.pyplot as plt
# Point the sans-serif family at a CJK font and turn off the Unicode minus
# sign for negative tick labels.
plt.rcParams.update({
    'font.sans-serif': ['Noto Sans CJK JP'],
    'axes.unicode_minus': False,
})

###### paths

In [2]:
# Data directories, written as relative paths.
# Each one ends with '/' because file names are appended to it by plain
# string concatenation further down.
rawdumps_path = '../data/raw/jawiki/dumps_unzipped/'
processed_path = '../data/processed/jawiki/'
twitter_path = '../data/raw/twitter/'
pageviews_path = '../data/raw/pageviews/'

###### end

#### unpickle

##### disaster_descendants from disaster_descendants_raw

###### unpickle ```disaster_descendants_raw```

In [3]:
# Load the pickled `disaster_descendants_raw` object from the processed-data folder.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open pickle files that this project produced itself.
with open(processed_path + 'disaster_descendants_raw.pickle', 'rb') as f:
    disaster_descendants_raw = pickle.load(f)

###### function bytearray_to_str

In [4]:
def failed_decode(a, encoding='utf-8'):
    """Report whether ``a`` cannot be decoded with ``encoding``.

    Parameters
    ----------
    a : bytes-like object exposing ``.decode`` (e.g. ``bytes``, ``bytearray``)
    encoding : str, default 'utf-8'
        Codec to try. The default keeps the previous utf-8-only behaviour.

    Returns
    -------
    bool
        True when decoding raises ``UnicodeDecodeError``, False otherwise.
    """
    try:
        a.decode(encoding)
    except UnicodeDecodeError:
        return True
    return False

def bytearray_to_str(a:bytearray, encoding='utf-8') -> str:
    """Decode a possibly truncated bytearray, dropping undecodable trailing bytes.

    Strings are returned unchanged. For a bytearray, bytes are removed from
    the end one at a time until the remainder decodes with ``encoding``.
    The same codec is used for probing and for the final decode, so a
    non-default ``encoding`` cannot fail after the trimming step.

    Parameters
    ----------
    a : str or bytearray
    encoding : str, default 'utf-8'

    Returns
    -------
    str

    Raises
    ------
    ValueError
        If ``a`` is neither a string nor a bytearray.
    """
    if isinstance(a, str):
        return a
    if not isinstance(a, bytearray):
        raise ValueError("The passed argument wasn't a string nor a bytearray.")
    # An empty bytearray always decodes (to ''), so the loop terminates.
    while True:
        try:
            return str(a.decode(encoding))
        except UnicodeDecodeError:
            a = a[:-1]

###### get cleaned ```disaster_descendants``` 

by:  
- dropping duplicate rows that share the same page `id`
- filtering out non-content-page records
- and healing corrupted bytearrays

In [5]:
# Build one cleaned frame per key of `disaster_descendants_raw`:
#   1. keep one row per `id`,
#   2. keep only rows whose `namespace` is 0,
#   3. convert every `name` to str with `bytearray_to_str`.
# `.assign` returns a new frame, so nothing is written into a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
disaster_descendants = {
    key: (
        disaster_descendants_raw[key]
        .drop_duplicates(subset='id')
        .loc[lambda frame: frame['namespace'] == 0]
        .assign(name=lambda frame: frame['name'].map(bytearray_to_str))
    )
    for key in disaster_descendants_raw
}

##### -----

#### references

###### links

- https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageviews#dumps.wikimedia.org
- https://meta.wikimedia.org/wiki/Research:Page_view
- https://dumps.wikimedia.org/other/pageviews
- https://dumps.wikimedia.org/other/pageviews/readme.html
- https://pageviews.wmcloud.org/?project=ja.wikipedia.org&platform=all-access&agent=user&redirects=0&start=2019-11-01&end=2022-02-28&pages=%E7%81%AB%E5%B1%B1%E7%81%BD%E5%AE%B3|%E7%86%B1%E5%B8%AF%E4%BD%8E%E6%B0%97%E5%9C%A7|%E5%9C%B0%E9%9C%87|%E9%9B%AA%E5%AE%B3|%E6%B4%A5%E6%B3%A2

###### -----

## import pageview hourly dumpfile

##### download dumpfile

In [6]:
def download_file(url, dirpath='./', chunk_size=131072, timeout=60):
    """Stream ``url`` into a file inside ``dirpath`` and return the local path.

    Parameters
    ----------
    url : str
        Address to download; the text after the last '/' becomes the file name.
    dirpath : str, default './'
        Target directory, with or without a trailing separator.
    chunk_size : int, default 131072
        Number of bytes requested per streamed chunk.
    timeout : float or tuple, default 60
        Passed to ``requests.get`` so a stalled server cannot block forever.

    Returns
    -------
    str
        Path of the written file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # os.path.join inserts the separator only when dirpath lacks one.
    local_filename = os.path.join(dirpath, url.split('/')[-1])
    # stream=True keeps the response body out of memory; it is written chunk by chunk.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    return local_filename
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests

##### extract pageviews from hour-file gzip archive

###### function ```open_hour_file``` to provide the unzipped file to python

In [76]:
def open_hour_file(path:str):
    """Open an hourly pageviews file as utf-8 text.

    Paths ending in ".gz" are opened through ``gzip.open``; anything else
    through the builtin ``open``. Undecodable bytes are replaced rather than
    raising.
    """
    is_gzipped = path[-3:] == ".gz"
    opener = gzip.open if is_gzipped else open
    return opener(path, mode="rt", encoding="utf-8", errors="replace")
# https://github.com/mediawiki-utilities/python-mwviews/blob/main/src/mwviews/utilities/aggregate.py

###### function ```get_pageview_records_by_disaster``` to clean, split, and filter pageview records

In [35]:
def get_pageview_records_by_disaster(f, page_titles, proj='ja'):
    """Collect the pageview rows of one project whose title is in ``page_titles``.

    Parameters
    ----------
    f : iterable of str
        Open text file (or any iterable of lines) in which each line holds
        space-separated fields, the first being the project code and the
        second the page title.
    page_titles : collection of str
        Titles to keep.
    proj : str, default 'ja'
        Project code; a line is kept only when it starts with this code
        followed by a space.

    Returns
    -------
    list of list of str
        The split fields of every matching line, in file order.
    """
    # Match the whole code plus the separator, whatever the code's length.
    prefix = proj + ' '
    # Hash-based membership test; the file can hold millions of lines.
    if isinstance(page_titles, (set, frozenset, dict)):
        wanted = page_titles
    else:
        wanted = set(page_titles)

    records = []
    for line in f:
        if not line.startswith(prefix):
            continue
        fields = line.strip().split(sep=' ')
        # Lines that carry no title field are skipped instead of raising IndexError.
        if len(fields) > 1 and fields[1] in wanted:
            records.append(fields)
    return records

###### function ```pageview_records_to_dataframe```

In [53]:
def pageview_records_to_dataframe(records):
    """Turn a mapping of pageview record lists into a mapping of DataFrames.

    For every key of ``records`` the list of four-field rows becomes a frame
    with columns domain_code, page_title, count_views and
    total_response_size. Anything from the first "#" onward is cut from
    page_title, and the two numeric columns are cast to int.
    """
    pageviews_colnames = ['domain_code', 'page_title','count_views','total_response_size']

    def _build_frame(rows):
        # One frame per record list; the chain returns a new frame at each step.
        return (
            pd.DataFrame(rows, columns=pageviews_colnames)
            .assign(page_title=lambda t: t['page_title'].map(lambda title: title.split("#")[0]))  # No anchors
            .astype({'count_views': int, 'total_response_size': int})
        )

    return {key: _build_frame(records[key]) for key in records}

###### get list of page_titles by disaster

In [73]:
# For every key of `disaster_descendants`, list the distinct page names.
# Only the text after the last newline of each stored name is kept.
page_titles = {
    disaster: (
        disaster_descendants[disaster]['name']
        .map(lambda raw_name: raw_name.split(sep='\n')[-1])
        .unique()
        .tolist()
    )
    for disaster in disaster_descendants
}

##### run

###### get pageview records as lists of lists

In [80]:
# Read the hour file once for the union of all wanted titles, then split the
# matches per disaster. Re-opening and re-scanning the whole archive for
# every disaster gives the same records at several times the cost.
hour_file = pageviews_path + 'pageviews-20191101-020000.gz'
all_titles = set().union(*page_titles.values())
with open_hour_file(hour_file) as f:
    matched_records = get_pageview_records_by_disaster(f, all_titles)

pageview_records = {}
for i in page_titles:
    wanted = set(page_titles[i])
    # Copy each row so the per-disaster lists do not share row objects.
    pageview_records[i] = [list(rec) for rec in matched_records if rec[1] in wanted]

###### turn pageview records into dataframes

In [87]:
# Build one DataFrame per disaster with the helper defined above; this cell
# previously repeated that helper's body line for line.
pgv = pageview_records_to_dataframe(pageview_records)

###### check it

In [106]:
# Show the frame stored under the last key of `pgv`, selected explicitly
# rather than through a loop variable left over from an earlier cell.
pgv[list(pgv)[-1]]

Unnamed: 0,domain_code,page_title,count_views,total_response_size
0,ja,100%…SOかもね!,1,0
1,ja,105人事件,1,0
2,ja,175R,3,0
3,ja,1883年のクラカタウの噴火,1,0
4,ja,1978年自由民主党総裁選挙,1,0
...,...,...,...,...
2953,ja,黒田清隆,6,0
2954,ja,黒田祥子,2,0
2955,ja,黛ジュン,2,0
2956,ja,齋藤内閣,1,0


###### -----

##### assess raw pageviews dumpfile for importing

###### import raw lines in jawiki project

In [25]:
# Keep only rows whose domain code is exactly `ja`. Testing just the first two
# characters would also admit any other code that merely begins with "ja"
# (for example `ja.m`). `open_hour_file` reads the file as utf-8 text, the
# same way the hour files are read elsewhere in this notebook.
with open_hour_file(pageviews_path + 'pageviews-20191002-080000') as f:
    lines = [line for line in f if line.startswith('ja ')]

###### peek

In [26]:
# Show a handful of raw lines taken from several positions in the list.
display(*(lines[k] for k in (0, 1, 2, 400, 4000, 40000)))

'ja !!! 1 0\n'

'ja "E"qual 2 0\n'

'ja "Hello,_world." 1 0\n'

'ja 10月7日 15 0\n'

'ja AC-130 2 0\n'

'ja わが心にかくも愛しき 1 0\n'

###### check line count and line lengths

In [None]:
# Number of raw lines collected into `lines`.
len(lines)

In [None]:
# Largest number of space-separated fields found on any collected line.
max(len(line.split(sep=' ')) for line in lines)

###### end

##### import to pandas

###### import

In [None]:
# Split every raw line on single spaces and load the fields into one frame.
pageviews_dumps_column_names = ['domain_code', 'page_title','count_views','total_response_size']
split_rows = [raw_line.split(sep=' ') for raw_line in lines]
pgv = (
    pd.DataFrame(split_rows, columns=pageviews_dumps_column_names)
    # Cut everything from the first "#" onward out of the title.
    .assign(page_title=lambda t: t['page_title'].map(lambda title: title.split("#")[0]))  # No anchors
    .astype({'count_views': int, 'total_response_size': int})
)

###### peek

In [None]:
# Show the rows at the same index labels that were peeked at in the raw lines.
pgv.loc[[0,1,2,400,4000,40000]]

###### -----

##### -----

### -----

## -----