# insert disaster pageviews into mariadb

##### password, imports, mariadb_login

###### password

In [143]:
mysql_user = 'ubuntu'
# mysql_pass = input(f'Enter the MySQL password for user {mysql_user}: ')

###### imports

In [147]:
import os, requests, gzip, pickle, io, logging, inspect, functools
from IPython.display import clear_output
import pandas as pd, datetime as dt
import mysql.connector as mysql, sqlalchemy

###### login to mariadb

In [148]:
def connect_mariadb(host='localhost', user=mysql_user, passwd=mysql_pass, dbname='jawiki'):
    """
    connect to mariadb and return: 
        cxn, cur, engine, conn
    """
    cxn = mysql.connect(host=host,user=user,passwd=passwd, database=dbname)
    cur = cxn.cursor()

    connection_str = 'mysql+mysqlconnector://'+user+':'+passwd+'@'+host+'/'+dbname  # removed this after host +':'+dbport
    try:
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception as e:
        print('Database connection error - check creds')
        print(e)
    return cxn, cur, engine, conn

cxn, cur, engine, conn = connect_mariadb()

## get page_titles and urls

###### unpickle list of disaster pageids (diz_pageids)

In [10]:
def failed_decode(a):
    try:
        a.decode('utf-8')
    except UnicodeDecodeError:
        return True
    return False

def bytearray_to_str(a:bytearray, encoding='utf-8') -> str:
    if type(a) != bytearray:
        return a        
    while failed_decode(a):
        a = a[:-1]
    return str(a.decode(encoding))

In [11]:
with open('../data/processed/jawiki/' + 'disaster_descendants_raw.pickle', 'rb') as f:
    disaster_descendants_raw = pickle.load(f)

In [12]:
disaster_cat_page_ids = {'火山災害':2390743, '熱帯低気圧':626482, '雪害':2390774, '地震':135264, '津波':765772}  # '自然災害':137069, 
disasters_english = {'火山災害':'VolcanicDisaster', '熱帯低気圧':'TropicalCyclones', '雪害':'SnowDamage', '地震':'Earthquake', '津波':'Tsunami'}
disasters = list(disaster_cat_page_ids.keys())

In [13]:
d = {}
for i in disaster_descendants_raw:
    d[i] = (disaster_descendants_raw[i]
            .drop_duplicates(subset='id')
            .applymap(bytearray_to_str)
           )
    d[i] = d[i][d[i].namespace == 0]
    d[i]['page_title'] = d[i].name.map(lambda x: str(x).split(sep='\n')[-1])
disaster_descendants = d
del d, disaster_descendants_raw

In [14]:
disaster_descendants['火山災害'].columns

Index(['id', 'name', 'type', 'namespace', 'page_title'], dtype='object')

In [101]:
diz_pageids = [j for i in disaster_descendants for j in disaster_descendants[i].id]

###### make dict pageid_sorter:  

keys: all pageids in jawiki  
values: disaster 1, not-disaster 0

In [None]:
def make_pageid_sorter(pid_allowlist:list[int], con=conn) -> dict:
    ct = 0
    sql = "select distinct page_id from page;"
    all_pageids = pd.read_sql(sql, conn).squeeze().to_list()
    d = {i:0 for i in all_pageids}
    for i in pid_allowlist:
        d[i] = 1
    return d

In [174]:
pageid_sorter_path = '../data/processed/jawiki/pageid_sorter.pickle'

In [175]:
with open(pageid_sorter_path, 'rb') as f:
    pageid_sorter = pickle.load(f)

In [167]:
len(all_pageids)

3896661

## functions

###### function get_all_paths_in_year

In [111]:
def get_all_paths_in_year(year:int) -> list[str]:
    gen = os.walk(f'../data/temp/jawiki_pageviews/{year}/')
    filepaths = []
    for tup in gen:
        for f in tup[2]:
            filepaths.append(tup[0]+'/'+f)
    filepaths.sort()
    return filepaths

###### function get_pageview_series_by_pageids_years

In [233]:
# HAVEN'T IMPLEMENTED THE DATAFRAME WITH DIFFERENT ACCESS_METHOD FIELDS YET - HALFWAY DONE
def get_pageviews_by_pageid_years(
        pids_of_interest:list[int], years:[int]
        ) -> pd.DataFrame:
    """
    INPUTS: 
        pid_of_interest: pageid to return
        years: list of years to include
    OUTPUTS: 
        pd.DataFrame where:
            index: utc_date in yyyymmdd
            columns: 'mobile', 'desktop', 'app'
            values: encoded hourly counts (see note below)
            name: pageid
    NOTE:
        hourly counts are encoded with letters as hours, and numbers as counts
        for example, 'C2' means 2 pageviews between 02:00 and 02:59
    """
    paths = []
    for year in years:
        paths = paths + get_all_paths_in_year(year)
    pageid_sorter = make_pageid_sorter(pids_of_interest)
    diz_views = {}
    for path in paths[:20]:
        yyyymmdd = path.split('/')[-1].split('-')[1]
        with open(path) as f:
            try: 
                diz_views[yyyymmdd]
            except KeyError:
                diz_views[yyyymmdd] = {}
            while (line := f.readline()):
                line_split = line.split()
                if len(line_split) < 6:
                    continue
                pid_str = line_split[2]
                try: # ignore pageviews of redirects etc that don't have a page_id
                    pid = int(pid_str)
                except ValueError:
                    continue 
                try:
                    (ps:=pageid_sorter[pid])
                except KeyError:
                    continue
                if ps==1:
                    access_method = line_split[3]
                    hour_counts = line_split[5]
                    diz_views[yyyymmdd][pid_str+'_'+access_method] = hour_counts
        clear_output(wait=True)
        print(f"started {min(years)},\ncompleted {yyyymmdd},\ncontinuing until end of {max(years)}")
    return pd.DataFrame(diz_views).T

In [189]:
earthquake_pids = [18508, 159816, 1051, 2339185, 1516544]

In [212]:
earthquake_pids

[18508, 159816, 1051, 2339185, 1516544]

In [234]:
earthquake_pageviews = get_pageviews_by_pageid_years([1051], [2016])

started 2016,
completed 20160120,
continuing until end of 2016


In [264]:
df = earthquake_pageviews

In [341]:
AZ = string.ascii_uppercase
ja_tz_map = {i:j for i,j in zip(AZ[17:]+AZ[:17], AZ[:17]+AZ[17:])}
ja_tz_map = AZ.maketrans(ja_tz_map)
del AZ

In [342]:
a = df['1051_desktop'].str.extract('([R-Z].*$)').shift(1).fillna('')
b = df['1051_desktop'].str.extract('((?:[A-Q][0-9]*)*)').fillna('')
c = (a+b).squeeze().str.translate(ja_tz_map)
c

20160101    J3K4L4M2N5O5P8Q5R8S7T4U4V2W4X6Y4Z2
20160102                      A3B1E2F1G2K1T1W1
20160103                              A1N1W1X1
20160104                                  J1L1
20160105                                    O2
20160106                            L1N1Q1V1X1
20160107                              F1M1U1X1
20160108                              J1O1P1Z2
20160109                              M1O3U6Y1
20160110                                    R1
20160111                                    P1
20160112                          J1K1L3N2U1V1
20160113                            J4K4N2P1Q1
20160114                                L1M1P3
20160115                                  K2O1
20160116                                  K1P1
20160117                              F1L1M2Q1
20160118                                  P1S1
20160119                                    B1
20160120                              B1L1P1R1
Name: 0, dtype: object

In [176]:
with open(datapath) as f:
    print(f.readline())

ja.wikipedia ! 124376 desktop 2 L1S1



###### functions to put pageviews into SQL

In [19]:
def does_table_exist(tablename:str, dbname:str='jawiki', con=conn) -> bool:
    """
    run SQL query to look for tablename, return boolean
    """
    sql = f"""
    SELECT * 
        FROM information_schema.tables
    WHERE table_schema = '{dbname}' 
        AND table_name = '{tablename}'
    LIMIT 1;
    """
    return bool(pd.read_sql(sql, con).shape[0])

In [21]:
def create_pageviews_table(con=conn):
    """
    run SQL code to create pageviews table
    """
    
    sql = """
    CREATE TABLE pageviews (
        row_id BIGINT(20) AUTO_INCREMENT PRIMARY KEY
        ,page_id BIGINT(20)
        ,utc_date DATE
        ,utc_hourly_count TEXT
    )
    ;
    """
    conn.execute(sql)

## main program

In [None]:
def etl_pageviews_by_years_and_projects(years:list[int], 
                                        pagetitles:list[str],
                                        project:str='ja', 
                                        logfilepath:str=logfilepath, 
                                        con=conn):
    if not does_table_exist('pageviews'):
        create_pageviews_table(con)
    # start counts
    process_start_time = dt.datetime.now()
    file_count = 0
    for year in years:
        urls = get_pageviews_urls_by_year(year)
        # urls = urls[:3] # truncate if debugging 
        for url in urls:
            
            # actual work
            temp_fpath = download_file(url, dirpath='../data/temp/')
            pageviews = get_pageviews_subset_by_proj_and_pagetitles(
                            temp_fpath, pagetitles=pagetitles)
            pageviews.to_sql(name='pageviews',con=con, if_exists='append', index=False)
            
            # complete counts
            file_count += 1
            # log and cleanup
            log_forecast_of_completion(file_count, process_start_time, years)
            os.remove(temp_fpath)

---

run main program

In [None]:
etl_pageviews_by_years_and_projects([2016,2017,2018,2019,2020,2021], all_titles)

---

### probably won't need this concept

###### function tosql_utcdate_pageid_hourlycount_accessmethod

In [133]:
def populate_pageviews_utc(paths:list[str]):
    # bz2 fields: wiki_code, article_title, page_id, daily_total, hourly_counts
    # sql fields: utc_date, page_id, hourly_count, access_method
    
    # check the line starts with "ja.wikipedia"
    # split the line
    # skip record if the list is too short
    # skip record if there's no integer pageid
    # convert access_method to integer:
        # 'mobile': 1, 'desktop': 2, 'app': 3
    # insert record to sql    
    pass


## get it working

In [None]:
datapath = '../data/temp/jawiki_pageviews/2016/2016-01/pageviews-20160102-user'

In [None]:
volc_ids = disaster_descendants['火山災害'].id.tolist()

In [None]:
def getline(n):    
    with open(datapath) as f:
        for i in range(n-1):
            line = f.readline()
        print(f.readline())

In [96]:
def countlines(path):
    ct = 0
    with open(path) as f:
        while True:
            try:
                line = f.readline()
            except:
                break
    return ct

In [None]:
getline(400243)

In [None]:
title, line

In [None]:
ct_pids, ct_diz

---