# insert disaster pageviews into mariadb

In [2]:
import getpass
import os

mysql_user = 'ubuntu'
# connect_mariadb() uses mysql_pass as a default argument, so the name must
# exist before that cell is run. Take it from the MYSQL_PASSWORD environment
# variable when set (unattended runs); otherwise prompt without echoing it.
mysql_pass = os.environ.get('MYSQL_PASSWORD') or getpass.getpass(
    f'Enter the MySQL password for user {mysql_user}: ')

In [91]:
import os, requests, gzip, pickle, io, logging, inspect, functools
from IPython.display import clear_output
import pandas as pd, datetime as dt
import mysql.connector as mysql, sqlalchemy

In [4]:
# Current date in the UTC+8 zone, formatted as YYYY-MM-DD; it is embedded in
# the name of the log file.
todays_date = format(dt.datetime.now(tz=dt.timezone(dt.timedelta(hours=8))), '%Y-%m-%d')
logfilepath = f'../data/logs/pageviews-log_{todays_date}_{0}.txt'

## setup logging

In [5]:
logger = logging.getLogger('1.22-sfb-get-pageviews-with-AWS')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name, so re-running
# this cell would stack another pair of handlers and write every record twice.
# Detach (and close) whatever is already attached before adding fresh handlers.
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    old_handler.close()
# create file handler which logs even debug messages
fh = logging.FileHandler(logfilepath)
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

# USAGE: https://docs.python.org/3/howto/logging-cookbook.html

In [6]:
def log_args(func):
    """
    Decorator that logs each call of ``func`` together with its arguments.

    An INFO record "start <module>.<qualname> ( ... )" is written before the
    call and a matching "finish ..." record after it (also when ``func``
    raises). The parentheses list the arguments that were actually passed,
    as ``name = repr(value)`` pairs.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # map the positional/keyword values onto the parameter names of func
        bound = inspect.signature(func).bind(*args, **kwargs)
        rendered = ", ".join(
            f"{name} = {value!r}" for name, value in bound.arguments.items()
        )
        logger.info(f"start {func.__module__}.{func.__qualname__} ( {rendered} )")
        try:
            result = func(*args, **kwargs)
        finally:
            logger.info(f"finish {func.__module__}.{func.__qualname__} ( {rendered} )\n")
        return result
    return wrapper

In [7]:
def log_simply(func):
    """
    Decorator that logs when a call of ``func`` starts and when it finishes.

    Only the qualified function name is logged, never the argument values, so
    it is suitable for functions that receive secrets or very large objects.
    The "finish" record is written even when ``func`` raises; the exception
    itself propagates unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The arguments are deliberately not inspected or repr()'d here: the
        # result was never used, and repr() of a large argument is expensive.
        logger.info(f"start {func.__module__}.{func.__qualname__}")
        try:
            return func(*args, **kwargs)
        finally:
            logger.info(f"finish {func.__module__}.{func.__qualname__}\n")
    return wrapper

In [8]:
def log_errors(func):
    """
    Decorator that writes any exception raised by ``func`` to the log.

    The exception is recorded with ``logger.exception`` (message plus
    traceback) and then re-raised unchanged, so callers still see the
    original error. Successful calls are not logged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except BaseException:
            # BaseException is what a bare ``except`` catches: log everything,
            # then hand the very same exception back to the caller
            logger.exception("There was an exception in  " + func.__name__)
            raise
        return result
    return wrapper
# https://www.blog.pythonlibrary.org/2016/06/09/python-how-to-create-an-exception-logging-decorator/

##### login to mariadb

In [9]:
@log_simply
def connect_mariadb(host='localhost', user=mysql_user, passwd=mysql_pass, dbname='jawiki'):
    """
    connect to mariadb and return: 
        cxn, cur, engine, conn

    cxn    : mysql.connector connection to `dbname` on `host`
    cur    : cursor opened on cxn
    engine : SQLAlchemy engine built from a mysql+mysqlconnector URL
    conn   : connection opened from engine

    Raises whatever mysql.connector or SQLAlchemy raise when a connection
    cannot be established. If the SQLAlchemy side fails, the already opened
    mysql.connector connection is closed before the error propagates.
    """
    from urllib.parse import quote

    cxn = mysql.connect(host=host,user=user,passwd=passwd, database=dbname)
    cur = cxn.cursor()

    # Percent-encode the password so characters such as '@', ':' or '/' cannot
    # be mistaken for URL delimiters when SQLAlchemy parses the string.
    connection_str = ('mysql+mysqlconnector://' + user + ':' + quote(passwd, safe='')
                      + '@' + host + '/' + dbname)  # removed this after host +':'+dbport
    try:
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception as e:
        print('Database connection error - check creds')
        print(e)
        # Swallowing the error here used to end in an UnboundLocalError at the
        # return statement, hiding the real cause. Release the first
        # connection and let the original exception propagate instead.
        cur.close()
        cxn.close()
        raise
    return cxn, cur, engine, conn

cxn, cur, engine, conn = connect_mariadb()

## get page_titles and urls

###### unpickle list of pageids

In [10]:
def failed_decode(a, encoding='utf-8'):
    """Return True when `a` cannot be decoded with `encoding` (UTF-8 by default)."""
    try:
        a.decode(encoding)
    except UnicodeDecodeError:
        return True
    return False

def bytearray_to_str(a:bytearray, encoding='utf-8') -> str:
    """
    Decode a bytearray to str, dropping trailing bytes that do not decode.

    Values that are not bytearrays (str, int, bytes, None, ...) are returned
    unchanged, which makes the function safe to map over mixed columns.
    Bytes are removed from the END one at a time until the remainder decodes
    with `encoding`; an empty bytearray always decodes, so the loop ends.
    """
    if not isinstance(a, bytearray):
        return a
    # validate with the same encoding that is used for the final decode
    while failed_decode(a, encoding):
        a = a[:-1]
    return a.decode(encoding)

In [11]:
# Read the pickled `disaster_descendants_raw` object back from disk.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
with open('../data/processed/jawiki/disaster_descendants_raw.pickle', mode='rb') as f:
    disaster_descendants_raw = pickle.loads(f.read())

In [12]:
# Japanese disaster category name -> page id
disaster_cat_page_ids = {
    '火山災害': 2390743,
    '熱帯低気圧': 626482,
    '雪害': 2390774,
    '地震': 135264,
    '津波': 765772,
}  # '自然災害':137069, 
# Japanese disaster category name -> English label
disasters_english = {
    '火山災害': 'VolcanicDisaster',
    '熱帯低気圧': 'TropicalCyclones',
    '雪害': 'SnowDamage',
    '地震': 'Earthquake',
    '津波': 'Tsunami',
}
# category names in the insertion order of disaster_cat_page_ids
disasters = [*disaster_cat_page_ids]

In [13]:
# Clean every raw table: drop duplicate ids, turn bytearray cells into str,
# keep only namespace 0 rows and derive `page_title` as the last line of `name`.
disaster_descendants = {}
for i in disaster_descendants_raw:
    cleaned = (disaster_descendants_raw[i]
               .drop_duplicates(subset='id')
               .applymap(bytearray_to_str)
              )
    # .copy() detaches the filtered rows from the frame they were taken from,
    # so adding a column below does not raise pandas' SettingWithCopyWarning
    cleaned = cleaned[cleaned.namespace == 0].copy()
    cleaned['page_title'] = cleaned.name.map(lambda x: str(x).split(sep='\n')[-1])
    disaster_descendants[i] = cleaned
# the raw tables are no longer needed once the cleaned ones exist
del disaster_descendants_raw

In [14]:
disaster_descendants['火山災害'].columns

Index(['id', 'name', 'type', 'namespace', 'page_title'], dtype='object')

In [15]:
# Flatten the `id` column of every disaster table into one list
# (an id that occurs in several tables appears several times).
all_pageids = [
    page_id
    for table in disaster_descendants.values()
    for page_id in table.id
]

## functions

###### function get_pageview_series_by_pageids

In [92]:
# @log_simply
def get_pageview_series_by_pageids(
        datapath:str, pid_allowlist:list[int]
        ) -> pd.Series:
    """
    Collect the encoded hourly pageview counts of selected pages from one file.

    INPUTS: 
        datapath: path to pageviews_complete extract (single project only);
            the file name must contain the date as its second '-'-separated
            part, e.g. 'pageviews-20160102-user'
        pid_allowlist: page_ids to keep (any iterable of ints, order irrelevant)
    OUTPUTS: 
        pd.Series where:
            index: page_id
            value: encoded hourly counts (see note below)
            name: date in yyyymmdd
    NOTE:
        hourly counts are encoded with letters as hours, and numbers as counts
        for example, 'A2C1' means:
            2 pageviews between 00:00 and 00:59
            1 pageview between 02:00 and 02:59
    NOTE(review):
        when a page_id occurs on several lines, only the value of the LAST
        line is kept (earlier ones are overwritten) - confirm this is intended.
    """
    # A set gives O(1) membership tests. Scanning the allowlist as a list for
    # every line of the file is what made this function take too long.
    allowed_pids = set(pid_allowlist)
    diz_views = {}
    # Only the ASCII page_id and hourly-count fields are used, so undecodable
    # bytes in a title are replaced instead of aborting the whole file.
    # (assumes the extract is UTF-8 encoded - TODO confirm)
    with open(datapath, encoding='utf-8', errors='replace') as f:
        for ct, line in enumerate(f, start=1):
            if ct%1000 == 0:
                # progress indicator: number of lines read so far
                clear_output(wait=True)
                print(ct)
            line_split = line.split()
            if len(line_split) < 3:
                continue  # blank or truncated line: there is no page_id field
            pid_str = line_split[2]
            hour_counts = line_split[-1]
            try: # ignore pageviews of redirects etc that don't have a page_id
                pid = int(pid_str)
            except ValueError:
                continue 
            if pid in allowed_pids:
                diz_views[pid] = hour_counts
    yyyymmdd = datapath.split('/')[-1].split('-')[1]
    return pd.Series(diz_views, name=yyyymmdd)

In [58]:
# Build the sorted list of all files found anywhere below the pageviews
# directory; each entry is '<directory>/<file name>'.
filepaths = sorted(
    dirpath + '/' + fname
    for dirpath, _subdirs, fnames in os.walk('../data/temp/jawiki_pageviews/')
    for fname in fnames
)

In [None]:
get_pageview_series_by_pageids(filepaths[0], all_pageids)

# THIS IS TAKING TOO LONG

In [None]:
countlines(filepaths[0])

In [64]:
datapath = filepaths[0]
pid_allowlist = all_pageids
diz_views = {}

In [65]:
f = open(datapath)

In [86]:
line = f.readline()
line

'ja.wikipedia !!!Fuck_You!!! 625023 desktop 1 O1\n'

In [87]:
line_split = line.split()
pid_str = line_split[2]
hour_counts = line_split[-1]

In [88]:
try: # ignore pageviews of redirects etc that don't have a page_id
    pid = int(pid_str)
    print(pid)
except ValueError as a:
    print(f"{type(a)} {a}")

625023


In [89]:
if pid in pid_allowlist:
    diz_views[pid] = hour_counts
    print(f"{pid}: {hour_counts}")
else:
    print("not in pid_allowlist")

not in pid_allowlist


In [None]:
yyyymmdd = datapath.split('/')[-1].split('-')[1]
diz_views = pd.Series(diz_views, name=yyyymmdd)

###### functions to put pageviews into SQL

In [19]:
@log_args
def does_table_exist(tablename:str, dbname:str='jawiki', con=conn) -> bool:
    """
    Report whether table `tablename` exists in schema `dbname`.

    Queries information_schema.tables through `con` and returns True when
    at least one matching row comes back, False otherwise.
    NOTE: both names are placed into the SQL text as-is, so pass trusted
    identifiers only.
    """
    query = f"""
    SELECT * 
        FROM information_schema.tables
    WHERE table_schema = '{dbname}' 
        AND table_name = '{tablename}'
    LIMIT 1;
    """
    matches = pd.read_sql(query, con)
    return len(matches.index) > 0

In [21]:
@log_args
def create_pageviews_table(con=conn):
    """
    run SQL code to create pageviews table

    The table has an auto-increment `row_id` primary key plus the columns
    `page_id`, `utc_date` and `utc_hourly_count`.
    con: connection the CREATE TABLE statement is executed on
         (defaults to the module-level `conn`)
    """
    
    sql = """
    CREATE TABLE pageviews (
        row_id BIGINT(20) AUTO_INCREMENT PRIMARY KEY
        ,page_id BIGINT(20)
        ,utc_date DATE
        ,utc_hourly_count TEXT
    )
    ;
    """
    # run on the connection that was passed in; the global `conn` used to be
    # called here, which silently ignored the `con` argument
    con.execute(sql)

## main program

In [None]:
@log_errors
def etl_pageviews_by_years_and_projects(years:list[int], 
                                        pagetitles:list[str],
                                        project:str='ja', 
                                        logfilepath:str=logfilepath, 
                                        con=conn):
    """
    Download pageview files year by year and append the selected rows to
    the `pageviews` SQL table.

    years      : years passed one at a time to get_pageviews_urls_by_year
    pagetitles : forwarded to get_pageviews_subset_by_proj_and_pagetitles
    project    : not used inside this function
    logfilepath: not used inside this function
    con        : connection used for the existence check, the table creation
                 and the inserts

    The `pageviews` table is created first when it does not exist yet. Each
    downloaded file is deleted again after it has been processed.
    NOTE(review): get_pageviews_urls_by_year, download_file,
    get_pageviews_subset_by_proj_and_pagetitles and log_forecast_of_completion
    are not defined in this part of the notebook - confirm they are in scope.
    """
    # check and create on the SAME connection that receives the inserts
    if not does_table_exist('pageviews', con=con):
        create_pageviews_table(con)
    # start counts
    process_start_time = dt.datetime.now()
    file_count = 0
    for year in years:
        urls = get_pageviews_urls_by_year(year)
        # urls = urls[:3] # truncate if debugging 
        for url in urls:
            
            # actual work
            temp_fpath = download_file(url, dirpath='../data/temp/')
            try:
                pageviews = get_pageviews_subset_by_proj_and_pagetitles(
                                temp_fpath, pagetitles=pagetitles)
                pageviews.to_sql(name='pageviews',con=con, if_exists='append', index=False)
            finally:
                # cleanup: remove the download even when parsing or loading
                # fails, so aborted runs do not leave files behind
                os.remove(temp_fpath)
            
            # complete counts
            file_count += 1
            # log
            log_forecast_of_completion(file_count, process_start_time, years)

---

run main program

In [None]:
etl_pageviews_by_years_and_projects([2016,2017,2018,2019,2020,2021], all_titles)

---

In [None]:
pd.read_sql('select count(*) from pageviews;', con=conn)

In [None]:
pd.read_sql('desc pageviews;', con=conn)

In [None]:
pd.read_sql('show tables;', con=conn)

---

## get it working

In [None]:
datapath = '../data/temp/jawiki_pageviews/2016/2016-01/pageviews-20160102-user'

In [None]:
volc_ids = disaster_descendants['火山災害'].id.tolist()

In [None]:
def getline(n):
    """
    Print line number `n` (counted from 1) of the file at the global `datapath`.

    The first n-1 lines are read and discarded; for n <= 1 the first line is
    printed, and past the end of the file an empty line is printed.
    """
    with open(datapath) as f:
        skipped = 0
        while skipped < n - 1:
            f.readline()
            skipped += 1
        print(f.readline())

In [96]:
def countlines(path):
    """
    Return the number of lines in the file at `path`.

    A final line without a trailing newline is counted as well; an empty
    file gives 0.
    """
    # readline() returns '' at end of file instead of raising, so the former
    # `while True` / `except` loop never ended and never incremented the
    # counter. Iterating the file object stops at EOF by itself. Binary mode
    # avoids decoding errors; lines are delimited by b'\n'.
    with open(path, 'rb') as f:
        return sum(1 for _ in f)

In [None]:
getline(400243)

In [None]:
title, line

In [None]:
ct_pids, ct_diz

---

#### how to do jupyter with aws

- https://dataschool.com/data-modeling-101/running-jupyter-notebook-on-an-ec2-server/
    - except that `jupyter_notebook_config.py_` in that guide should be `jupyter_notebook_config.py` (no trailing underscore)
- https://gist.github.com/J535D165/0e840291e7b2598ec157e13e9b9ca569
    - trying this for how to use nohup
- a Medium [article](https://medium.com/@christinakouride/a-beginners-guide-to-running-jupyter-notebook-on-amazon-ec2-69e1b74e73cc#:~:text=Your%20Jupyter%20Notebook%20server%20will,for%20time%20not%20using%20it.)
    - it points out that the notebook will keep running, but doesn't mention nohup

---

#### run this in root console
```sql
set global net_buffer_length=1000000; 
set global max_allowed_packet=1000000000;
```

---

---

---

---

---

---

---

#### run this in root console
```sql
set global net_buffer_length=1000000; 
set global max_allowed_packet=1000000000;
```

###### function ```subset_pageviews_by_page_titles``` to clean and filter pageview records

###### FOUND OUT THAT THERE ARE SOME UNRELATED COLUMNS IN THE CATEGORIES:
NEED TO CLEAN THE CATEGORIES