# get pageviews with AWS

In [4]:
import getpass
import os

mysql_user = 'ubuntu'
# `mysql_pass` is used as a default argument by connect_mariadb(), so it must
# be defined before that cell runs. Never hardcode it: read it from the
# MYSQL_PASSWORD environment variable (works for unattended `nohup` runs) and
# fall back to a hidden interactive prompt when the variable is not set.
mysql_pass = os.environ.get('MYSQL_PASSWORD') or getpass.getpass(
    f'Enter the MySQL password for user {mysql_user}: ')

In [253]:
# Imported here because this cell sits above the notebook's main import cell;
# without it a fresh "Restart & Run All" fails with NameError on `dt`.
import datetime as dt

# Date stamp (YYYY-MM-DD) for the log filename, taken at a fixed UTC+8 offset.
# NOTE(review): presumably the author's local zone - confirm.
todays_date = dt.datetime.now(tz=dt.timezone(dt.timedelta(hours=8))).strftime('%Y-%m-%d')
# The trailing `_0` is a hard-coded suffix.
logfilepath = f'../data/logs/pageviews-log_{todays_date}_0.txt'

#### how to do jupyter with aws

- https://dataschool.com/data-modeling-101/running-jupyter-notebook-on-an-ec2-server/
    - except the filename `jupyter_notebook_config.py_` given there should be `jupyter_notebook_config.py` (no trailing underscore)
- https://gist.github.com/J535D165/0e840291e7b2598ec157e13e9b9ca569
    - trying this for how to use nohup
- some medium [article](https://medium.com/@christinakouride/a-beginners-guide-to-running-jupyter-notebook-on-amazon-ec2-69e1b74e73cc#:~:text=Your%20Jupyter%20Notebook%20server%20will,for%20time%20not%20using%20it.)
    - they pointed out that notebook will keep running, but didn't mention nohup

In [318]:
import os, requests, gzip, pickle, io, logging, inspect, functools
import pandas as pd, datetime as dt
import mysql.connector as mysql, sqlalchemy

In [2]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

3.9.7


## setup logging

In [287]:
# FileHandler does not create missing directories, so make sure the log
# directory exists before opening the file.
_log_dir = os.path.dirname(logfilepath)
if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)

logger = logging.getLogger('1.22-sfb-get-pageviews-with-AWS')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise stack duplicate handlers and write every message
# several times. Drop (and close) any handlers left from a previous run.
for _old_handler in list(logger.handlers):
    logger.removeHandler(_old_handler)
    _old_handler.close()
# create file handler which logs even debug messages
fh = logging.FileHandler(logfilepath)
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

# USAGE:
# logger.info('foobarbazquxquuxquuzcorgegraultgarplywaldofredplughxyzzythud')

# https://docs.python.org/3/howto/logging-cookbook.html

In [319]:
def log_args(func):
    """
    Decorator that logs the start and finish of every call to `func`.

    Both log lines (level INFO, written to the module-level `logger`) contain
    the function's module, its qualified name, and each bound argument as
    `name = repr(value)`. The "finish" line is written even when `func` raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Bind the call to the signature so positional arguments get names too.
        bound = inspect.signature(func).bind(*args, **kwargs)
        func_args_str = ", ".join(
            f"{name} = {value!r}" for name, value in bound.arguments.items()
        )
        logger.info(f"start {func.__module__}.{func.__qualname__} ( {func_args_str} )")
        try:
            result = func(*args, **kwargs)
        finally:
            logger.info(f"finish {func.__module__}.{func.__qualname__} ( {func_args_str} )\n")
        return result
    return wrapper

In [327]:
def log_simply(func):
    """
    Decorator that logs only the start and finish of every call to `func`.

    Unlike `log_args`, the log lines contain just the function's module and
    qualified name, never its arguments, so it is suitable for functions that
    receive values which must not end up in a log file (e.g. passwords).
    The "finish" line is written even when `func` raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # No signature binding here: the arguments are deliberately not logged,
        # and skipping it also lets this wrap callables whose signature cannot
        # be introspected.
        logger.info(f"start {func.__module__}.{func.__qualname__}")
        try:
            return func(*args, **kwargs)
        finally:
            logger.info(f"finish {func.__module__}.{func.__qualname__}\n")
    return wrapper

##### login to mariadb

In [330]:
@log_simply
def connect_mariadb(host='localhost', user=mysql_user, passwd=mysql_pass, dbname='jawiki'):
    """
    Connect to MariaDB twice: once through mysql.connector and once through
    SQLAlchemy.

    INPUT:
        host, user, passwd, dbname: connection parameters
    OUTPUT: tuple of
        cxn    - mysql.connector connection
        cur    - cursor opened on cxn
        engine - SQLAlchemy engine (mysql+mysqlconnector driver)
        conn   - SQLAlchemy connection opened from engine
    RAISES:
        whatever mysql.connector / SQLAlchemy raise when the connection fails.
        If the SQLAlchemy side fails, the mysql.connector connection opened
        first is closed before the error is re-raised.
    """
    from urllib.parse import quote  # function-scope import: only needed here

    cxn = mysql.connect(host=host,user=user,passwd=passwd, database=dbname)
    cur = cxn.cursor()

    # Percent-encode user and password so characters such as '@', ':' or '/'
    # cannot break the URL apart.
    connection_str = (
        f"mysql+mysqlconnector://{quote(user, safe='')}:{quote(passwd, safe='')}"
        f"@{host}/{dbname}"
    )
    try:
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception:
        # Previously this printed and then fell through to the `return`, which
        # failed with UnboundLocalError and hid the real cause.
        logger.exception('Database connection error - check creds')
        cur.close()
        cxn.close()
        raise
    return cxn, cur, engine, conn

# Open the module-level database handles; `conn` is used as the default `con`
# argument of the helper functions defined below.
cxn, cur, engine, conn = connect_mariadb()

## get page_titles and urls

## functions

###### function get_pageviews_urls_by_year

In [344]:
@log_args
def get_pageviews_urls_by_year(year:int) -> list[str]:
    """
    Programmatically generate the urls of the pageviews hourfiles for one year.
    INPUT: year as integer
    OUTPUT: list of urls to all the hourfiles for that year, in chronological
        order (24 per day, Jan 1 through Dec 31)
    NOTE: filenames are formatted as: 
        'pageviews-20210101-000000.gz', i.e.
        'pageviews-yyyymmdd-hhmmss.gz', although 'mmss' is always '0000'    
    """
    # pd.date_range includes BOTH endpoints, so the range must stop at Dec 31.
    # Ending at Jan 1 of year+1 produced 24 extra urls belonging to the next
    # year (and duplicates when several consecutive years are processed).
    dates = pd.date_range(dt.datetime(year, 1, 1),
                          end=dt.datetime(year, 12, 31),
                          freq='D')
    base_url = 'https://dumps.wikimedia.org/other/pageviews/'
    return [
        f'{base_url}{d.year}/{d.year}-{d.month:02d}'
        f'/pageviews-{d.year}{d.month:02d}{d.day:02d}-{hour:02d}0000.gz'
        for d in dates
        for hour in range(24)   # '00' through '23'
    ]

##### function download_file to temporarily store hourfiles

In [321]:
@log_args
def download_file(url, dirpath='./', timeout=(10, 120)):
    """
    Stream the file at `url` into the local directory `dirpath`.

    INPUT:
        url: address of the file; the local filename is the last path segment
        dirpath: target directory, created if it does not exist yet
        timeout: passed to requests as (connect, read) timeout in seconds, so
            a stalled connection raises instead of hanging forever; pass None
            to wait indefinitely
    OUTPUT: path of the downloaded file
    RAISES: requests exceptions on HTTP errors / timeouts, OSError on write errors
    """
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    # os.path.join copes with `dirpath` given with or without a trailing slash.
    local_filepath = os.path.join(dirpath, url.split('/')[-1])
    # stream=True keeps the body out of memory; it is written chunk by chunk.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filepath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=131072):   #8192
                f.write(chunk)
    return local_filepath
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests

###### function get_pageviews_subset_by_project to get all jawiki pageviews from an hourfile

In [322]:
@log_args
def open_hour_file(path:str):
    """
    Open an hourfile for reading as UTF-8 text.

    INPUT: path to a file, normally ending in .gz
    OUTPUT: open text-mode file handle (undecodable bytes are replaced)
    USAGE:
        with open_hour_file(foo) as bar:
            baz
    BEHAVIOR: paths ending in ".gz" are decompressed transparently; any other
        path is opened as a plain text file
    """
    is_gzipped = path[-3:] == ".gz"
    opener = gzip.open if is_gzipped else open
    return opener(path, mode="rt", encoding="utf-8", errors="replace")
# https://github.com/mediawiki-utilities/python-mwviews/blob/main/src/mwviews/utilities/aggregate.py
# https://stackoverflow.com/questions/30582162/creating-a-missing-directory-file-structure-python

In [323]:
@log_args
def parse_utc_time_from_filename(filename:str) -> dt.datetime:
    """
    Turn an hourfile name into the timezone-aware UTC datetime it stands for.

    filenames are formatted as: 
        'pageviews-20210101-000000.gz', i.e.
        'pageviews-yyyymmdd-hhmmss.gz', although 'mmss' is always '0000'    
    Only the date and the hour are read; minutes and seconds are ignored.
    """
    # Everything after the first '-' must be exactly two parts: date and time.
    date_part, time_part = filename.split('-')[1:]
    year = int(date_part[:4])
    month = int(date_part[4:6])
    day = int(date_part[6:])
    hour = int(time_part[:2])
    return dt.datetime(year, month, day, hour, tzinfo=dt.timezone.utc)

In [324]:
@log_args
def get_pageviews_subset_by_project(fpath_in, proj:str='ja') -> pd.DataFrame:
    """
    Extract the pageview records of one project from an hourfile.

    INPUT:
        fpath_in: local filepath to pageviews records as text file (.gz or plain)
        proj: mediawiki project code, matched exactly against the first
            whitespace-separated field of each line (any length, e.g. 'ja' or 'ja.m')
    OUTPUT:
        dataframe ready for database ingestion, with columns
            domain_code, page_title, count_views, total_response_size,
            datetime_viewed_UTC (parsed from the filename),
            datetime_added_UTC (time this function ran)
        An hourfile without any matching record yields an empty dataframe.
    """
    import csv  # function-scope import: only needed for QUOTE_NONE below

    colnames=['domain_code','page_title','count_views','total_response_size']
    # subset pageview records by project as list of strings.
    # startswith() works for project codes of any length; the former fixed
    # 3-character slice only matched two-letter codes.
    prefix = proj + ' '
    with open_hour_file(fpath_in) as f_in:
        lst = [line for line in f_in if line.startswith(prefix)]
    # turn list of strings into dataframe
    if lst:
        df = pd.read_csv(
            io.StringIO(''.join(lst))      # lines already end with '\n'
            ,sep=r'\s+'                    # replaces deprecated delim_whitespace=True
            ,names=colnames
            ,quoting=csv.QUOTE_NONE        # a '"' in a page title is data, not a quote
            ,keep_default_na=False         # titles such as 'NA' or 'null' must stay text
            ,na_values={'count_views': [''], 'total_response_size': ['']}
            ,dtype={'domain_code': str, 'page_title': str})
    else:
        # read_csv raises EmptyDataError on empty input
        df = pd.DataFrame(columns=colnames)
    # add datetime_viewed_UTC as column based on filename
    parsed_datetime = parse_utc_time_from_filename(fpath_in.split(sep='/')[-1])
    df = df.assign(datetime_viewed_UTC=parsed_datetime)
    # add datetime_added_UTC as the time when this function runs
    now_datetime = dt.datetime.now(tz=dt.timezone.utc)
    df = df.assign(datetime_added_UTC=now_datetime)
    return df
    

###### function etl_pageviews_by_years_and_projects

In [325]:
@log_args
def does_table_exist(tablename:str, dbname:str='jawiki', con=conn) -> bool:
    """
    Report whether `tablename` is listed in information_schema.tables for the
    schema `dbname`.

    INPUT:
        tablename: name of the table to look for
        dbname: schema (database) name to look in
        con: connection handed to pandas.read_sql
    OUTPUT: True when the lookup query returns at least one row, else False
    """
    sql = f"""
    SELECT * 
        FROM information_schema.tables
    WHERE table_schema = '{dbname}' 
        AND table_name = '{tablename}'
    LIMIT 1;
    """
    matching_tables = pd.read_sql(sql, con)
    return len(matching_tables.index) > 0

In [326]:
@log_args
def create_pageviews_table(con=conn):
    """
    run SQL code to create the pageviews table

    INPUT:
        con: SQLAlchemy connection the CREATE TABLE statement is executed on
    COLUMNS: row_id (auto-increment primary key), domain_code, page_title,
        count_views, total_response_size, datetime_viewed_UTC,
        datetime_added_UTC (defaults to the insertion time)
    """
    
    sql = """
    CREATE TABLE pageviews (
        row_id BIGINT(20) AUTO_INCREMENT PRIMARY KEY
        ,domain_code TEXT
        ,page_title TEXT
        ,count_views BIGINT(20)
        ,total_response_size BIGINT(20)
        ,datetime_viewed_UTC TIMESTAMP DEFAULT 0
        ,datetime_added_UTC TIMESTAMP DEFAULT CURRENT_TIMESTAMP 
    )
    ;
    """
    # Execute on the connection that was passed in; this previously ignored
    # `con` and always used the global `conn`. Wrapping the statement in
    # sqlalchemy.text() keeps it working on SQLAlchemy versions that reject
    # raw SQL strings.
    con.execute(sqlalchemy.text(sql))

###### calculate finish time

In [363]:
def log_forecast_of_completion(count, process_start_time, years):
    """
    Log elapsed time, fraction done and a linear estimate of the finish time.

    INPUT:
        count: number of hourfiles processed so far
        process_start_time: datetime at which processing started
            (naive or timezone-aware)
        years: the years being processed; only its length is used
    NOTE: the total amount of work is approximated as 365.25*24 hourfiles per
        year. While nothing has been processed yet (count == 0 or no years)
        no estimate is possible and the finish time is logged as 'unknown'.
    """
    hours_in_year = 365.25*24
    num_years = len(years)
    frac_done = count / hours_in_year / num_years if num_years else 0.0
    # Use the same tz-awareness as process_start_time so the subtraction below
    # works for both naive and timezone-aware start times.
    time_now = dt.datetime.now(tz=process_start_time.tzinfo)
    process_time_so_far = time_now - process_start_time
    if frac_done > 0:
        finish_time = process_start_time + (process_time_so_far / frac_done)
    else:
        # avoid dividing by zero before the first file is done
        finish_time = 'unknown'
    logger.info(
        f'-----------\n'
        f'PROCESS TIME SO FAR:  {process_time_so_far}\n'
        f'FRACTION DONE:        {frac_done}\n'
        f'EXPECTED FINISH TIME: {finish_time}\n'
        '-----------\n'
    )

## main program

In [364]:
@log_args
def etl_pageviews_by_years_and_projects(years:list[int], project:str='ja', logfilepath:str=logfilepath, con=conn, max_files_per_year=3):
    """
    Download every pageviews hourfile of the given years, keep the records of
    one project, and append them to the `pageviews` table.

    INPUT:
        years: years to process
        project: mediawiki project code whose records are kept
        logfilepath: currently unused; kept so existing calls keep working
        con: SQLAlchemy connection used for the table check, creation and inserts
        max_files_per_year: process only the first N hourfiles of each year.
            The default of 3 keeps the former hard-coded debugging truncation;
            pass None to process whole years.
    SIDE EFFECTS: creates the `pageviews` table if missing, writes each
        hourfile to ../data/temp/ and deletes it again (also on failure),
        logs a completion forecast after every file.
    """
    if not does_table_exist('pageviews', con=con):
        create_pageviews_table(con)
    process_start_time = dt.datetime.now()
    # Counts files across ALL years; it used to be reset to 0 inside the inner
    # loop, so the forecast always saw a count of 1.
    file_count = 0
    for year in years:
        urls = get_pageviews_urls_by_year(year)
        if max_files_per_year is not None:
            urls = urls[:max_files_per_year]
        for url in urls:
            temp_fpath = download_file(url, dirpath='../data/temp/')
            try:
                # forward `project`; it was previously accepted but ignored
                pageviews = get_pageviews_subset_by_project(temp_fpath, proj=project)
                pageviews.to_sql(name='pageviews',con=con, if_exists='append', index=False)
            finally:
                # always clean up the temporary hourfile, even if parsing or
                # the insert failed
                os.remove(temp_fpath)

            file_count += 1
            log_forecast_of_completion(file_count, process_start_time, years)

In [365]:
# Run the ETL for the year 2016.
etl_pageviews_by_years_and_projects([2016])

In [366]:
# Sanity check: number of rows now in the pageviews table.
pd.read_sql('select count(*) from pageviews;', con=conn)

Unnamed: 0,count(*)
0,583783


In [368]:
# Show the column definitions of the pageviews table.
pd.read_sql('desc pageviews;', con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,row_id,bigint(20),NO,PRI,,auto_increment
1,domain_code,text,YES,,,
2,page_title,text,YES,,,
3,count_views,bigint(20),YES,,,
4,total_response_size,bigint(20),YES,,,
5,datetime_viewed_UTC,timestamp,NO,,0000-00-00 00:00:00,
6,datetime_added_UTC,timestamp,NO,,current_timestamp(),


In [369]:
# Peek at five rows with the most recent datetime_viewed_UTC.
pd.read_sql('select * from pageviews order by datetime_viewed_UTC desc limit 5;', con=conn, index_col='row_id')

Unnamed: 0_level_0,domain_code,page_title,count_views,total_response_size,datetime_viewed_UTC,datetime_added_UTC
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
458752,ja,やまなみトンネル,1,0,2016-01-01 02:00:00,2022-03-20 12:02:33
524288,ja,地の塩、世の光,1,0,2016-01-01 02:00:00,2022-03-20 12:02:33
459008,ja,ゎ,1,0,2016-01-01 02:00:00,2022-03-20 12:02:33
524544,ja,坂井三郎,3,0,2016-01-01 02:00:00,2022-03-20 12:02:33
459264,ja,アイリスオーヤマ,3,0,2016-01-01 02:00:00,2022-03-20 12:02:33


In [371]:
# nohup jupyter notebook &

In [372]:
# !jupyter nbconvert --to script '1.22-sfb-get-pageviews-with-AWS-lambda.ipynb'

##### test running cell while laptop sleeping

###### idea

- consider the case where:
    - i start the following cell's script as follows:
        - on AWS from a ssh terminal 
        - via nohup jupyter on computer A
    - then i disconnect the ssh from computer A
        - so that the cell is still running in the instance
- then i find the result that:
    - i can reopen jupyter from another ssh terminal
    - i can interrupt the script
    - but i can't see its in-process results
- strategy for now:
    - develop the script on whatever computers
    - then run it from a browser tab in my android phone
        - that way i can keep "tabs" on it (😉)

###### function ```subset_pageviews_by_page_titles``` to clean and filter pageview records

###### function ```subset_pageviews_by_project``` to clean and filter pageview records

###### unpickle page list

###### FOUND OUT THAT THERE'RE SOME UNRELATED COLUMNS IN THE CATEGORIES:
NEED TO CLEAN THE CATEGORIES