# get pageviews with AWS

In [4]:
# MariaDB account name read by connect_mariadb() further down.
mysql_user = 'ubuntu'
# NOTE(review): connect_mariadb() also reads a global `mysql_pass`, but the only assignment
# in the visible cells is the commented-out prompt below, so it has to be defined some other
# way before connecting. input() echoes what is typed; getpass.getpass() would not.
# mysql_pass = input(f'Enter the MySQL password for user {mysql_user}: ')

In [253]:
# `dt` is imported here as well because this cell sits above the notebook's main import
# cell; without it a top-to-bottom run fails with a NameError on `dt`.
import datetime as dt

# Date stamp (YYYY-MM-DD) for the log file name, taken in a fixed UTC+8 zone.
# NOTE(review): UTC+8 is presumably the author's local zone -- confirm.
todays_date = dt.datetime.now(tz=dt.timezone(dt.timedelta(hours=8))).strftime('%Y-%m-%d')
# Path of today's log file; the trailing index is fixed at 0.
logfilepath = f'../data/logs/pageviews-log_{todays_date}_0.txt'

#### how to run Jupyter on AWS

- https://dataschool.com/data-modeling-101/running-jupyter-notebook-on-an-ec2-server/
    - except that the file name it gives, `jupyter_notebook_config.py_`, should be `jupyter_notebook_config.py` (no trailing underscore)
- https://gist.github.com/J535D165/0e840291e7b2598ec157e13e9b9ca569
    - trying this for how to use nohup
- a Medium [article](https://medium.com/@christinakouride/a-beginners-guide-to-running-jupyter-notebook-on-amazon-ec2-69e1b74e73cc#:~:text=Your%20Jupyter%20Notebook%20server%20will,for%20time%20not%20using%20it.)
    - it points out that the notebook server keeps running, but doesn't mention nohup

In [278]:
import csv
import datetime as dt
import functools
import gzip
import inspect
import io
import logging
import os
import pickle
from urllib.parse import quote_plus

import mysql.connector as mysql
import pandas as pd
import requests
import sqlalchemy

In [2]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

3.9.7


## setup logging

In [287]:
# Notebook-wide logger: everything from DEBUG up goes to the log file at `logfilepath`,
# while only ERROR and above is echoed to the console. Both use the same format.
logger = logging.getLogger('1.22-sfb-get-pageviews-with-AWS')
logger.setLevel(logging.DEBUG)
# logging.getLogger hands back the same logger object on every call, so re-running this
# cell would otherwise stack a second pair of handlers and write every message twice.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
# FileHandler does not create missing directories, so make sure the log folder exists
log_dir = os.path.dirname(logfilepath)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)
# create file handler which logs even debug messages
fh = logging.FileHandler(logfilepath)
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

# USAGE:
# logger.info('foobarbazquxquuxquuzcorgegraultgarplywaldofredplughxyzzythud')

# https://docs.python.org/3/howto/logging-cookbook.html

In [302]:
def log_args(func):
    """
    Decorator that logs every call to `func` at INFO level on the notebook's `logger`.

    A "start" record is written before the call and a "finish" record after it; the
    finish record is written even when `func` raises. Both records list each parameter
    name with the repr of its effective value, including defaults the caller did not
    pass explicitly.

    INPUT: the function to wrap
    OUTPUT: a wrapper that forwards all arguments and the return value unchanged
    """
    signature = inspect.signature(func)  # resolved once here instead of on every call
    label = f"{func.__module__}.{func.__qualname__}"

    @functools.wraps(func)  # keep func's own __name__ and __doc__ instead of "wrapper"
    def wrapper(*args, **kwargs):
        bound = signature.bind(*args, **kwargs)
        bound.apply_defaults()  # so omitted parameters show up with their default values
        func_args_str = ", ".join(f"{name} = {value!r}" for name, value in bound.arguments.items())
        logger.info(f"start {label} ( {func_args_str} )")
        try:
            return func(*args, **kwargs)
        finally:
            logger.info(f"finish {label} ( {func_args_str} )\n")
    return wrapper

In [303]:
def log_simply(func):
    """
    Decorator that logs the start and finish of every call to `func`, without arguments.

    Only the function's module and qualified name are written (at INFO level, on the
    notebook's `logger`); argument values never reach the log. The "finish" record is
    written even when `func` raises.

    INPUT: the function to wrap
    OUTPUT: a wrapper that forwards all arguments and the return value unchanged
    """
    label = f"{func.__module__}.{func.__qualname__}"

    @functools.wraps(func)  # keep func's own __name__ and __doc__ instead of "wrapper"
    def wrapper(*args, **kwargs):
        logger.info(f"start {label}")
        try:
            return func(*args, **kwargs)
        finally:
            logger.info(f"finish {label}\n")
    return wrapper

##### login to mariadb

In [304]:
@log_simply
def connect_mariadb():
    """
    Open two connections to the local `jawiki` database with the notebook credentials.

    Reads the globals `mysql_user` and `mysql_pass`.

    OUTPUT: tuple (cxn, cur, engine, conn)
        cxn: mysql.connector connection
        cur: cursor opened on cxn
        engine: SQLAlchemy engine using the mysqlconnector driver
        conn: connection opened from engine
    RAISES: whatever the driver or SQLAlchemy raises when a connection cannot be made.
        A failure on the SQLAlchemy side is printed and then re-raised; before, the
        error was swallowed and the `return` line failed with an unrelated
        UnboundLocalError.
    """
    host = 'localhost'
    user = mysql_user
    passwd = mysql_pass
    dbname = 'jawiki'
    cxn = mysql.connect(host=host, user=user, passwd=passwd, database=dbname)
    cur = cxn.cursor()

    # user and password are percent-encoded so characters such as '@', ':' or '/' in
    # them cannot be misread as URL structure
    connection_str = (f'mysql+mysqlconnector://{quote_plus(user)}:{quote_plus(passwd)}'
                      f'@{host}/{dbname}')
    try:
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception as e:
        print('Database connection error - check creds')
        print(e)
        # don't leave the raw connection open when the second one could not be made
        cur.close()
        cxn.close()
        raise
    return cxn, cur, engine, conn
        
cxn, cur, engine, conn = connect_mariadb()

## get page_titles and urls

## functions

###### function get_pageviews_urls_by_year

In [305]:
@log_args
def get_pageviews_urls_by_year(year:int) -> list[str]:
    """
    Programmatically generate the urls of all pageviews hourfiles for one calendar year.
    INPUT: year as integer
    OUTPUT: list of urls to all the hourfiles for that year, in chronological order
        (24 per day, from Jan 1 hour 00 through Dec 31 hour 23)
    NOTE: filenames are formatted as:
        'pageviews-20210101-000000.gz', i.e.
        'pageviews-yyyymmdd-hhmmss.gz', although 'mmss' is always '0000'
    """
    base_url = 'https://dumps.wikimedia.org/other/pageviews/'
    # date_range includes both endpoints, so the range has to stop at Dec 31; ending on
    # Jan 1 of the next year would add 24 urls that belong to that next year
    dates = pd.date_range(dt.datetime(year, 1, 1),
                          end=dt.datetime(year, 12, 31),
                          freq='D')

    urls = [f'{base_url}{d.year}/{d.year}-{d.month:02d}'
            f'/pageviews-{d.year}{d.month:02d}{d.day:02d}-{hour:02d}0000.gz'
            for d in dates for hour in range(24)]
    return urls

##### function download_file to temporarily store hourfiles

In [306]:
@log_args
def download_file(url, dirpath='./', timeout=60):
    """
    Stream the file at `url` into the local directory `dirpath`.

    INPUT:
        url: address of the file; its last path segment becomes the local filename
        dirpath: existing directory to save into (with or without a trailing slash)
        timeout: seconds to wait for the server to connect / send data before giving up
    OUTPUT: path of the downloaded file
    RAISES: requests.HTTPError on an HTTP error status, other requests exceptions on
        connection problems or timeout, OSError if the file cannot be written.
    BEHAVIOR: the body is written to '<name>.part' and renamed only once the download
        has finished, so an interrupted download never leaves a truncated file under
        the final name.
    """
    local_filepath = os.path.join(dirpath, url.split('/')[-1])
    partial_filepath = local_filepath + '.part'
    try:
        # stream=True keeps the body out of memory; it is read chunk by chunk below
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=131072):
                    f.write(chunk)
    except BaseException:
        # also covers a kernel interrupt: remove the partial file, then re-raise
        if os.path.exists(partial_filepath):
            os.remove(partial_filepath)
        raise
    os.replace(partial_filepath, local_filepath)
    return local_filepath
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests

###### function get_pageviews_subset_by_project to get all jawiki pageviews from an hourfile

In [307]:
@log_args
def open_hour_file(path:str):
    """
    Open an hourfile for reading as UTF-8 text.

    INPUT: path to the file, normally ending in .gz
    OUTPUT: open text-mode file handle; undecodable bytes are replaced, not raised
    USAGE:
        with open_hour_file(some_path) as f_in:
            for line in f_in:
                ...
    BEHAVIOR: a path ending in '.gz' is decompressed on the fly; any other path is
        opened as a plain text file
    """
    opener = gzip.open if path.endswith(".gz") else open
    return opener(path, mode="rt", encoding="utf-8", errors="replace")
# https://github.com/mediawiki-utilities/python-mwviews/blob/main/src/mwviews/utilities/aggregate.py
# https://stackoverflow.com/questions/30582162/creating-a-missing-directory-file-structure-python

In [308]:
@log_args
def parse_utc_time_from_filename(filename:str) -> dt.datetime:
    """
    Read the UTC hour an hourfile covers out of its filename.

    INPUT: filename shaped like 'pageviews-yyyymmdd-hhmmss.gz',
        e.g. 'pageviews-20210101-000000.gz'
    OUTPUT: timezone-aware (UTC) datetime built from the date and the hour;
        the 'mmss' part of the name is ignored
    """
    # exactly three dash-separated pieces are expected: prefix, yyyymmdd, hhmmss(.gz)
    _prefix, yyyymmdd, hhmmss = filename.split('-')
    return dt.datetime(
        year=int(yyyymmdd[:4]),
        month=int(yyyymmdd[4:6]),
        day=int(yyyymmdd[6:]),
        hour=int(hhmmss[:2]),
        tzinfo=dt.timezone.utc,
    )

In [309]:
@log_args
def get_pageviews_subset_by_project(fpath_in, proj:str='ja') -> pd.DataFrame:
    """
    Extract one project's records from a pageviews hourfile.

    INPUT:
        fpath_in: local filepath to the hourfile (opened with open_hour_file); its
            filename must follow 'pageviews-yyyymmdd-hhmmss.gz'
        proj: domain code exactly as it appears in the first column, e.g. 'ja'
    OUTPUT:
        dataframe ready for database ingestion, with columns
        domain_code, page_title, count_views, total_response_size,
        datetime_viewed_UTC (hour taken from the filename) and
        datetime_added_UTC (time this function ran).
        When no line matches `proj`, an empty dataframe with the same columns.
    """
    colnames = ['domain_code','page_title','count_views','total_response_size']
    # compare the whole first field: a fixed-width slice only works for 2-letter codes
    prefix = proj + ' '
    with open_hour_file(fpath_in) as f_in:
        kept_lines = [line for line in f_in if line.startswith(prefix)]

    if kept_lines:
        df = pd.read_csv(
            io.StringIO(''.join(kept_lines))  # lines still carry their own newline
            ,sep=r'\s+'
            ,names=colnames
            # titles are arbitrary text: keep them as strings (e.g. '1984'), ...
            ,dtype={'domain_code': str, 'page_title': str}
            # ... never treat '"' as a CSV quote character, ...
            ,quoting=csv.QUOTE_NONE
            # ... and never turn titles such as 'NA' or 'null' into missing values
            ,keep_default_na=False)
    else:
        # read_csv raises EmptyDataError on empty input
        df = pd.DataFrame(columns=colnames)

    # add datetime_viewed_UTC as column based on filename
    parsed_datetime = parse_utc_time_from_filename(os.path.basename(fpath_in))
    # add datetime_added_UTC as the time when this function runs
    now_datetime = dt.datetime.now(tz=dt.timezone.utc)
    return df.assign(datetime_viewed_UTC=parsed_datetime,
                     datetime_added_UTC=now_datetime)
    

###### function etl_pageviews_by_years_and_projects

In [310]:
@log_args
def does_table_exist(tablename:str, dbname:str='jawiki', con=conn):
    """
    Check information_schema for a table named `tablename` in schema `dbname`.

    INPUT:
        tablename: name of the table to look for
        dbname: schema (database) the table should live in
        con: SQLAlchemy connection to query through; the default is the notebook's
            global `conn` as it was when this cell ran
    OUTPUT: True if information_schema.tables has a matching row, else False
    """
    # the names are passed as bound parameters rather than pasted into the SQL text
    sql = sqlalchemy.text("""
    SELECT 1
        FROM information_schema.tables
    WHERE table_schema = :dbname
        AND table_name = :tablename
    LIMIT 1
    """)
    matches = pd.read_sql(sql, con, params={'dbname': dbname, 'tablename': tablename})
    return not matches.empty

In [312]:
@log_args
def create_pageviews_table(con=conn):
    """
    Create the `pageviews` table on the given connection.

    INPUT:
        con: SQLAlchemy connection to run the statement on; the default is the
            notebook's global `conn` as it was when this cell ran
    OUTPUT: None
    NOTE: the statement is a plain CREATE TABLE, so the database rejects it if the
        table already exists -- check with does_table_exist() first.
    """
    sql = """
    CREATE TABLE pageviews (
        row_id BIGINT(20) AUTO_INCREMENT PRIMARY KEY
        ,domain_code TEXT
        ,page_title TEXT
        ,count_views BIGINT(20)
        ,total_response_size BIGINT(20)
        ,datetime_viewed_UTC TIMESTAMP DEFAULT 0
        ,datetime_added_UTC TIMESTAMP DEFAULT CURRENT_TIMESTAMP 
    )
    ;
    """
    # run on the connection that was passed in (the global `conn` used to be hard-wired
    # here, so the `con` argument had no effect); text() keeps the call valid on
    # SQLAlchemy versions that no longer accept a bare string
    con.execute(sqlalchemy.text(sql))

In [None]:
def etl_pageviews_by_years_and_projects(years:list[int], project:str, logfilepath:str, conn):
    """
    Make sure the `pageviews` table exists, then walk through the requested years.

    INPUT:
        years: calendar years to process
        project: not used yet
        logfilepath: not used yet
        conn: SQLAlchemy connection used to check for and create the table
    OUTPUT: None
    NOTE: unfinished -- the hourfile urls are generated for each year, but nothing is
        downloaded, filtered or loaded into the table yet.
    """
    # check on the connection that was passed in, not on does_table_exist's default
    if not does_table_exist('pageviews', con=conn):
        create_pageviews_table(conn)

    for year in years:
        # TODO: download each hourfile, subset it by project and append it to the table
        urls = get_pageviews_urls_by_year(year)
    

In [313]:
pd.read_sql('desc pageviews;', con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,row_id,bigint(20),NO,PRI,,auto_increment
1,domain_code,text,YES,,,
2,page_title,text,YES,,,
3,count_views,bigint(20),YES,,,
4,total_response_size,bigint(20),YES,,,
5,datetime_viewed_UTC,timestamp,NO,,0000-00-00 00:00:00,
6,datetime_added_UTC,timestamp,NO,,current_timestamp(),


In [116]:
# Download the second url in `urls` into `out_dirpath`.
# NOTE(review): `urls` and `out_dirpath` are not assigned in any cell visible in this
# notebook -- presumably left over from an earlier kernel session; confirm before a fresh run.
download_file(urls[1], dirpath=out_dirpath)

'../data/processed/pageviews/pageviews-20210101-010000.gz'

In [176]:
filename0 = 'pageviews-20210101-000000.gz'

In [177]:
df0 = get_pageviews_subset_by_project(out_dirpath + filename0)

In [184]:
# Append the rows of df0 to the existing `pageviews` table; index=False keeps the
# dataframe index out of the insert.
# NOTE(review): with if_exists='append', running this cell again appends the same rows again.
df0.to_sql(name='pageviews',con=conn, if_exists='append', index=False)

In [186]:
sql = 'select * from pageviews limit 10'
pd.read_sql(sql, con=conn, index_col='row_id')

Unnamed: 0_level_0,domain_code,page_title,count_views,total_response_size,datetime_viewed_UTC,datetime_added_UTC
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,ja,"Hello,_world.",1,0,2021-01-01,2022-03-20 08:25:56
2,ja,$百萬BABY,1,0,2021-01-01,2022-03-20 08:25:56
3,ja,'81秋・全番組総出演!激唱!!オールスター,1,0,2021-01-01,2022-03-20 08:25:56
4,ja,(ry,1,0,2021-01-01,2022-03-20 08:25:56
5,ja,+Lhaca,1,0,2021-01-01,2022-03-20 08:25:56
6,ja,-,138,0,2021-01-01,2022-03-20 08:25:56
7,ja,.260レミントン,1,0,2021-01-01,2022-03-20 08:25:56
8,ja,.300ウィンチェスター・ショート・マグナム,2,0,2021-01-01,2022-03-20 08:25:56
9,ja,.308ウィンチェスター,1,0,2021-01-01,2022-03-20 08:25:56
10,ja,.338ラプア・マグナム,1,0,2021-01-01,2022-03-20 08:25:56


In [187]:
filename1 = 'pageviews-20210101-010000.gz'

In [188]:
df1 = get_pageviews_subset_by_project(out_dirpath + filename1)

In [189]:
df1.to_sql(name='pageviews',con=conn, if_exists='append', index=False)

In [190]:
pd.read_sql('select count(*) from pageviews;', con=conn)

Unnamed: 0,count(*)
0,206956


In [192]:
pd.read_sql('select * from pageviews limit 5;', con=conn, index_col='row_id')

Unnamed: 0_level_0,domain_code,page_title,count_views,total_response_size,datetime_viewed_UTC,datetime_added_UTC
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,ja,"Hello,_world.",1,0,2021-01-01,2022-03-20 08:25:56
2,ja,$百萬BABY,1,0,2021-01-01,2022-03-20 08:25:56
3,ja,'81秋・全番組総出演!激唱!!オールスター,1,0,2021-01-01,2022-03-20 08:25:56
4,ja,(ry,1,0,2021-01-01,2022-03-20 08:25:56
5,ja,+Lhaca,1,0,2021-01-01,2022-03-20 08:25:56


## get the files

###### get many urls

In [None]:
'https://dumps.wikimedia.org/other/pageviews/2021/2021-01/pageviews-20210101-000000.gz'

In [149]:
pd.read_sql('desc pageviews;', con=conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,index,bigint(20),YES,MUL,,
1,domain_code,text,YES,,,
2,page_title,text,YES,,,
3,count_views,bigint(20),YES,,,
4,total_response_size,bigint(20),YES,,,
5,datetime_viewed_UTC,timestamp,YES,,,
6,datetime_added_UTC,timestamp,YES,,,


##### populate the database

###### test to_sql

In [18]:
temp_fpath = download_file(url=urls[0], dirpath=out_dirpath)

###### get a page_title-filtered hourfile from txt as a dataframe

###### get a project-filtered  hourfile as txt, then df, then mariadb table

In [33]:
# NOTE(review): `subset_pageviews_by_project` and `out_dirpath` are not defined in any cell
# visible in this notebook (only get_pageviews_subset_by_project is) -- presumably an older
# helper whose definition was removed; confirm before a fresh run.
subset_pageviews_by_project(temp_fpath,out_dirpath+'test2.txt')

###### pageviews as df

In [56]:
# Load the space-separated text file test2.txt from `out_dirpath` into a dataframe with the
# four pageviews columns named explicitly (names= is given, so no row is read as a header).
df = pd.read_csv(out_dirpath+'test2.txt', sep=' ', 
                 names=['domain_code','page_title','count_views','total_response_size'])

In [53]:
some_time = dt.datetime(2021,1,1,0,0, tzinfo=dt.timezone.utc)

In [98]:
os.listdir('../data/processed/pageviews/2021/2021-01/')

[]

##### download many files (OBSOLETE, SWITCHING TO MARIADB ON EC2)

###### peek

In [None]:
# !jupyter nbconvert --to script '1.22-sfb-get-pageviews-with-AWS-lambda.ipynb'

##### -----

###### ------

##### test running cell while laptop sleeping

###### idea

- Consider the case where:
    - I start the following cell's script as follows:
        - on AWS, from an SSH terminal
        - via nohup jupyter, on computer A
    - then I disconnect the SSH session from computer A
        - so that the cell is still running on the instance
- Then I find that:
    - I can reopen Jupyter from another SSH terminal
    - I can interrupt the script
    - but I can't see its in-progress output
- Strategy for now:
    - develop the script on whatever computer
    - then run it from a browser tab on my Android phone
        - that way I can keep "tabs" on it (😉)

###### test code

In [None]:
from IPython.display import clear_output
import time

# Long-running dummy job for the disconnect test: shows a counter that advances once per
# second for 600 iterations, replacing the previous number each time.
for i in range(600):
    print(i)
    time.sleep(1)
    # wait=True holds the clear until the next output arrives, so the cell doesn't flicker
    clear_output(wait=True)

##### download many files (OBSOLETE, SWITCHING TO MARIADB ON EC2)

###### function ```subset_pageviews_by_page_titles``` to clean and filter pageview records

###### function ```subset_pageviews_by_project``` to clean and filter pageview records

###### unpickle page list

###### FOUND OUT THAT THERE ARE SOME UNRELATED COLUMNS IN THE CATEGORIES:
NEED TO CLEAN THE CATEGORIES