# get pageviews with AWS

In [3]:
import os  # imported here because this cell runs before the notebook's main import cell

mysql_user = 'ubuntu'
# The password is read from the environment instead of being typed in or hard-coded,
# so the notebook can run unattended and no secret is stored in the file.
# Export MYSQL_PASSWORD before starting Jupyter; an empty string is used when it is
# unset, in which case the database login below will be rejected by the server.
mysql_pass = os.environ.get('MYSQL_PASSWORD', '')

#### how to do jupyter with aws

- https://dataschool.com/data-modeling-101/running-jupyter-notebook-on-an-ec2-server/
    - note: the file that guide calls `jupyter_notebook_config.py_` should be `jupyter_notebook_config.py` (no trailing underscore)
- https://gist.github.com/J535D165/0e840291e7b2598ec157e13e9b9ca569
    - trying this for how to use nohup
- some medium [article](https://medium.com/@christinakouride/a-beginners-guide-to-running-jupyter-notebook-on-amazon-ec2-69e1b74e73cc#:~:text=Your%20Jupyter%20Notebook%20server%20will,for%20time%20not%20using%20it.)
    - they pointed out that notebook will keep running, but didn't mention nohup

In [4]:
import os, requests, gzip, pickle, io, logging, inspect, functools
import pandas as pd, datetime as dt
import mysql.connector as mysql, sqlalchemy

In [5]:
# Print the interpreter version so the run environment is recorded with the notebook.
# The `list[str]` annotations used further down require Python 3.9 or newer.
from platform import python_version
print(python_version())

3.9.7


In [6]:
# Date stamp (YYYY-MM-DD) read from a clock fixed at UTC+8; it becomes part of the
# log file name, followed by a constant "_0" suffix.
todays_date = format(dt.datetime.now(tz=dt.timezone(dt.timedelta(hours=8))), '%Y-%m-%d')
logfilepath = f'../data/logs/pageviews-log_{todays_date}_{0}.txt'

## setup logging

In [7]:
logger = logging.getLogger('1.22-sfb-get-pageviews-with-AWS')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object for the same name, so re-running this
# cell would otherwise attach a second pair of handlers and duplicate every record.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# FileHandler does not create missing directories; make sure the target exists.
log_dir = os.path.dirname(logfilepath)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# one formatter shared by both handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# file handler: keeps everything, including debug messages
fh = logging.FileHandler(logfilepath)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

# console handler: only errors and above reach the notebook output
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)

# USAGE:
# logger.info('foobarbazquxquuxquuzcorgegraultgarplywaldofredplughxyzzythud')

# https://docs.python.org/3/howto/logging-cookbook.html

In [8]:
def log_args(func):
    """
    Decorator that logs every call to `func` together with its arguments.

    A "start" record is written at INFO level before the call and a "finish"
    record after it (also when the call raises). Both list each parameter as
    ``name = repr(value)``, including parameters left at their default value.
    A repr longer than 200 characters is shortened, so a large argument (for
    example a list of thousands of page titles) does not flood the log file.

    Raises TypeError, before `func` runs, when the call does not match the
    signature of `func`.
    """
    max_repr_len = 200
    signature = inspect.signature(func)  # resolved once here, not on every call

    def describe(value):
        # repr of one argument, cut down to max_repr_len characters
        text = repr(value)
        if len(text) > max_repr_len:
            text = f"{text[:max_repr_len]}... <{len(text)} chars>"
        return text

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        bound = signature.bind(*args, **kwargs)
        bound.apply_defaults()  # without this, defaulted parameters are not listed
        func_args_str = ", ".join(
            f"{name} = {describe(value)}" for name, value in bound.arguments.items()
        )
        logger.info(f"start {func.__module__}.{func.__qualname__} ( {func_args_str} )")
        try:
            return func(*args, **kwargs)
        finally:
            logger.info(f"finish {func.__module__}.{func.__qualname__} ( {func_args_str} )\n")
    return wrapper

In [9]:
def log_simply(func):
    """
    Decorator that logs only the start and the finish of each call to `func`.

    Argument values are neither inspected nor logged, which suits functions that
    receive secrets (it is applied to connect_mariadb, which takes a password).
    The "finish" record is written even when the call raises.
    """
    call_name = f"{func.__module__}.{func.__qualname__}"

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger.info(f"start {call_name}")
        try:
            return func(*args, **kwargs)
        finally:
            logger.info(f"finish {call_name}\n")
    return wrapper

In [10]:
def log_errors(func):
    """
    Decorator that records any exception escaping `func` in the log and then
    re-raises it unchanged.

    The record is written with ``logger.exception``, so the traceback is part of it.
    Because of the bare ``except`` every exception, including KeyboardInterrupt,
    is logged; nothing is swallowed since it is always re-raised.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except:
            logger.exception(f"There was an exception in  {func.__name__}")
            raise
        else:
            return result
    return wrapper
# https://www.blog.pythonlibrary.org/2016/06/09/python-how-to-create-an-exception-logging-decorator/

##### login to mariadb

In [11]:
@log_simply
def connect_mariadb(host='localhost', user=mysql_user, passwd=mysql_pass, dbname='jawiki'):
    """
    Open two connections to the same MariaDB database.

    Returns the tuple (cxn, cur, engine, conn):
        cxn    -- mysql.connector connection
        cur    -- cursor created on cxn
        engine -- SQLAlchemy engine using the mysql+mysqlconnector driver
        conn   -- SQLAlchemy connection obtained from engine

    Raises whatever mysql.connector or SQLAlchemy raise when the login fails.
    A failure while building the SQLAlchemy side is logged, the already opened
    mysql.connector connection is closed, and the original error is re-raised.
    """
    from urllib.parse import quote  # local import: only needed to build the URL

    cxn = mysql.connect(host=host,user=user,passwd=passwd, database=dbname)
    try:
        cur = cxn.cursor()
        # Percent-encode the credentials: characters such as '@', ':' or '/' in a
        # password would otherwise be read as URL delimiters.
        connection_str = (
            'mysql+mysqlconnector://'
            f"{quote(user, safe='')}:{quote(passwd, safe='')}@{host}/{dbname}"
        )
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception:
        # Previously the error was only printed and the function then failed on the
        # return statement with an unrelated UnboundLocalError.
        logger.exception('Database connection error - check creds')
        cxn.close()
        raise
    return cxn, cur, engine, conn

cxn, cur, engine, conn = connect_mariadb()

## get page_titles and urls

## functions

###### function get_pageviews_urls_by_year

In [12]:
@log_args
def get_pageviews_urls_by_year(year:int) -> list[str]:
    """
    Programmatically generate the urls of all pageviews hourfiles of one year.
    INPUT: year as integer
    OUTPUT: list of urls, one per hour of every day from 1 January to 31 December
            of that year (24 * 365 entries, or 24 * 366 in a leap year)
    NOTE: filenames are formatted as: 
        'pageviews-20210101-000000.gz', i.e.
        'pageviews-yyyymmdd-hhmmss.gz', although 'mmss' is always '0000'    
    """
    base_url = 'https://dumps.wikimedia.org/other/pageviews/'
    # pd.date_range includes both end points. Ending on 1 January of the following
    # year would add 24 files that belong to the next year, and those would be
    # loaded a second time when consecutive years are processed.
    dates = pd.date_range(dt.datetime(year, 1, 1),
                          end=dt.datetime(year, 12, 31),
                          freq='D')
    return [
        f'{base_url}{d:%Y}/{d:%Y-%m}/pageviews-{d:%Y%m%d}-{hour:02d}0000.gz'
        for d in dates
        for hour in range(24)  # hours '00' through '23'
    ]

##### function download_file to temporarily store hourfiles

In [13]:
@log_args
def download_file(url, dirpath='./', timeout=60):
    """
    Stream the file at `url` into the local directory `dirpath`.

    INPUT:
        url: address of the file; the local file name is the last path segment
        dirpath: target directory, created when it does not exist
        timeout: seconds to wait for the connection and for each chunk of data
                 before giving up (not a limit on the total download time)
    OUTPUT: path of the downloaded file
    RAISES: requests.HTTPError for a 4xx/5xx response, other requests exceptions
            for connection problems and timeouts
    """
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    # os.path.join also works when dirpath has no trailing slash
    local_filepath = os.path.join(dirpath, url.split('/')[-1])
    # stream=True keeps the response body out of memory; it is read chunk by chunk
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        try:
            with open(local_filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=131072):
                    f.write(chunk)
        except BaseException:
            # never leave a truncated file behind for a later step to parse
            if os.path.exists(local_filepath):
                os.remove(local_filepath)
            raise
    return local_filepath
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests

In [14]:
@log_args
def fread_from_url(url, timeout=60):
    """
    Read the file at `url` into memory and return it as text.
    For smaller files only (100mb maybe?)

    INPUT:
        url: address of the file
        timeout: seconds to wait for the connection and for each read
    RAISES: requests.HTTPError for a 4xx/5xx response
    """
    # the original body referenced an undefined name (`target_url`) instead of `url`
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests

###### function get_pageviews_subset_by_project to get all jawiki pageviews from an hourfile

In [15]:
@log_args
def open_hour_file(path:str):
    """
    Open an hour file for reading as UTF-8 text.

    INPUT: path to the file; a path ending in '.gz' is decompressed on the fly,
           any other path is opened as plain text
    OUTPUT: open text-mode file handle; undecodable bytes are replaced rather
            than raising
    USAGE:
        with open_hour_file(foo) as bar:
            baz
    """
    opener = gzip.open if path[-3:] == ".gz" else open
    return opener(path, mode="rt", encoding="utf-8", errors="replace")
# https://github.com/mediawiki-utilities/python-mwviews/blob/main/src/mwviews/utilities/aggregate.py
# https://stackoverflow.com/questions/30582162/creating-a-missing-directory-file-structure-python

In [16]:
@log_args
def parse_utc_time_from_filename(filename:str) -> dt.datetime:
    """
    Turn an hour file name into the timezone-aware UTC datetime it stands for.

    The name must consist of exactly three '-'-separated parts, e.g.
        'pageviews-20210101-000000.gz', i.e.
        'pageviews-yyyymmdd-hhmmss.gz'
    Only the date and the hour are used; minutes and seconds are ignored.
    """
    _, date_part, time_part = filename.split('-')
    year, month, day = int(date_part[:4]), int(date_part[4:6]), int(date_part[6:])
    hour = int(time_part[:2])
    return dt.datetime(year, month, day, hour, tzinfo=dt.timezone.utc)

In [17]:
@log_args
def get_pageviews_subset_by_project(fpath_in, proj:str='ja') -> pd.DataFrame:
    """
    Extract all records of one project from a pageviews hour file.

    INPUT:
        fpath_in: local filepath to the hour file; each record is one line of
                  four space-separated fields (see the column names below)
        proj: mediawiki project code; matched against the whole first field, so
              codes of any length work and 'ja' does not match 'ja.m'
    OUTPUT:
        dataframe ready for database ingestion with the four record columns plus
        datetime_viewed_UTC (from the file name) and datetime_added_UTC (now).
        The frame is empty, with the same columns, when no record matches.
    """
    colnames=['domain_code','page_title','count_views','total_response_size']
    prefix = proj + ' '
    rows = []
    skipped = 0
    with open_hour_file(fpath_in) as f_in:
        for line in f_in:
            if not line.startswith(prefix):
                continue
            # Split by hand instead of going through a CSV parser: titles may
            # contain quote characters or read like 'NA'/'null', and a CSV parser
            # would treat those as quoting or as missing values.
            fields = line.rstrip('\r\n').split(' ')
            if len(fields) == len(colnames):
                rows.append(fields)
            else:
                skipped += 1
    if skipped:
        logger.warning(f'skipped {skipped} malformed line(s) in {fpath_in}')
    df = (pd.DataFrame(rows, columns=colnames)
            .astype({'count_views': 'int64', 'total_response_size': 'int64'}))
    # the hour the views were counted in is only encoded in the file name
    parsed_datetime = parse_utc_time_from_filename(os.path.basename(fpath_in))
    now_datetime = dt.datetime.now(tz=dt.timezone.utc)
    return df.assign(datetime_viewed_UTC=parsed_datetime,
                     datetime_added_UTC=now_datetime)

In [18]:
@log_args
def get_pageviews_subset_by_proj_and_pagetitles(fpath_in, proj:str='ja', pagetitles:list=[]) -> pd.DataFrame:
    """
    Extract the records of one project, restricted to a whitelist of page titles,
    from a pageviews hour file.

    INPUT:
        fpath_in: local filepath to the hour file; each record is one line of
                  four space-separated fields (see the column names below)
        proj: mediawiki project code; matched against the whole first field, so
              codes of any length work and 'ja' does not match 'ja.m'
        pagetitles: whitelist of page_titles (the default list is never mutated);
                    an empty whitelist selects nothing
    OUTPUT:
        dataframe ready for database ingestion with the four record columns plus
        datetime_viewed_UTC (from the file name) and datetime_added_UTC (now).
        The frame is empty, with the same columns, when no record matches.
    """
    colnames=['domain_code','page_title','count_views','total_response_size']
    prefix = proj + ' '
    wanted = set(pagetitles)  # constant-time lookups instead of scanning a list per line
    rows = []
    skipped = 0
    with open_hour_file(fpath_in) as f_in:
        for line in f_in:
            if not line.startswith(prefix):
                continue
            # Split by hand instead of going through a CSV parser: titles may
            # contain quote characters or read like 'NA'/'null', and a CSV parser
            # would treat those as quoting or as missing values.
            fields = line.rstrip('\r\n').split(' ')
            if len(fields) < 2 or fields[1] not in wanted:
                continue
            if len(fields) == len(colnames):
                rows.append(fields)
            else:
                skipped += 1
    if skipped:
        logger.warning(f'skipped {skipped} malformed line(s) in {fpath_in}')
    df = (pd.DataFrame(rows, columns=colnames)
            .astype({'count_views': 'int64', 'total_response_size': 'int64'}))
    # the hour the views were counted in is only encoded in the file name
    parsed_datetime = parse_utc_time_from_filename(os.path.basename(fpath_in))
    now_datetime = dt.datetime.now(tz=dt.timezone.utc)
    return df.assign(datetime_viewed_UTC=parsed_datetime,
                     datetime_added_UTC=now_datetime)

###### function etl_pageviews_by_years_and_projects

In [19]:
@log_args
def does_table_exist(tablename:str, dbname:str='jawiki', con=conn) -> bool:
    """
    Report whether table `tablename` is listed for schema `dbname` in
    information_schema.tables, queried through connection `con`.

    Both names are interpolated into the SQL text without escaping, so only
    trusted identifiers may be passed.
    """
    sql = f"""
    SELECT * 
        FROM information_schema.tables
    WHERE table_schema = '{dbname}' 
        AND table_name = '{tablename}'
    LIMIT 1;
    """
    matches = pd.read_sql(sql, con)
    return len(matches) > 0

In [20]:
@log_args
def create_pageviews_table(con=conn):
    """
    Run the SQL statement that creates the `pageviews` table on connection `con`.

    The table has an auto-increment key, the four fields of a pageviews record
    and two timestamps (the hour viewed and the time the row was added).
    """
    sql = """
    CREATE TABLE pageviews (
        row_id BIGINT(20) AUTO_INCREMENT PRIMARY KEY
        ,domain_code TEXT
        ,page_title TEXT
        ,count_views BIGINT(20)
        ,total_response_size BIGINT(20)
        ,datetime_viewed_UTC TIMESTAMP DEFAULT 0
        ,datetime_added_UTC TIMESTAMP DEFAULT CURRENT_TIMESTAMP 
    )
    ;
    """
    # use the connection that was passed in; the original ignored `con` and always
    # executed on the module-level `conn`
    con.execute(sql)

###### calculate finish time

In [21]:
def log_forecast_of_completion(count, process_start_time, years):
    """
    Log the progress of the load and a linear forecast of when it will finish.

    INPUT:
        count: number of hour files processed so far
        process_start_time: naive datetime at which processing started
        years: the years being processed; only their number is used
    The total amount of work is estimated as 365.25 * 24 files per year. While no
    progress can be measured (count of 0 or no years) the finish time is reported
    as 'unknown' instead of raising ZeroDivisionError.
    """
    hours_in_year = 365.25*24
    num_years = len(years)
    frac_done = count / hours_in_year / num_years if num_years else 0.0
    process_time_so_far = dt.datetime.now() - process_start_time
    if frac_done > 0:
        # extrapolate: total duration = time spent so far / fraction completed
        finish_time = process_start_time + (process_time_so_far / frac_done)
    else:
        finish_time = 'unknown'
    logger.info(
        f'-----------\n'
        f'PROCESS TIME SO FAR:  {process_time_so_far}\n'
        f'COUNT:                {count}\n'
        f'FRACTION DONE:        {frac_done}\n'
        f'EXPECTED FINISH TIME: {finish_time}\n'
        '-----------\n'
    )

###### unpickle list of pagetitles

In [22]:
def failed_decode(a, encoding='utf-8'):
    """Return True when `a` cannot be decoded with `encoding`, False when it can."""
    try:
        a.decode(encoding)
    except UnicodeDecodeError:
        return True
    return False

def bytearray_to_str(a:bytearray, encoding='utf-8') -> str:
    """
    Decode a bytearray to str; any value that is not a bytearray is returned as is.

    Bytes are dropped from the end, one at a time, until the rest decodes with
    `encoding`, so a value cut off in the middle of a multi-byte character still
    yields the readable part. The same `encoding` is used for this check and for
    the final decode. The argument itself is not modified.
    """
    if not isinstance(a, bytearray):
        return a
    while failed_decode(a, encoding):
        a = a[:-1]
    return a.decode(encoding)

In [23]:
# NOTE(review): pickle.load can run arbitrary code contained in the file, so this
# must only ever be pointed at a pickle produced by this project.
pickle_path = '../data/processed/jawiki/disaster_descendants_raw.pickle'
with open(pickle_path, 'rb') as pickle_file:
    disaster_descendants_raw = pickle.load(pickle_file)

In [24]:
# Numeric id for each disaster category name (presumably the jawiki page id of the
# category - verify against the script that produced the pickle).
disaster_cat_page_ids = {
    # '自然災害': 137069,   (entry was commented out in the original as well)
    '火山災害': 2390743,
    '熱帯低気圧': 626482,
    '雪害': 2390774,
    '地震': 135264,
    '津波': 765772,
}
# English label for each category name
disasters_english = {
    '火山災害': 'VolcanicDisaster',
    '熱帯低気圧': 'TropicalCyclones',
    '雪害': 'SnowDamage',
    '地震': 'Earthquake',
    '津波': 'Tsunami',
}
# category names in the order they are listed above
disasters = [category for category in disaster_cat_page_ids]

In [25]:
# Clean the raw frames, one per key of disaster_descendants_raw:
#   1. keep one row per page id
#   2. decode bytearray cells to str
#   3. keep only rows in namespace 0
#   4. add page_title = last line of the `name` value
# Each step returns a new frame, so no column is ever assigned on a filtered slice
# (which triggered pandas' SettingWithCopyWarning) and no scratch variables are
# left behind in the notebook namespace.
# NOTE: `applymap` is deprecated from pandas 2.1 on in favour of `DataFrame.map`.
disaster_descendants = {
    category: (
        disaster_descendants_raw[category]
        .drop_duplicates(subset='id')
        .applymap(bytearray_to_str)
        .loc[lambda pages: pages['namespace'] == 0]
        .assign(page_title=lambda pages: pages['name'].map(
            lambda name: str(name).split(sep='\n')[-1]))
    )
    for category in disaster_descendants_raw
}

In [26]:
# Flat list of the page titles of every category, in category order
# (a title that occurs in several categories appears once per category).
all_titles = [
    title
    for pages in disaster_descendants.values()
    for title in pages.page_title
]

## main program

In [28]:
@log_errors
def etl_pageviews_by_years_and_projects(years:list[int], 
                                        pagetitles:list[str],
                                        project:str='ja', 
                                        logfilepath:str=logfilepath, 
                                        con=conn):
    """
    Download every pageviews hour file of the given years, keep the records of
    `project` whose title is in `pagetitles`, and append them to the `pageviews`
    table (created first when it does not exist yet).

    INPUT:
        years: years to process, in the order given
        pagetitles: whitelist of page titles
        project: mediawiki project code of the records to keep
        logfilepath: not used by this function; kept so existing calls keep working
        con: database connection used for the table check, creation and inserts
    Each hour file is stored under '../data/temp/' while it is processed and is
    removed afterwards, also when parsing or inserting fails. Any error aborts
    the run.
    """
    if not does_table_exist('pageviews', con=con):
        create_pageviews_table(con)
    # start counts
    process_start_time = dt.datetime.now()
    file_count = 0
    for year in years:
        urls = get_pageviews_urls_by_year(year)
        # urls = urls[:3] # truncate if debugging 
        for url in urls:
            temp_fpath = download_file(url, dirpath='../data/temp/')
            try:
                # `project` is forwarded here; it used to be silently ignored
                pageviews = get_pageviews_subset_by_proj_and_pagetitles(
                                temp_fpath, proj=project, pagetitles=pagetitles)
                pageviews.to_sql(name='pageviews',con=con, if_exists='append', index=False)
            finally:
                # the hour file is only needed while it is being parsed
                os.remove(temp_fpath)

            file_count += 1
            log_forecast_of_completion(file_count, process_start_time, years)

---

run main program

In [None]:
# Load the hour files of 2016 through 2021 for all collected page titles.
years_to_load = list(range(2016, 2022))
etl_pageviews_by_years_and_projects(years_to_load, all_titles)

---

In [None]:
# Number of rows currently stored in the pageviews table.
pd.read_sql('select count(*) from pageviews;', con=conn)

In [None]:
# Column definitions of the pageviews table as reported by the database.
pd.read_sql('desc pageviews;', con=conn)

In [31]:
# List the tables of the connected database (the saved output below has no rows).
pd.read_sql('show tables;', con=conn)

Unnamed: 0,Tables_in_jawiki


---

```sql
set password for ubuntu@localhost = PASSWORD('<your-password>');
show grants for ubuntu@localhost;
```

#### run this in root console
```sql
set global net_buffer_length=1000000; 
set global max_allowed_packet=1000000000;
```

---

---

---

---

---

---

---

#### run this in root console
```sql
set global net_buffer_length=1000000; 
set global max_allowed_packet=1000000000;
```

###### FOUND OUT THAT THERE'RE SOME UNRELATED COLUMNS IN THE CATEGORIES:
NEED TO CLEAN THE CATEGORIES