# mvp-eda-edit-count-correlations

## intro

#### plan

- Use lags of edits in some categories to predict edits in other categories
    1. get the first row of the Pearson correlation matrix among:
        - edit-counts in a target category by day
        - 1d-lagged-edit-counts of many other categories
    2. present treating each category as target and the others as features
    3. do this with 2013-2015, then train a multivariate linear regression on 2013-2015 and predict 2016.

#### imports

In [55]:
import pandas as pd, numpy as np, os, re, pyperclip, pickle
import mysql.connector as mysql, sqlalchemy
from datetime import datetime as dt

#### connect to mariadb

##### pass

In [6]:
# MariaDB account name read by connect_mariadb().
mysql_user = 'bhrdwj'
# NOTE(review): connect_mariadb() also reads mysql_pass, but the prompt that
# would set it is commented out, so mysql_pass must already exist in the
# session before connecting -- confirm how it is meant to be supplied.
# mysql_pass = input(f'Enter the MySQL password for user {mysql_user}: ')

##### connect

In [69]:
def connect_mariadb():
    """
    Open two handles to the local 'jawiki' database: a raw mysql.connector
    connection and a SQLAlchemy connection.

    Credentials are read from the notebook-level names mysql_user and
    mysql_pass, which must be defined before this is called.

    Output:
        cxn     mysql.connector connection
        cur     cursor created from cxn
        engine  SQLAlchemy engine, or None if it could not be created
        conn    SQLAlchemy connection, or None if it could not be opened
    A failure of the mysql.connector connection itself is not caught and
    propagates to the caller.
    """
    from urllib.parse import quote_plus

    host = 'localhost'
    user = mysql_user
    passwd = mysql_pass
    dbname = 'jawiki'

    cxn = mysql.connect(host=host, user=user, passwd=passwd, database=dbname)
    cur = cxn.cursor()

    # Percent-encode the credentials: a literal '@', ':' or '/' in the password
    # would otherwise be read as part of the URL structure.
    # No port is given in the URL (an earlier +':'+dbport after host was removed).
    connection_str = (
        'mysql+mysqlconnector://'
        + quote_plus(user) + ':' + quote_plus(passwd)
        + '@' + host + '/' + dbname
    )

    # Pre-bind both names so that, when the SQLAlchemy side fails, the return
    # below still works instead of raising UnboundLocalError and hiding the
    # message printed in the except branch.
    engine = None
    conn = None
    try:
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception as e:
        print('Database connection error - check creds')
        print(e)
    return cxn, cur, engine, conn
        
cxn, cur, engine, conn = connect_mariadb()

##### test

In [18]:
# Sanity check: list the tables visible through the SQLAlchemy connection.
# The connection is handed to reflect() instead of the MetaData() constructor:
# binding via the constructor was deprecated in SQLAlchemy 1.4 and removed in
# 2.0, whereas reflect(bind=...) is accepted by both.
metadata = sqlalchemy.MetaData()
metadata.reflect(bind=conn)
current_tables = list(metadata.tables)
current_tables

['category',
 'categorylinks',
 'cats_to_selfjoin',
 'h2013',
 'h2014',
 'h2015',
 'h2016',
 'h2017',
 'h2018',
 'h2019',
 'h2020',
 'h2021',
 'history',
 'page',
 't2013',
 't2014',
 't2015',
 't2016',
 't2017',
 't2018',
 't2019',
 't2020',
 't2021',
 'warn_rows']

#### some paths

In [19]:
rawdumps_path = '../data/raw/jawiki/dumps_unzipped/'
processed_path = '../data/processed/jawiki/'

#### functions to read from MySQL dumpfiles

##### function ```jpr_sql``` 

In [20]:
def jpr_sql(query, conn=None):
    """
    Super-basic wrapper of SQLAlchemy conn.execute()
    Input:
        query string
        conn SQLAlchemy connection; when omitted, the notebook-level `conn`
             is looked up at call time
    Output:
        list / result of conn.execute().fetchall(),
        or None when executing or fetching raised (the error is printed)
    """
    if conn is None:
        # Resolve the connection now rather than at definition time, so a
        # later `cxn, cur, engine, conn = connect_mariadb()` is picked up
        # instead of a stale connection captured when this cell first ran.
        conn = globals()['conn']

    try:
        a = conn.execute(query)
    except Exception as e:
        print('Printing execute error:')
        print(e)
        return None

    try:
        return a.fetchall()
    except Exception as e:
        print('Printing results error:')
        print(e)
        # Show the result object itself; some statements return no rows to fetch.
        print('Printing result')
        print(a)
        return None

##### function ```byte_read_sql```

In [74]:
def decode_if_it_can(value):
    """
    Tries to decode a possible bytes object with utf-8.
    If the object has no .decode() or its bytes are not valid utf-8,
    silently returns the input unchanged. Any other exception propagates.
    INPUTS: object
    OUTPUTS: input or decoded string
    """
    # No `return` inside a `finally` here: that form would swallow every
    # exception (KeyboardInterrupt included), not only the two handled below.
    try:
        return value.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        return value

def byte_read_sql(query, conn):
    """
    Simple wrapper for pd.read_sql() for mediawiki dumps including byte-like data.
    Inputs:
            SQL query string
            SQLAlchemy connection
    Outputs:
            Pandas dataframe with some values modified, that is:
                values that can be decoded with .decode('utf-8') are thus decoded.
    """
    df = pd.read_sql(query, conn)
    # pandas 2.1 renamed the element-wise DataFrame.applymap to DataFrame.map
    # and deprecated the old name; use whichever this pandas version provides.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    return elementwise(decode_if_it_can)

#### load pickled ```disaster_descendants```

In [207]:
# Load the previously pickled disaster_descendants object from processed_path.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that were produced by this project.
with open(processed_path + 'disaster_descendants.pickle', 'rb') as f:
    disaster_descendants = pickle.load(f)

## define function count_edits_in_all_descendant_pages

##### setup for count

In [208]:
cxn, cur, engine, conn = connect_mariadb()

In [241]:
# Disaster category name -> id (per the variable name, the category's page id).
# Left out here: '自然災害' (natural disasters), id 137069.
disaster_cat_page_ids = {
    '火山災害': 2390743,
    '熱帯低気圧': 626482,
    '雪害': 2390774,
    '地震': 135264,
    '津波': 765772,
}
# English label for each of the same category names.
disasters_english = {
    '火山災害': 'VolcanicDisaster',
    '熱帯低気圧': 'TropicalCyclones',
    '雪害': 'SnowDamage',
    '地震': 'Earthquake',
    '津波': 'Tsunami',
}
disasters = list(disaster_cat_page_ids)

# For every key of disaster_descendants, keep only the rows whose type is
# 'subcat' and reduce them to a frame indexed by unique id (first occurrence
# wins) with a single 'name' column.
disaster_subcats = {
    disaster: (
        disaster_descendants[disaster]
        .loc[lambda frame: frame['type'] == 'subcat', ['name', 'id']]
        .drop_duplicates(subset='id')
        .set_index('id')
    )
    for disaster in disaster_descendants
}

In [242]:
gen = (i for i in disaster_subcats)

In [None]:
disaster_subcats[next(gen)]

StopIteration: 

In [None]:
dupes.reset_index().name.unique()

##### try this

##### create temporary table for a kind of disaster

In [None]:
byte_read_sql("""
select * 
from cats_to_selfjoin
where id in (2390743)
;
""", conn)

In [71]:
jpr_sql("""
desc cats_to_selfjoin
;
""")

[('id', 'int(8) unsigned', 'NO', 'PRI', '0', ''),
 ('name', 'varbinary(230)', 'NO', '', '', ''),
 ('type', "enum('page','subcat','file')", 'NO', '', 'page', ''),
 ('namespace', 'int(11)', 'NO', '', '0', ''),
 ('parent_cat_id', 'int(8) unsigned', 'NO', 'PRI', '0', ''),
 ('parent_cat_name', 'varbinary(255)', 'NO', '', '', '')]

##### count edits to pages

In [None]:
def count_edits_to_pages(pages: list[int], year: int, conn=None):
    """
    Count the rows of table h<year> that are 'revision' events on the given
    pages and have both bot-flag columns NULL.

    Input:
        pages   page ids to count edits for
        year    selects the table, e.g. 2013 -> h2013
        conn    connection accepted by pd.read_sql; when omitted, the
                notebook-level `conn` is looked up at call time
    Output:
        number of matching rows (0 when pages is empty)
    """
    if conn is None:
        # Resolve late so a re-run of connect_mariadb() is picked up instead
        # of a connection captured when this cell was first executed.
        conn = globals()['conn']

    # int() keeps anything but whole numbers out of the SQL text below.
    page_ids = [int(page) for page in pages]
    if not page_ids:
        # `page_id in ()` is a SQL syntax error; no pages means no edits.
        return 0

    stringified_pageids = ", ".join(map(str, page_ids))

    sql = r"""
    select
        count(row_id) as num_edits
    from h""" + str(int(year)) + r"""
    where
        page_id in (""" + stringified_pageids + r""")
        and event_entity = 'revision'
        and user_is_bot_by IS NULL
        and event_user_is_bot_by IS NULL

    ;
    """

    return pd.read_sql(sql, conn).num_edits.sum()

In [None]:
# 2013 edit count per disaster. The index of each disaster_descendants entry
# is passed as the page list -- presumably page ids; confirm.
disaster_edit_counts = {
    disaster: count_edits_to_pages(
        disaster_descendants[disaster].index.to_list(), 2013
    )
    for disaster in disasters
}
pd.Series(disaster_edit_counts)

41827

##### and this

In [None]:
# NOTE(review): this query cannot run as written -- `from h2013` comes after
# the `where` clause, and the IN list holds the names id, name, type, namespace
# (columns reported by `desc cats_to_selfjoin`) where a list of page ids is
# presumably intended. Confirm the intended query before relying on it.
pd.read_sql(
    """
    select
        count(row_id) as num_edits
    where
        page_id in (id, name, type, namespace)
        and event_entity = 'revision'
        and user_is_bot_by IS NULL
        and event_user_is_bot_by IS NULL

    from h2013
    ;
    """
    ,conn
)

In [None]:
# Copy the function's result for category id 626482 ('熱帯低気圧') and year 2013 to the clipboard.
# NOTE(review): count_edits_in_all_descendant_pages is not defined in the
# visible part of this notebook -- confirm where it comes from.
pyperclip.copy(count_edits_in_all_descendant_pages(626482,2013))

```sql
select
    count(row_id) as num_edits
where
    page_id in (id, name, type, namespace)
    and event_entity = 'revision'
    and user_is_bot_by IS NULL
    and event_user_is_bot_by IS NULL

from h2013
;
```

In [None]:
count_edits_in_all_descendant_pages(626482,2013)

In [None]:
import inspect
print(inspect.getsource(count_edits_in_all_descendant_pages))

##### want to get here

In [None]:
# count_edits_in_each_descendant_cat
# NOTE(review): count_edits_in_all_descendant_pages is not defined in the
# visible part of this notebook -- confirm where it comes from.
# 626482 is the id listed for '熱帯低気圧' in disaster_cat_page_ids.
熱帯低気圧2013_descendant_edit_counts = count_edits_in_all_descendant_pages(626482,2013)

### end section

## pick subcategories to predict category

### get annual edit counts

# END