# find-all-category-descendents

## intro

#### plan

Make a function to get all the `page_id`s in a category, descending through its subcategories (ideally with a hierarchical self-join).

#### imports

In [316]:
import pandas as pd, numpy as np, os, re, pyperclip, pickle
import mysql.connector as mysql, sqlalchemy
from datetime import datetime as dt

#### connect to mariadb

##### pass

In [4]:
# Database account used for every connection; connect_mariadb() reads this name
# from the kernel namespace.
mysql_user = 'bhrdwj'
# connect_mariadb() also reads `mysql_pass` from the kernel namespace. The prompt
# below is disabled, so on a fresh kernel nothing in this cell defines it.
# NOTE(review): input() echoes what is typed into the cell output;
# getpass.getpass() would not.
# mysql_pass = input(f'Enter the MySQL password for user {mysql_user}: ')

##### connect

In [5]:
def connect_mariadb():
    """
    Connect to the local ``jawiki`` MariaDB database through both drivers
    used in this notebook.

    Reads the notebook-level ``mysql_user``. Uses the notebook-level
    ``mysql_pass`` when it is defined; otherwise prompts for the password
    without echoing it, so the function also works on a fresh kernel.

    OUTPUTS: tuple ``(cxn, cur, engine, conn)``
        cxn:    mysql.connector connection
        cur:    cursor opened on ``cxn``
        engine: SQLAlchemy engine for the same database
        conn:   SQLAlchemy connection obtained from ``engine``
    RAISES:
        Whatever the drivers raise. If the SQLAlchemy side fails, a hint is
        printed, the mysql.connector handles are closed, and the original
        exception is re-raised.
    """
    # Local imports keep this cell self-contained.
    from getpass import getpass
    from urllib.parse import quote_plus

    host = 'localhost'
    dbname = 'jawiki'
    user = mysql_user
    try:
        passwd = mysql_pass
    except NameError:
        passwd = getpass(f'Enter the MySQL password for user {user}: ')

    cxn = mysql.connect(host=host, user=user, passwd=passwd, database=dbname)
    cur = cxn.cursor()

    # Credentials are URL-escaped so characters such as '@', ':' or '/' in the
    # password cannot corrupt the connection URL. No port is given, so the
    # driver default is used.
    connection_str = (
        'mysql+mysqlconnector://'
        + quote_plus(user) + ':' + quote_plus(passwd)
        + '@' + host + '/' + dbname
    )
    try:
        engine = sqlalchemy.create_engine(connection_str)
        conn = engine.connect()
    except Exception as e:
        print('Database connection error - check creds')
        print(e)
        # Do not leak the already-open mysql.connector handles, and do not
        # fall through to the return with engine/conn unbound.
        cur.close()
        cxn.close()
        raise
    return cxn, cur, engine, conn
        
cxn, cur, engine, conn = connect_mariadb()

##### test

In [7]:
# Smoke test for the SQLAlchemy connection: reflect the schema and list the tables.
# MetaData(bind) ("bound metadata") is deprecated in SQLAlchemy 1.4 and removed in
# 2.0; handing the connection to reflect() works on both.
metadata = sqlalchemy.MetaData()
metadata.reflect(bind=conn)
current_tables = list(metadata.tables.keys())
current_tables

['category',
 'categorylinks',
 'h2013',
 'h2014',
 'h2015',
 'h2016',
 'h2017',
 'h2018',
 'h2019',
 'h2020',
 'h2021',
 'history',
 'page',
 't2013',
 't2014',
 't2015',
 't2016',
 't2017',
 't2018',
 't2019',
 't2020',
 't2021',
 'warn_rows']

#### some paths

In [8]:
rawdumps_path = '../data/raw/jawiki/dumps_unzipped/'
processed_path = '../data/processed/jawiki/'

#### function ```jpr_sql```

In [9]:
def jpr_sql(query, conn=None):
    """
    Execute a SQL statement and return all of its rows.

    INPUTS:
        query: SQL passed unchanged to ``conn.execute()``
        conn:  object exposing ``execute()``. When omitted, the notebook-level
               ``conn`` is looked up at call time, so a reconnect that rebinds
               ``conn`` is picked up (a ``conn=conn`` default would stay frozen
               on whatever connection existed when this cell was run).
    OUTPUTS:
        The result of ``.fetchall()`` on success.
        None when either the execute or the fetch raised; the error is printed.
    """
    if conn is None:
        conn = globals()['conn']

    try:
        a = conn.execute(query)
    except Exception as e:
        print('Printing execute error:')
        print(e)
        return None

    try:
        return a.fetchall()
    except Exception as e:
        print('Printing results error:')
        print(e)
        print('Printing result')
        return None

#### function ```byte_read_sql```

In [226]:
def decode_if_it_can(value):
    """
    Tries to decode a possible bytes object with utf-8.
    If the object has no .decode() or is not valid utf-8, returns the input unchanged.
    INPUTS: object
    OUTPUTS: input or decoded string
    """
    # No `finally: return`: that form swallows every exception (including
    # KeyboardInterrupt), not only the two that mean "this is not decodable".
    try:
        return value.decode('utf-8')
    except (UnicodeDecodeError, AttributeError):
        return value

def byte_read_sql(query, conn):
    """
    Simple wrapper for pd.read_sql() for mediawiki dumps including byte-like data.
    Inputs:
            SQL query string
            SQLAlchemy connection
    Outputs:
            Pandas dataframe with some values modified, that is:
                values that can be decoded with .decode('utf-8') are thus decoded.
    """
    df = pd.read_sql(query, conn)
    # DataFrame.applymap was renamed DataFrame.map in pandas 2.1 and the old name
    # is deprecated. The check is on the class so that a column called "map"
    # cannot be mistaken for the method on older pandas.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(decode_if_it_can)
    return df.applymap(decode_if_it_can)

## setup recursive self join to get the subcategories

### EDA ```categorylinks``` and ```page```

##### head

In [67]:
head = byte_read_sql('select * from categorylinks limit 10;',conn)

In [74]:
head.loc[0]

cl_from                                5
cl_to                            ラテン語の語句
cl_sortkey                あんはさんと\nアンパサンド
cl_timestamp         2018-05-13 07:37:48
cl_sortkey_prefix                 あんはさんと
cl_collation                   uppercase
cl_type                             page
Name: 0, dtype: object

##### describe categorylinks

In [69]:
categorylinks_schema = byte_read_sql('desc categorylinks', conn)

- **cl_from**: ```page.page_id``` of the member (the page or subcategory that belongs to the category)
- **cl_to**: parent category ```page_title```
- **cl_sortkey**: title by which the page should be sorted in a category list 
    - (not valid UTF-8 whenever the database truncates the sortkey in the middle of a multi-byte sequence.)
- cl_timestamp: time at which that link was last updated in the table.
- cl_sortkey_prefix: human readable version of cl_sortkey
- cl_collation: What collation is in use. (not content)
- **cl_type**: type of page (file, subcat (subcategory) or page (normal page))


In [70]:
categorylinks_schema

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,cl_from,int(8) unsigned,NO,PRI,0,
1,cl_to,varbinary(255),NO,PRI,,
2,cl_sortkey,varbinary(230),NO,,,
3,cl_timestamp,timestamp,NO,,current_timestamp(),on update current_timestamp()
4,cl_sortkey_prefix,varbinary(255),NO,,,
5,cl_collation,varbinary(32),NO,MUL,,
6,cl_type,"enum('page','subcat','file')",NO,,page,


##### page table schema

In [78]:
byte_read_sql("desc page;", conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,page_id,int(8) unsigned,NO,PRI,,auto_increment
1,page_namespace,int(11),NO,MUL,0.0,
2,page_title,varbinary(255),NO,,,
3,page_restrictions,varbinary(255),YES,,,
4,page_is_redirect,tinyint(1) unsigned,NO,MUL,0.0,
5,page_is_new,tinyint(1) unsigned,NO,,0.0,
6,page_random,double unsigned,NO,MUL,0.0,
7,page_touched,varbinary(14),NO,,,
8,page_links_updated,varbinary(14),YES,,,
9,page_latest,int(8) unsigned,NO,,0.0,


- Table ```page``` schema:  
    - **page_id**: 
    - **page_namespace**
    - **page_title**

##### page namespaces codebook

### prep for self-join

#### create table cats_to_selfjoin

- join categorylinks with page to denormalize and include ```parent_id```
- also, filter to include only content pages and categories

#### alter table cats_to_selfjoin add primary key

### get all descendants of a category

#### peek at ```cats_to_selfjoin```

##### describe

In [168]:
pd.read_sql('desc cats_to_selfjoin;',conn)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,id,int(8) unsigned,NO,,0,
1,name,varbinary(230),NO,,,
2,type,"enum('page','subcat','file')",NO,,page,
3,namespace,int(11),NO,,0,
4,parent_cat_id,int(8) unsigned,NO,,0,
5,parent_cat_name,varbinary(255),NO,,,


##### get immediate children

In [193]:
byte_read_sql("select * from cats_to_selfjoin where parent_cat_id = 626482;", conn)

Unnamed: 0,id,name,type,namespace,parent_cat_id,parent_cat_name
0,10245,こりおりのちから\nコリオリの力,page,0,626482,熱帯低気圧
1,15090,たいふう\n台風,page,0,626482,熱帯低気圧
2,26863,さいくろん\nサイクロン,page,0,626482,熱帯低気圧
3,31168,*\n熱帯低気圧,page,0,626482,熱帯低気圧
4,36650,せきらんうん\n積乱雲,page,0,626482,熱帯低気圧
5,94451,はりけん\nハリケーン,page,0,626482,熱帯低気圧
6,147548,ういりういり\nウィリー・ウィリー,page,0,626482,熱帯低気圧
7,159919,たいふう\n台風,subcat,14,626482,熱帯低気圧
8,527125,ふしわらのこうか\n藤原の効果,page,0,626482,熱帯低気圧
9,626516,はりけん\nハリケーン,subcat,14,626482,熱帯低気圧


#### get all descendants

In [221]:
# Recursive CTE over cats_to_selfjoin:
#   anchor member    - rows whose parent_cat_id is 626482 (the immediate children)
#   recursive member - rows whose parent_cat_id equals the id of a row already in
#                      the CTE, i.e. children of anything found so far
# `union all` keeps duplicates, so a row reachable through several parent chains
# comes back once per chain.
# NOTE(review): the query has no stopping condition of its own if the category
# graph contains a cycle - confirm the server's recursion limit.
sql = """
with recursive cte (id, name, type, namespace, parent_cat_id) as (
  select     id,
             name,
             type,
             namespace,
             parent_cat_id
  from       cats_to_selfjoin
  where      parent_cat_id = 626482
  union all
  select     p.id,
             p.name,
             p.type,
             p.namespace,
             p.parent_cat_id
  from       cats_to_selfjoin p
  inner join cte
          on p.parent_cat_id = cte.id
)
select * from cte
"""

# Run the query; byte columns that are valid utf-8 are decoded to str.
tropstorm = byte_read_sql(sql, conn)

## Get pageids_by_category for various natural disasters

### create dictionary of disaster category names and page_id's

In [337]:
# Root categories to harvest: Japanese category name -> page_id of the category page.
# Currently left out: '自然災害':137069
disaster_cat_page_ids = {
    '火山災害': 2390743,
    '熱帯低気圧': 626482,
    '雪害': 2390774,
    '地震': 135264,
    '津波': 765772,
}

# English label for each of the Japanese category names above.
disasters_english = {
    '火山災害': 'VolcanicDisaster',
    '熱帯低気圧': 'TropicalCyclones',
    '雪害': 'SnowDamage',
    '地震': 'Earthquake',
    '津波': 'Tsunami',
}

# Category names in dictionary insertion order.
disasters = list(disaster_cat_page_ids)

### pick page_id lists

#### function get_all_descendants_by_namespace

##### function docstring

```python
get_all_descendants_by_namespace(cat_page_id:int, namespaces: list[int] = []) -> pd.DataFrame:
```

- Get all descendant records from cats_to_selfjoin, filtered by namespace.  
- Inputs:  
    - cat_page_id: parent-category to get descendants of
    - namespaces: list of namespaces (ints) to select
        - 0: pages
        - 14: category-pages


##### function code

In [284]:
def get_all_descendants_by_namespace(cat_page_id:int, namespaces: list[int] = []) -> pd.DataFrame:
    """
    Get all descendant records from cats_to_selfjoin, filtered by namespace.

    Inputs:
        cat_page_id: parent-category to get descendants of
        namespaces: list of namespaces (ints) to select; empty means no filter
            0: pages
            14: category-pages
            (the default list is only read, never mutated)
    Outputs:
        DataFrame with the distinct (id, name, type, namespace) rows reachable
        from cat_page_id, read through byte_read_sql on the notebook-level
        `conn`. Row order is unspecified.
    Raises:
        ValueError / TypeError if cat_page_id or a namespace is not an integer
        value, so nothing but numeric literals is ever spliced into the SQL.
    """
    # int() both validates and normalises the values that are interpolated below.
    root_id = int(cat_page_id)
    namespace_ids = [int(ns) for ns in (namespaces or [])]

    # The namespace filter is applied to the final select only: the recursion
    # must still walk through subcategory rows even when only pages are wanted.
    if namespace_ids:
        namespace_condition = "where namespace in (" + ", ".join(map(str, namespace_ids)) + ")"
    else:
        namespace_condition = ""

    # `union` (not `union all`): rows already found are not added again, so the
    # recursion stops once no new rows appear - including when the category
    # graph contains a cycle - and duplicates are not carried along every path.
    # The final `select distinct` yields the same set of rows either way.
    sql = f"""
    with recursive cte (id, name, type, namespace, parent_cat_id) as (
      select     id,
                 name,
                 type,
                 namespace,
                 parent_cat_id
      from       cats_to_selfjoin
      where      parent_cat_id = {root_id}
      union
      select     p.id,
                 p.name,
                 p.type,
                 p.namespace,
                 p.parent_cat_id
      from       cats_to_selfjoin p
      inner join cte
              on p.parent_cat_id = cte.id
    )
    select distinct id, name, type, namespace from cte
    {namespace_condition}
    ;
    """
    return byte_read_sql(sql, conn)

#### get all descendants for various disasters as ```disaster_descendants```

##### prep loop

In [343]:
cxn, cur, engine, conn = connect_mariadb()

In [342]:
disaster_descendants = {}

In [345]:
# Short alias used by the loop cell to look up a category's page_id by name.
d = disaster_cat_page_ids
# Generator over the category names. The loop cell is re-run by hand and calls
# next(gen) each time, so every run advances to the next disaster; once all names
# are consumed next(gen) raises StopIteration. Re-running this cell starts over.
gen = (i for i in disaster_cat_page_ids.keys())

##### **loop**

In [366]:
disaster = next(gen)
disaster, disasters_english[disaster]

('津波', 'Tsunami')

In [367]:
# Time one full descendant query for the disaster selected by the previous cell.
start_time = dt.now()

# No namespace filter is passed, so rows of every namespace are kept; the frame is
# stored under the category name held in `disaster`.
disaster_descendants[disaster] = get_all_descendants_by_namespace(d[disaster])

end_time = dt.now()
# Last expression of the cell: elapsed wall-clock time in seconds.
(end_time - start_time).total_seconds()

277.51228

##### peek between loops

In [368]:
len(disaster_descendants)

5

In [369]:
disaster_descendants[disaster]

Unnamed: 0,id,name,type,namespace
0,18508,*\n津波,page,0
1,73131,くらかたう\nクラカタウ,page,0
2,79371,しまはらたいへんひこめいわく\n島原大変肥後迷惑,page,0
3,140998,いなむらのひ\n稲むらの火,page,0
4,243079,りすほんししん 1755\nリスボン地震 (1755年),page,0
...,...,...,...,...
14661,2375657,おおるすたあけえむ2011にほんふろやきゆう\n2011年のオールスターゲーム (日本プロ野球),page,0
14662,2601106,おおるすたあけえむ2012にほんふろやきゆう\n2012年のオールスターゲーム (日本プロ野球),page,0
14663,2647006,につほんしりいす2012\n2012年の日本シリーズ,page,0
14664,2664607,さむらいしやはんまつち2012にほんたいひようVSきゆうはたいひよう\n侍ジャパンマッチ20...,page,0


##### pickle

# end