# Data: API➔Database

Source file headings

In [6]:
#!/usr/bin/python3
"""
    get_recent_changes.py

    MediaWiki API Demos
    Demo of `RecentChanges` module: Get up to 50 recent non-bot changes
    from a fixed time window, with sizes and flags

    MIT License
""";

In [8]:
import requests
import pandas as pd
import os
import datetime
import json
from sseclient import SSEClient as EventSource

# Run from the notebooks directory so that relative paths used later
# (e.g. '../data/external/wikipedias.csv') resolve.
os.chdir('/home/bhrdwj/git/predwikt/notebooks/')

In [100]:
# Key names expected on a `recentchanges` entry; they are merged into the set
# of keys actually observed in the API response further down.
# 'tags' was previously misspelled 'tagstags', which is not a key the API
# returns.
fields = [
    'logid', 'logaction', 'minor', 'userid', 'user',
    'logparams', 'rcid', 'pageid', 'old_revid', 'new',
    'newlen', 'comment', 'ns', 'timestamp', 'tags',
    'type', 'title', 'revid', 'logtype', 'oldlen',
    'anon',
]
# print(set(fields))

In [87]:
import requests

# HTTP session object; the API request further down is sent through it.
S = requests.Session()

# Endpoint the query is sent to.
URL = 'https://en.wikipedia.org/w/api.php'

# Query arguments for `list=recentchanges`: a fixed ten-minute window
# (rcstart..rcend), bot edits excluded, at most 50 entries, JSON output.
PARAMS = dict(
    rcstart='2021-11-01T00:10:01Z',
    rcend='2021-11-01T00:20:01Z',
    rcdir='newer',
    format='json',
    rcprop='user|userid|comment|flags|timestamp|title|ids|sizes|redirect|tags|loginfo',
    list='recentchanges',
    action='query',
    rclimit='50',
    rcshow='!bot',
)

In [49]:
# Load the list of wikipedias; it is used to filter wikipedia edits out from
# other wiki projects. `code` is each abbreviation with 'wiki' appended.
wkps = pd.read_csv('../data/external/wikipedias.csv')
wkps = wkps.assign(code=wkps['abbrev'] + 'wiki')
wkps.columns

Index(['lang_EN', 'language', 'abbrev', 'code'], dtype='object')

In [88]:
# Initialize empty list
l = []

# Fetch the recent changes described by PARAMS. The timeout keeps a stalled
# connection from hanging the notebook, and raise_for_status() reports an
# HTTP error here rather than as a JSON-decoding or KeyError failure on the
# lines that follow.
raw_output = S.get(url=URL, params=PARAMS, timeout=30)
raw_output.raise_for_status()
json_data = raw_output.json()
recent_changes = json_data['query']['recentchanges']

In [89]:
s = set()
# Report every field whose *value* is a sized container rather than a scalar
# or a string. Iterating a change dict directly yields only its keys, which
# are always strings, so the values have to be inspected via .items().
for change in recent_changes:
    for key, value in change.items():
        if hasattr(value, "__len__") and not isinstance(value, str):
            print(key, value)

In [90]:
# Union of the keys that appear on any of the fetched changes.
s = {key for change in recent_changes for key in change}
s

{'anon',
 'comment',
 'logaction',
 'logid',
 'logparams',
 'logtype',
 'minor',
 'new',
 'newlen',
 'ns',
 'old_revid',
 'oldlen',
 'pageid',
 'rcid',
 'revid',
 'tags',
 'timestamp',
 'title',
 'type',
 'user',
 'userid'}

In [94]:
# Fold the expected field names into the observed keys, then show the
# combined set in alphabetical order.
s |= set(fields)
print(sorted(s))

['anon', 'comment', 'logaction', 'logid', 'logparams', 'logtype', 'minor', 'new', 'newlen', 'ns', 'old_revid', 'oldlen', 'pageid', 'rcid', 'revid', 'tags', 'timestamp', 'title', 'type', 'user', 'userid']


### Get the schema
- such that:
    - all data from this sample "l"
    - will fit into the database

In [None]:
# Define Schema
# SQL script that drops any existing `data_raw` table and recreates it with a
# SERIAL `row_index` column plus seven data columns.
# NOTE(review): these column names do not correspond to the recentchanges
# keys collected above — confirm this schema is the one intended for this data.
sql_create_table = """
DROP TABLE IF EXISTS data_raw;

CREATE TABLE data_raw(
    row_index SERIAL,
    time_string char varying(25),
    unix_time bigint,
    instance char varying(35),
    product char varying(5),
    username char varying(35),
    event char varying(100),
    attributes text
);
"""

In [None]:
# connect to database
# Credentials are passed as keyword arguments so that values containing
# spaces or quotes cannot corrupt a hand-assembled DSN string.
try:
    conn = psycopg2.connect(host=dbhost, dbname=dbname, user=dbuname, password=dbpassword)
    cur = conn.cursor()
except psycopg2.Error:
    # Only database errors are caught, and the error is re-raised after the
    # hint: continuing would leave `conn`/`cur` undefined for the code that
    # uses them next.
    print('Database connection error - check creds')
    raise

In [None]:
%%time
# Create table and import data
cur.execute(sql_create_table)
sql_import = "COPY data_raw(time_string,unix_time,instance,product,username,event,attributes) FROM STDIN DELIMITER E'\t';"
# The context manager closes the TSV file once COPY has consumed it; the
# handle was previously left open.
with open('jira_clean.tsv', "r", encoding="utf8") as tsv_file:
    cur.copy_expert(sql_import, tsv_file)
conn.commit()

In [None]:
# Build `data_prep` from `data_raw`, adding `time_parsed`: `time_string`
# parsed with to_timestamp and cast to a timestamp without time zone.
# The `product` column of `data_raw` is not carried over.
# NOTE(review): the format 'YYYY-MM-DD HH24:MI,MS' has no seconds field —
# confirm it matches the actual `time_string` values.
sql_calc_table = """
DROP TABLE IF EXISTS data_prep;

CREATE TABLE data_prep as (
    select 
        row_index
        , to_timestamp(time_string,'YYYY-MM-DD HH24:MI,MS')::timestamp without time zone as time_parsed
        , unix_time
        , instance
        , username
        , event
        , attributes
    from
        data_raw
);
"""
# Run the script and commit so the new table is persisted.
cur.execute(sql_calc_table)
conn.commit()

In [None]:
# Finally, let's instantiate a SQL alchemy engine, so we can pass results sets into pandas and evaluate them here
# The f-string yields the same URL as the previous concatenation and also
# accepts a non-string `dbport`.
connection_str = f'postgresql+psycopg2://{dbuname}:{dbpassword}@{dbhost}:{dbport}/{dbname}'
try:
    engine1 = sqlalchemy.create_engine(connection_str)
    conn1 = engine1.connect()
except sqlalchemy.exc.SQLAlchemyError:
    # Re-raise after the hint: the check below needs a working engine, and
    # only SQLAlchemy errors are treated as connection problems.
    print('Database connection error - check creds')
    raise
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is available in both.
sqlalchemy.inspect(engine1).get_table_names()  # Confirm connection and tables are present as expected

In [70]:
# Parse the ISO-format start date, render it again with a trailing 'Z',
# and show both forms (one per line).
start_date_datetime = datetime.datetime.fromisoformat('2011-11-01T00:00:01')
start_date_string = f'{start_date_datetime:%Y-%m-%dT%H:%M:%SZ}'
print(start_date_datetime, start_date_string, sep='\n')

2011-11-01 00:00:01
2011-11-01T00:00:01Z
