# Data: explore: API➔csv

### Headings and Library Imports

In [1]:
#!/usr/bin/python3
"""
    get_recent_changes.py

    MediaWiki API Demos
    Demo of `RecentChanges` module: Get the three most recent changes with
    sizes and flags

    MIT License
""";

In [2]:
import requests
import pandas as pd
import os
import datetime
import dateutil.parser as dp
import json
import time
from sseclient import SSEClient as EventSource

os.chdir('/home/bhrdwj/git/predwikt/notebooks/')

### Get Data

In [3]:
import requests
# One shared requests session; it is handed to get_rc() for every API call below.
sesh = requests.Session()

In [4]:
def get_rc(rc_list: list, params: dict, url: str, sesh, overlap: int = 3) -> str:
    '''
    Fetch one batch of recent changes and return the timestamp to resume from.

    Inputs:  rc_list: list to be populated with recentchanges jsons
                 (one list of change dicts is appended per call)
             params: dictionary of parameters for the API request
                 this fn expects at least these parameters:
                    'rcprop' : 'timestamp|ids', (more is okay)
                    'action' : 'query',
                    'rcdir'  : 'newer',
                    'format' : 'json',
                    'list'   : 'recentchanges',
             url: API url (designates which wiki)
             sesh: requests session
             overlap: which timestamp of the batch to resume from, counted
                 from the newest one (1 = newest). The default of 3 keeps the
                 previous behaviour of returning the third-newest timestamp,
                 presumably so that consecutive batches overlap a little;
                 callers are expected to drop repeated changes (e.g. by rcid).
    Outputs: resume timestamp formatted as '%Y-%m-%dT%H:%M:%SZ'.
             If the batch holds fewer than `overlap` changes, its oldest
             timestamp is used. If the batch is empty, params['rcstart'] is
             returned unchanged so the caller can simply ask again.
    Raises:  ValueError if overlap < 1, or if the batch is empty and params
                 has no 'rcstart' to fall back on
             RuntimeError if the API answers with an error payload
    '''
    if overlap < 1:
        raise ValueError('overlap must be at least 1')

    raw_output = sesh.get(url=url, params=params)
    json_data = raw_output.json()
    # Report an API-level failure explicitly instead of an opaque KeyError on 'query'.
    if 'error' in json_data:
        raise RuntimeError(f"API request failed: {json_data['error']}")

    recent_changes = json_data['query']['recentchanges']
    rc_list.append(recent_changes)

    # Nothing new came back: keep the caller's current position.
    if not recent_changes:
        if 'rcstart' not in params:
            raise ValueError('empty batch and no rcstart in params to resume from')
        return params['rcstart']

    timestamps = sorted(dp.isoparse(rc['timestamp']) for rc in recent_changes)
    # Clamp so that short batches fall back to their oldest timestamp
    # instead of raising IndexError.
    ts = timestamps[-min(overlap, len(timestamps))]
    return ts.strftime('%Y-%m-%dT%H:%M:%SZ')


In [5]:
# Fetched batches accumulate here; get_rc() appends one list per call.
rc_list = list()

# API endpoint of the wiki being queried.
url = 'https://en.wikipedia.org/w/api.php'

# Query-string parameters sent with every request.
params = dict(
    # starting point and filters for this sample
    rcstart='2021-10-20T00:30:01Z',
    rcnamespace='0',
    rcshow='!bot',
    rclimit='50',
    # properties requested for each change
    rcprop='user|userid|timestamp|title|ids|sizes',
    # settings that the get_rc() docstring lists as required
    action='query',
    rcdir='newer',
    format='json',
    list='recentchanges',
)

# Dictionary keys that output from these parameters:
['timestamp', 'type', 'title', 'anon', 'rcid', 'ns', 'revid', 'pageid', 'user', 'userid', 'oldlen', 'old_revid', 'newlen'];

In [6]:
# Page forward through the feed: the timestamp returned for each batch
# becomes the `rcstart` of the next request.
for batch_no in range(100):
    latest_timestamp = get_rc(rc_list, params, url, sesh)
    params.update(rcstart=latest_timestamp)
    print(batch_no, latest_timestamp)
    # brief pause between consecutive requests
    time.sleep(0.5)

0 2021-10-20T00:30:39Z
1 2021-10-20T00:31:23Z
2 2021-10-20T00:32:01Z
3 2021-10-20T00:32:38Z
4 2021-10-20T00:33:17Z
5 2021-10-20T00:33:54Z
6 2021-10-20T00:34:31Z
7 2021-10-20T00:35:16Z
8 2021-10-20T00:35:55Z
9 2021-10-20T00:36:34Z
10 2021-10-20T00:37:15Z
11 2021-10-20T00:37:58Z
12 2021-10-20T00:38:39Z
13 2021-10-20T00:39:21Z
14 2021-10-20T00:40:00Z
15 2021-10-20T00:40:37Z
16 2021-10-20T00:41:13Z
17 2021-10-20T00:41:58Z
18 2021-10-20T00:42:46Z
19 2021-10-20T00:43:28Z
20 2021-10-20T00:44:20Z
21 2021-10-20T00:44:58Z
22 2021-10-20T00:45:33Z
23 2021-10-20T00:46:15Z
24 2021-10-20T00:46:57Z
25 2021-10-20T00:47:37Z
26 2021-10-20T00:48:15Z
27 2021-10-20T00:48:49Z
28 2021-10-20T00:49:33Z
29 2021-10-20T00:50:21Z
30 2021-10-20T00:51:00Z
31 2021-10-20T00:51:38Z
32 2021-10-20T00:52:24Z
33 2021-10-20T00:53:05Z
34 2021-10-20T00:53:50Z
35 2021-10-20T00:54:36Z
36 2021-10-20T00:55:15Z
37 2021-10-20T00:56:08Z
38 2021-10-20T00:56:56Z
39 2021-10-20T00:57:32Z
40 2021-10-20T00:58:14Z
41 2021-10-20T00:58:51Z
42

In [7]:
# flatten the jsons: merge every fetched batch into one list of change records
all_jsons = []
for batch in rc_list:
    all_jsons.extend(batch)

# remove jsons with duplicate rcid's: remember the position of the last record
# seen for each rcid, then keep exactly those records
all_rcids = {}
for position, change in enumerate(all_jsons):
    all_rcids[change['rcid']] = position
unique_jsons = [all_jsons[position] for position in all_rcids.values()]

In [8]:
# Number of change records left after de-duplicating by rcid.
len(unique_jsons)

4639

In [9]:
# Timestamp returned by the final get_rc() call of the fetch loop.
latest_timestamp

'2021-10-20T01:39:40Z'

In [None]:
# Show every key that occurs in at least one of the unique change records.
print(list({key for change in unique_jsons for key in change}))
# ['old_revid', 'timestamp', 'ns', 'oldlen', 'rcid', 'anon', 'revid', 'newlen', 'pageid', 'user', 'title', 'userid', 'type']

In [13]:
# Build a DataFrame from the de-duplicated change records.
df = pd.DataFrame.from_records(unique_jsons);

In [14]:
# Save the sample. The file name is hard-coded as <first rcstart>_<last returned timestamp>.
# NOTE(review): the name contains ':' characters, which some file systems reject — confirm
# this is acceptable wherever the notebook is run.
df.to_csv('../data/interim/2021-10-20T00:30:01Z_2021-10-20T01:39:40Z.csv')

### SQL Goal: Get the schema

- The schema must be such that:
    - all data from this sample "l"
    - will fit into the database

### Geoff's notebook with SQL in it

In [None]:
# Define Schema
# SQL that (re)creates the raw staging table `data_raw`: any existing table of
# that name is dropped first, so running it again starts from an empty table.
# row_index is declared SERIAL and is not among the columns named in the COPY
# statement used for the import, so it is not read from the input file.
sql_create_table = """
DROP TABLE IF EXISTS data_raw;

CREATE TABLE data_raw(
    row_index SERIAL,
    time_string char varying(25),
    unix_time bigint,
    instance char varying(35),
    product char varying(5),
    username char varying(35),
    event char varying(100),
    attributes text
);
"""

In [None]:
# connect to database
# Credentials are passed as keyword arguments rather than concatenated into a
# DSN string, so values containing spaces or quotes cannot break the DSN.
try:
    conn = psycopg2.connect(host=dbhost, dbname=dbname, user=dbuname, password=dbpassword)
    cur = conn.cursor()
except psycopg2.Error as err:
    # Only database errors are reported here; anything else (e.g. a missing
    # credential variable) is left to surface with its own traceback.
    print(f'Database connection error - check creds ({err})')

In [None]:
%%time
# Create table and import data
cur.execute(sql_create_table)
sql_import = "COPY data_raw(time_string,unix_time,instance,product,username,event,attributes) FROM STDIN DELIMITER E'\t';"
# Open the input inside a `with` block so the file handle is closed even if
# the COPY fails part-way through.
with open('jira_clean.tsv', "r", encoding="utf8") as tsv_file:
    cur.copy_expert(sql_import, tsv_file)
conn.commit()

In [None]:
# update table to have date
# SQL that (re)creates `data_prep` from `data_raw`: time_string is parsed into
# the timestamp column time_parsed, the remaining selected columns are copied
# as they are, and `product` is not carried over.
# NOTE(review): the to_timestamp format has no seconds field (SS) — confirm it
# matches how time_string is actually written in the input file.
sql_calc_table = """
DROP TABLE IF EXISTS data_prep;

CREATE TABLE data_prep as (
    select 
        row_index
        , to_timestamp(time_string,'YYYY-MM-DD HH24:MI,MS')::timestamp without time zone as time_parsed
        , unix_time
        , instance
        , username
        , event
        , attributes
    from
        data_raw
);
"""
# Run the statement and make the new table permanent.
cur.execute(sql_calc_table)
conn.commit()

In [None]:
# Finally, let's instantiate a SQL alchemy engine, so we can pass results sets into pandas and evaluate them here
connection_str = 'postgresql+psycopg2://'+dbuname+':'+dbpassword+'@'+dbhost+':'+dbport+'/'+dbname
try:
    engine1 = sqlalchemy.create_engine(connection_str)
    conn1 = engine1.connect()
except sqlalchemy.exc.SQLAlchemyError as err:
    # Only SQLAlchemy/database errors are reported here; other failures keep
    # their own traceback instead of being mislabelled as a credentials issue.
    print(f'Database connection error - check creds ({err})')
# Engine.table_names() is deprecated (removed in SQLAlchemy 2.0); the inspector
# returns the same list of table names.
sqlalchemy.inspect(engine1).get_table_names() # Confirm connection and tables are present as expect

### More detailed requests from wikipedia

In [None]:
import requests
# Separate requests session for the more detailed queries; it is not used in
# any of the cells visible here.
requests_session = requests.Session()
# API endpoint of the wiki being queried.
url = 'https://en.wikipedia.org/w/api.php'

# Same kind of request as the first sample, but asking for more properties
# per change (comment, flags, redirect, tags, loginfo in addition to the rest).
params = dict(
    rcstart='2021-10-20T00:30:01Z',
    rcdir='newer',
    rcnamespace='0',
    format='json',
    rcprop='user|userid|comment|flags|timestamp|title|ids|sizes|redirect|tags|loginfo',
    list='recentchanges',
    action='query',
    rclimit='50',
    rcshow='!bot',
)

In [None]:
# Field names for the more detailed request, one per line in alphabetical order.
fields = [
    'anon',
    'comment',
    'logaction',
    'logid',
    'logparams',
    'logtype',
    'minor',
    'new',
    'newlen',
    'ns',
    'old_revid',
    'oldlen',
    'pageid',
    'rcid',
    'redirect',
    'revid',
    'tags',
    'tagstags',  # NOTE(review): possibly a typo for 'tags' — kept as is, confirm
    'timestamp',
    'title',
    'type',
    'user',
    'userid',
]
len(fields);

In [None]:
# List of wikipedias is used to filter wikipedia edits out from other wiki projects
wkps = pd.read_csv('../data/external/wikipedias.csv')
# Add a `code` column: the `abbrev` value followed by 'wiki'.
wkps['code'] = wkps['abbrev'] + 'wiki'
wkps.columns

###

### Checking out what's in the json... (method deeply flawed lol)

In [None]:
# Print every element of every item in `l` that has a length but is not a string.
for item in l:
    for element in item:
        if isinstance(element, str) or not hasattr(element, "__len__"):
            continue
        print(element)

In [None]:
# Union of the keys found across all records in `recent_changes`.
# NOTE(review): in the visible cells `recent_changes` only exists as a local
# inside get_rc(); confirm it is defined at notebook level before running this.
set_keys = {key for change in recent_changes for key in change}
set_keys;

In [None]:
# Merge in the expected field names, then show the combined key set and its size.
set_keys |= set(fields)
print(sorted(set_keys))
len(set_keys)