# Try MediaWiki's RecentChanges API

## Setup

### imports

In [23]:
import pandas as pd, dateutil.parser as dp
import os, requests, datetime, time, json
from sseclient import SSEClient as EventSource

### define function ```get_rc```

In [8]:
def get_rc(rc_list:list, params:dict, url: str, sesh, overlap: int = 2) -> str:
    '''
    Fetch one batch of recent changes and report where the next request
    should start.

    Inputs:  rc_list: list to be populated with recentchanges jsons; the
                 batch from this request is appended as ONE sub-list
                 (an empty batch is appended as an empty list)
             params: dictionary of parameters for the API request
                 this fn expects at least these parameters:
                    'rcprop' : 'timestamp|ids', (more is okay)
                    'action' : 'query',
                    'rcdir'  : 'newer',
                    'format' : 'json',
                    'list'   : 'recentchanges',
             url: API url (designates which wiki)
             sesh: requests session
             overlap: how many of the newest records to step back over when
                 choosing the returned timestamp. The default of 2 returns
                 the third-latest timestamp of the batch, so consecutive
                 batches overlap slightly and callers should de-duplicate
                 (e.g. by 'rcid'). Use 0 for the very latest timestamp.
    Outputs: timestamp string formatted as '%Y-%m-%dT%H:%M:%SZ', intended
             to be used as the next 'rcstart'. When the batch holds fewer
             than overlap + 1 records the oldest timestamp in the batch is
             returned; when the batch is empty the current
             params['rcstart'] is returned unchanged.
    Raises:  ValueError if overlap is negative.
             IndexError if the batch is empty and params has no 'rcstart'.
             Whatever sesh.get / raise_for_status / json raise on HTTP or
             decoding failures.
    '''
    if overlap < 0:
        raise ValueError('overlap must be non-negative')

    raw_output = sesh.get(url=url, params=params)
    # Surface HTTP failures here instead of as a confusing JSON/KeyError below.
    raw_output.raise_for_status()
    json_data = raw_output.json()
    recent_changes = json_data['query']['recentchanges']
    rc_list.append(recent_changes)

    if not recent_changes:
        # Nothing new yet: keep polling from the same starting point.
        if 'rcstart' in params:
            return params['rcstart']
        raise IndexError('no recent changes returned and no rcstart to fall back on')

    # Sort explicitly rather than trusting the order of the response.
    timestamps = sorted(dp.isoparse(rc['timestamp']) for rc in recent_changes)
    # Step back `overlap` records from the newest, clamped to the oldest
    # record so short batches no longer raise IndexError.
    position = max(len(timestamps) - 1 - overlap, 0)
    ts = timestamps[position]
    return ts.strftime('%Y-%m-%dT%H:%M:%SZ')


## Collect 500 recent changes

### get ```rc_list```

#### initialize requests session

In [None]:
# One session object, created once and passed to every get_rc call below.
sesh = requests.Session()

#### set parameters

In [9]:
# Accumulator: get_rc appends one list of change records per request.
rc_list=[]
# API endpoint; the host selects which wiki is queried (here en.wikipedia.org).
url = 'https://en.wikipedia.org/w/api.php'
params = {
    # --- filters ---
    # NOTE: 'rcstart' is only the initial value; the populate loop overwrites
    # it with the timestamp returned by each get_rc call.
    'rcstart'     : '2021-10-20T00:30:01Z',
    'rcnamespace' : '0',     # presumably the main/article namespace -- confirm in the API docs
    'rcshow'      : '!bot',  # presumably excludes bot edits -- confirm in the API docs
    'rclimit'     : '50',    # records requested per call
    
    # --- fields requested for each change record ---
    'rcprop': 'user|userid|timestamp|title|ids|sizes',
    
    # --- fixed parameters that get_rc's docstring lists as required ---
    'action'      : 'query',
    'rcdir'       : 'newer',
    'format'      : 'json',
    'list'        : 'recentchanges',
}

# Keys that appear in the returned records with these parameters:
['timestamp', 'type', 'title', 'anon', 'rcid', 'ns', 'revid', 'pageid', 'user', 'userid', 'oldlen', 'old_revid', 'newlen'];

#### populate rc_list

In [14]:
# Page forward through the feed: each call returns the timestamp that the
# next request should start from, which is written back into params.
for request_no in range(10):
    params['rcstart'] = latest_timestamp = get_rc(rc_list, params, url, sesh)
    print(f'{request_no} {latest_timestamp}')
    # Short pause between consecutive requests.
    time.sleep(0.5)

0 2022-01-29T11:08:54Z
1 2022-01-29T11:09:35Z
2 2022-01-29T11:10:19Z
3 2022-01-29T11:10:59Z
4 2022-01-29T11:11:42Z
5 2022-01-29T11:12:29Z
6 2022-01-29T11:13:16Z
7 2022-01-29T11:13:58Z
8 2022-01-29T11:14:22Z
9 2022-01-29T11:14:36Z


### peek at ```rc_list```

#### check dimensions of nested list

In [29]:
len(rc_list), len(rc_list[0]), len(rc_list[0][0])

(10, 50, 13)

#### look at one JSON element of nested list

In [30]:
rc_list[0][0]

{'type': 'edit',
 'ns': 0,
 'title': 'Graham Smorgon',
 'pageid': 4862734,
 'revid': 1068608616,
 'old_revid': 1068608597,
 'rcid': 1468648083,
 'user': '138.88.70.179',
 'userid': 0,
 'anon': '',
 'oldlen': 3638,
 'newlen': 3841,
 'timestamp': '2022-01-29T11:08:15Z'}

#### timestamp of latest recentchanges record

In [17]:
latest_timestamp

'2022-01-29T11:14:36Z'

### flatten rc_list to get unique_jsons

#### flatten

In [31]:
# Flatten the per-request batches into one list of change records.
all_jsons = []
for batch in rc_list:
    all_jsons.extend(batch)
# Map each rcid to the position of its LAST occurrence; the dict keeps keys
# in first-seen order, so duplicates collapse to one entry per rcid.
all_rcids = {record['rcid']: position for position, record in enumerate(all_jsons)}
# Pull out exactly one record per distinct rcid.
unique_jsons = list(map(all_jsons.__getitem__, all_rcids.values()))

#### make dataframe

In [37]:
# One row per de-duplicated change record; columns come from the record keys.
df = pd.DataFrame.from_records(unique_jsons)

#### peek at jsons as dataframe

In [38]:
df

Unnamed: 0,type,ns,title,pageid,revid,old_revid,rcid,user,userid,anon,oldlen,newlen,timestamp
0,edit,0,Graham Smorgon,4862734,1068608616,1068608597,1468648083,138.88.70.179,0,,3638,3841,2022-01-29T11:08:15Z
1,edit,0,Ashleigh Barty,32296594,1068608622,1068606873,1468648089,Dddenilson,31482225,,126917,127005,2022-01-29T11:08:15Z
2,edit,0,List of concept albums,20331466,1068608625,1068596286,1468648094,Swiftiekaghorl?13,40871152,,186520,186456,2022-01-29T11:08:15Z
3,edit,0,2022 Formula 4 UAE Championship,68600642,1068608617,1068608567,1468648084,81.38.222.220,0,,30039,30040,2022-01-29T11:08:16Z
4,edit,0,Domestic roof construction,4635477,1068608620,1060518900,1468648090,Prestigeroofingguttering,43279559,,13720,23173,2022-01-29T11:08:17Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,log,0,Index of Sri Lanka-related articles (W-X),56168730,0,0,1468649482,Premeditated Chaos,31530,,0,0,2022-01-29T11:14:36Z
465,log,0,Index of Sri Lanka-related articles (Q-R),56168715,0,0,1468649485,Premeditated Chaos,31530,,0,0,2022-01-29T11:14:36Z
466,log,0,Index of Sri Lanka-related articles (U-V),56168707,0,0,1468649486,Premeditated Chaos,31530,,0,0,2022-01-29T11:14:36Z
467,edit,0,Japan eNational football team,69608853,1068609240,1068608739,1468649494,BRICK93,29032402,,10328,10318,2022-01-29T11:14:36Z


In [14]:
# df.to_csv('../data/interim/2021-10-20T00:30:01Z_2021-10-20T01:39:40Z.csv')

## Build SQL schema for import into database

### This is for PostgreSQL; <mark>(TODO: update for MySQL)</mark>

#### make "create table" query

#### connect to database

#### create table and import data

#### update table to have date 

#### SQLAlchemy

#### More detailed requests from Wikipedia