# Data: API➔Database

Source file headings

In [2]:
#!/usr/bin/python3
"""
    get_recent_changes.py

    MediaWiki API Demos
    Demo of `RecentChanges` module: Get up to 50 recent changes with
    titles, ids, sizes, flags and users

    MIT License
""";

In [46]:
import requests
import pandas as pd
import os
import datetime
import json
from sseclient import SSEClient as EventSource

In [None]:
import requests

# Session object for HTTP requests against the API.
S = requests.Session()

# Action API endpoint of the English Wikipedia.
URL = "https://en.wikipedia.org/w/api.php"

# Query-string parameters for a `list=recentchanges` query.
# All values are strings; the key order matches the original literal.
PARAMS = dict(
    rcstart='2021-11-01T00:00:01Z',
    format="json",
    rcprop="title|ids|sizes|flags|user",
    list="recentchanges",
    action="query",
    rclimit="50",
)

In [71]:
# Load the list of Wikipedia language editions and derive each edition's wiki
# code (abbreviation + 'wiki'). The codes are used to tell Wikipedia edits
# apart from edits on other wiki projects.
wkps = pd.read_csv('../data/wikipedias.csv')
wkps['code'] = wkps['abbrev'] + 'wiki'
wkps.columns

Index(['lang_EN', 'language', 'abbrev', 'code'], dtype='object')

In [86]:
# Timebox the capture: stop consuming the stream about 20 seconds from now.
# The deadline is only checked when an event arrives, so a quiet stream can
# overrun it slightly.
now = datetime.datetime.now()
stop = now + datetime.timedelta(seconds=20)

# Captured change events (human edits on Wikipedias only); used by later cells.
l = []

# Build the lookup set once so each event costs an O(1) membership test
# instead of a scan over the pandas column.
wikipedia_codes = set(wkps.code)

url = 'https://stream.wikimedia.org/v2/stream/recentchange'
for event in EventSource(url):
    if datetime.datetime.now() > stop:
        break
    if event.event != 'message':
        continue
    try:
        change = json.loads(event.data)
    except ValueError:
        # Payload is not valid JSON: nothing to record, move on to the next event.
        continue
    # Keep only non-bot edits made on one of the Wikipedias.
    if change['bot'] or change['wiki'] not in wikipedia_codes:
        continue
    l.append(change)
    try:
        chars_changed = change['length']['new'] - change['length']['old']
    except (KeyError, TypeError):
        # KeyError: the event carries no 'length' (e.g. the 'categorize' sample).
        # TypeError: a length value is None.
        # NOTE(review): the recentchange schema appears to allow null lengths — confirm.
        chars_changed = 'unknown # of'
    print(f"on {change['wiki']}, {change['user']} edited {change['title']}: "
          f"{chars_changed} chars.")
                

on frwiki, LeFit edited Catégorie:Article contenant un appel à traduction en allemand: unknown # of chars.
on bgwiki, Rartat edited Иван Панфилов: 567 chars.
on viwiki, Nguyenquanghai19 edited Danh sách trò chơi Game Boy Color: -73 chars.
on idwiki, Radit Jibril Irawan edited Kategori:Semua artikel rintisan November 2021: unknown # of chars.
on enwiki, Gerda Arendt edited Template:Did you know nominations/Reginald Green (economist): 790 chars.
on trwiki, Blue800 edited Aziz (dizi): 0 chars.
on frwiki, Martin-78 edited Discussion utilisateur:84.226.190.202: unknown # of chars.
on jawiki, Iceandsnow7 edited Netflixオリジナル映画の一覧: -294 chars.
on idwiki, Radit Jibril Irawan edited Kategori:Semua artikel rintisan Oktober 2021: unknown # of chars.
on mywiki, Dr Lotus Black edited မဟာဗောဓိညောင်ပင်: 113 chars.
on eswiki, Copydays edited Conflicto de Baluchistán: 134 chars.
on hrwiki, Bibliotekacs edited Suradnik:Bibliotekacs/OER: 61 chars.
on enwiki, Gardenkur edited Abhayapuri South (Vidhan Sabha

### Get the schema
- Derive a table schema such that:
    - all data from this sample (`l`)
    - will fit into the database

In [93]:
# Display the first captured event to see which fields the database schema has to hold.
l[0]

{'$schema': '/mediawiki/recentchange/1.0.0',
 'meta': {'uri': 'https://fr.wikipedia.org/wiki/Cat%C3%A9gorie:Article_contenant_un_appel_%C3%A0_traduction_en_allemand',
  'request_id': '1b6212ee-e650-41fe-a014-08cf840d565a',
  'id': '50028f1f-96ba-4af8-8376-ab8a6f059c90',
  'dt': '2021-11-12T08:13:52Z',
  'domain': 'fr.wikipedia.org',
  'stream': 'mediawiki.recentchange',
  'topic': 'eqiad.mediawiki.recentchange',
  'partition': 0,
  'offset': 3428064477},
 'id': 452932834,
 'type': 'categorize',
 'namespace': 14,
 'title': 'Catégorie:Article contenant un appel à traduction en allemand',
 'comment': '[[:Orsten Groom]] ajoutée à la catégorie',
 'timestamp': 1636704832,
 'user': 'LeFit',
 'bot': False,
 'server_url': 'https://fr.wikipedia.org',
 'server_name': 'fr.wikipedia.org',
 'server_script_path': '/w',
 'wiki': 'frwiki',
 'parsedcomment': '<a href="/wiki/Orsten_Groom" title="Orsten Groom">Orsten Groom</a> ajoutée à la catégorie'}

In [None]:
# Define Schema
# DDL for the raw staging table `data_raw`. Any existing table of that name is
# dropped first, so re-running the statement starts from an empty table.
# NOTE(review): these columns (instance, product, event, attributes) do not
# correspond to the keys of the recentchange sample shown by `l[0]` (e.g. wiki,
# title, user, timestamp) — presumably carried over from another project;
# confirm and adapt before loading the wiki data.
sql_create_table = """
DROP TABLE IF EXISTS data_raw;

CREATE TABLE data_raw(
    row_index SERIAL,
    time_string char varying(25),
    unix_time bigint,
    instance char varying(35),
    product char varying(5),
    username char varying(35),
    event char varying(100),
    attributes text
);
"""

In [None]:
# Connect to the database and open a cursor for the SQL statements below.
# NOTE(review): psycopg2 and the db* settings (dbhost, dbname, dbuname,
# dbpassword) are not defined in the visible cells — presumably set up
# elsewhere; confirm before running.
try:
    # Keyword arguments instead of a hand-concatenated DSN string, so values
    # containing spaces or quotes cannot corrupt the connection string.
    conn = psycopg2.connect(host=dbhost, dbname=dbname, user=dbuname, password=dbpassword)
    cur = conn.cursor()
except psycopg2.Error:
    print('Database connection error - check creds')
    # Re-raise: carrying on would leave `conn`/`cur` undefined (or stale from
    # an earlier run of this cell) and only fail later with a less helpful error.
    raise

In [None]:
%%time
# Create the table, bulk-load the tab-separated file into it with COPY, then
# commit both steps together.
# NOTE(review): 'jira_clean.tsv' is not produced by any visible cell and does
# not look related to the captured wiki events — confirm the intended input file.
cur.execute(sql_create_table)
sql_import = "COPY data_raw(time_string,unix_time,instance,product,username,event,attributes) FROM STDIN DELIMITER E'\t';"
# Context manager so the file handle is closed even if the COPY fails.
with open('jira_clean.tsv', "r", encoding="utf8") as tsv_file:
    cur.copy_expert(sql_import, tsv_file)
conn.commit()

In [None]:
# Build `data_prep` from `data_raw`, adding a parsed timestamp column.
# The statement drops any existing `data_prep`, then recreates it with
# `time_string` converted by to_timestamp() into `time_parsed`
# (timestamp without time zone). The `product` column of `data_raw` is not
# selected, so it is absent from `data_prep`.
# NOTE(review): the format 'YYYY-MM-DD HH24:MI,MS' has no seconds field —
# possibly a missing ':SS'; verify against the actual time_string values.
sql_calc_table = """
DROP TABLE IF EXISTS data_prep;

CREATE TABLE data_prep as (
    select 
        row_index
        , to_timestamp(time_string,'YYYY-MM-DD HH24:MI,MS')::timestamp without time zone as time_parsed
        , unix_time
        , instance
        , username
        , event
        , attributes
    from
        data_raw
);
"""
# Run the rebuild and make it permanent.
cur.execute(sql_calc_table)
conn.commit()

In [None]:
# Finally, instantiate a SQLAlchemy engine so result sets can be passed into
# pandas and evaluated here.
# NOTE(review): sqlalchemy and the db* settings are not defined in the visible
# cells — presumably set up elsewhere; confirm before running.
from urllib.parse import quote_plus

# URL-escape the credentials: characters such as '@', ':' or '/' in a user
# name or password would otherwise be parsed as URL structure.
connection_str = (
    f'postgresql+psycopg2://{quote_plus(dbuname)}:{quote_plus(dbpassword)}'
    f'@{dbhost}:{dbport}/{dbname}'
)
try:
    engine1 = sqlalchemy.create_engine(connection_str)
    conn1 = engine1.connect()
except sqlalchemy.exc.SQLAlchemyError:
    print('Database connection error - check creds')
    # Re-raise: without an engine the table listing below cannot succeed.
    raise
# Confirm the connection works and the expected tables are present.
# Engine.table_names() is deprecated (removed in SQLAlchemy 2.0); the inspector
# provides the same list.
sqlalchemy.inspect(engine1).get_table_names()

In [70]:
# Parse the ISO-8601 start date into a (naive) datetime, then render it again
# in the 'Z'-suffixed timestamp form; both values are printed, one per line.
start_date_datetime = datetime.datetime.fromisoformat('2011-11-01T00:00:01')
start_date_string = f'{start_date_datetime:%Y-%m-%dT%H:%M:%SZ}'
print(start_date_datetime, start_date_string, sep='\n')

2011-11-01 00:00:01
2011-11-01T00:00:01Z
