In [65]:
import os
from pathlib import Path
import pandas as pd
import psycopg2
import sqlalchemy

In [30]:
# Work from the project's notebooks/ directory so that the relative
# '../data/...' path used when loading the CSV resolves correctly.
home_path = str(Path.home())
os.chdir(f'{home_path}/git/predwikt/notebooks')

In [58]:
# Load only the two ID columns needed downstream.
# `usecols` skips the 'Unnamed: 0' column (a duplicate of the index), so no
# separate drop is needed -- a drop would raise KeyError if the CSV were ever
# written without that column.
df = pd.read_csv(
    '../data/interim/2021-10-20T00:30:01Z_2021-10-20T01:39:40Z.csv',
    usecols=['rcid', 'revid'],
)
# usecols does not control column order, so pin it explicitly.
df = df[['rcid', 'revid']]

In [59]:
# Preview the first three rows to sanity-check the load.
df.head(3)

Unnamed: 0,rcid,revid
0,1434690429,1050804743
1,1434690426,1050804745
2,1434690423,1050804744


### Connect to postgresql

In [32]:
# Connection settings for the PostgreSQL instance.
# Every value can be overridden through an environment variable so that real
# credentials never need to be committed with the notebook; the fallbacks are
# the local-development values this notebook was written against.
dbhost = os.environ.get('WIKI_DB_HOST', 'localhost')
dbname = os.environ.get('WIKI_DB_NAME', 'wiki_01')
dbuname = os.environ.get('WIKI_DB_USER', 'postgres')
dbpassword = os.environ.get('WIKI_DB_PASSWORD', 'foubarre')
# Kept as a string: it is concatenated into connection strings further down.
dbport = os.environ.get('WIKI_DB_PORT', '5432')

In [81]:
# Open the psycopg2 connection and cursor used by the raw-SQL cells.
# Keyword arguments replace the hand-concatenated DSN so that values
# containing spaces or quotes (e.g. a password) are passed through safely.
try:
    conn = psycopg2.connect(
        host=dbhost,
        dbname=dbname,
        user=dbuname,
        password=dbpassword,
        port=dbport,
    )
    cur = conn.cursor()
except psycopg2.OperationalError:
    # Print the hint, then re-raise: swallowing the error would leave
    # `conn`/`cur` undefined (or stale) and fail later in a confusing way.
    print('Database connection error - check creds')
    raise

In [None]:
# conn.close()

### Next Step

In [None]:
# Build the SQLAlchemy engine from the connection settings defined earlier
# rather than repeating the credentials in a second hard-coded URL, so the
# psycopg2 connection and the engine always point at the same database.
engine = sqlalchemy.create_engine(
    f'postgresql://{dbuname}:{dbpassword}@{dbhost}:{dbport}/{dbname}'
)

In [85]:
# Write the DataFrame to PostgreSQL through the SQLAlchemy engine.
# if_exists='replace' makes the cell re-runnable: with the default ('fail')
# a second run raises ValueError because the table already exists.
df.to_sql('test_table02', engine, if_exists='replace')
# NOTE(review): this commits the separate psycopg2 connection, not the engine
# used by to_sql above -- kept so any pending psycopg2 work is still committed.
conn.commit()

In [88]:
# Run a raw SQL query through the engine; `x` holds the returned result object.
# NOTE(review): this reads test_table01 while the preceding cell writes
# test_table02 -- confirm which table is intended.
x = engine.execute('''select * from test_table01''')

In [93]:
# Fetch all rows from the result; the trailing ';' suppresses the cell output.
x.all();

In [94]:
# Close the result object now that its rows have been read.
x.close()

In [96]:
# Close the psycopg2 cursor first, then the connection it belongs to.
cur.close()
conn.close()

In [79]:
# Query the table through the psycopg2 cursor.
# NOTE(review): in top-to-bottom order this cell runs after `cur`/`conn` were
# closed in the previous cell -- confirm the intended cell order.
cur.execute('''SELECT * FROM test_table01;''')

In [99]:
# cur.fetchall()

### CREATE TABLE in postgresql:
- **Roughly** recreate the MediaWiki `recentchanges` table schema (reference linked below)
    - column types are adjusted for convenience
    - not all fields are available

https://www.mediawiki.org/wiki/Manual:Recentchanges_table

```
+---------------+---------------------+------+-----+---------+----------------+
| Field         | Type                | Null | Key | Default | Extra          |
+---------------+---------------------+------+-----+---------+----------------+
| rc_id         | int(10) unsigned    | NO   | PRI | NULL    | auto_increment |
| rc_timestamp  | binary(14)          | NO   | MUL | NULL    |                |
| rc_actor      | bigint(20) unsigned | NO   | MUL | NULL    |                |
| rc_namespace  | int(11)             | NO   | MUL | 0       |                |
| rc_title      | varbinary(255)      | NO   |     |         |                |
| rc_comment_id | bigint(20) unsigned | NO   |     | NULL    |                |
| rc_minor      | tinyint(3) unsigned | NO   |     | 0       |                |
| rc_bot        | tinyint(3) unsigned | NO   |     | 0       |                |
| rc_new        | tinyint(3) unsigned | NO   | MUL | 0       |                |
| rc_cur_id     | int(10) unsigned    | NO   | MUL | 0       |                |
| rc_this_oldid | int(10) unsigned    | NO   | MUL | 0       |                |
| rc_last_oldid | int(10) unsigned    | NO   |     | 0       |                |
| rc_type       | tinyint(3) unsigned | NO   |     | 0       |                |
| rc_source     | varbinary(16)       | NO   |     |         |                |
| rc_patrolled  | tinyint(3) unsigned | NO   |     | 0       |                |
| rc_ip         | varbinary(40)       | NO   | MUL |         |                |
| rc_old_len    | int(11)             | YES  |     | NULL    |                |
| rc_new_len    | int(11)             | YES  |     | NULL    |                |
| rc_deleted    | tinyint(3) unsigned | NO   |     | 0       |                |
| rc_logid      | int(10) unsigned    | NO   |     | 0       |                |
| rc_log_type   | varbinary(255)      | YES  |     | NULL    |                |
| rc_log_action | varbinary(255)      | YES  |     | NULL    |                |
| rc_params     | blob                | YES  |     | NULL    |                |
+---------------+---------------------+------+-----+---------+----------------+
```

In [None]:
# # Define Schema
# sql_create_table = """
# DROP TABLE IF EXISTS wiki02;

# CREATE TABLE wiki02(
#     rc_id BIGINT CONSTRAINT rc_id_constr PRIMARY KEY,
#     rev_id BIGINT
# );
# """

### Template code from Geoff Pidcock (General Assembly / AU)

In [None]:
# Define the schema for the raw import table.
# The script drops any existing data_raw table before recreating it, so
# executing it discards previously imported rows.
# row_index is a SERIAL column and is not part of the COPY column list used
# in the import cell, so PostgreSQL generates it for each imported row.
sql_create_table = """
DROP TABLE IF EXISTS data_raw;

CREATE TABLE data_raw(
    row_index SERIAL,
    time_string char varying(25),
    unix_time bigint,
    instance char varying(35),
    product char varying(5),
    username char varying(35),
    event char varying(100),
    attributes text
);
"""

In [None]:
# Connect to the database and create the cursor used by the cells below.
# Keyword arguments replace the hand-concatenated DSN so that values
# containing spaces or quotes are passed safely; the port is now passed
# explicitly, matching the other connection cell in this notebook.
try:
    conn = psycopg2.connect(
        host=dbhost,
        dbname=dbname,
        user=dbuname,
        password=dbpassword,
        port=dbport,
    )
    cur = conn.cursor()
except psycopg2.OperationalError:
    # Print the hint, then re-raise so later cells do not run against an
    # undefined or stale connection.
    print('Database connection error - check creds')
    raise

In [None]:
%%time
# Create table and import data
cur.execute(sql_create_table)
sql_import = "COPY data_raw(time_string,unix_time,instance,product,username,event,attributes) FROM STDIN DELIMITER E'\t';"
cur.copy_expert(sql_import, open('jira_clean.tsv', "r",encoding="utf8"))
conn.commit()

In [None]:
# Build data_prep from data_raw, parsing time_string into a timestamp column
# (time_parsed). The table is dropped and recreated on every run.
# NOTE(review): the `product` column of data_raw is not carried over, and the
# format mask 'YYYY-MM-DD HH24:MI,MS' has no seconds field -- confirm both
# against the actual contents of the imported data.
sql_calc_table = """
DROP TABLE IF EXISTS data_prep;

CREATE TABLE data_prep as (
    select 
        row_index
        , to_timestamp(time_string,'YYYY-MM-DD HH24:MI,MS')::timestamp without time zone as time_parsed
        , unix_time
        , instance
        , username
        , event
        , attributes
    from
        data_raw
);
"""
# Execute the rebuild and commit so the new table is visible to other connections.
cur.execute(sql_calc_table)
conn.commit()

In [None]:
# Finally, instantiate a SQLAlchemy engine so result sets can be passed into
# pandas and evaluated here.
connection_str = 'postgresql+psycopg2://'+dbuname+':'+dbpassword+'@'+dbhost+':'+dbport+'/'+dbname
try:
    engine1 = sqlalchemy.create_engine(connection_str)
    conn1 = engine1.connect()
except sqlalchemy.exc.SQLAlchemyError:
    # Print the hint, then re-raise: continuing would hit an undefined
    # `engine1` on the next line and hide the real cause.
    print('Database connection error - check creds')
    raise
# Confirm the connection works and the expected tables are present.
# The inspector API is used because Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
sqlalchemy.inspect(engine1).get_table_names()