In [11]:
# Set up the notebook so it can load the Django project code.
# From the project root, run: jupyter-lab

import os
import sys
import io
from pathlib import Path

# Put the project directory (the directory Jupyter was launched from) at the
# front of sys.path so the Django settings module can be imported.
# No trailing commas here: a trailing comma would turn the Path into a 1-tuple
# and str() of that tuple is not a usable sys.path entry.
django_project_dir = Path('.')
sys.path.insert(0, str(django_project_dir))
# setdefault keeps any DJANGO_SETTINGS_MODULE already present in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "traffic_stops.settings.dev")
# Notebook kernels run inside an event loop; this flag lets Django's
# synchronous ORM calls run there instead of raising.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import django
django.setup()

In [12]:
import psycopg2
from psycopg2 import extras
import pandas
import time
from django.conf import settings
from sqlalchemy import create_engine

In [13]:
# Database connection settings are read from the environment; each value is
# None when its variable is not set.
pghost, pgport, pguser = (
    os.environ.get(variable) for variable in ("PGHOST", "PGPORT", "PGUSER")
)
DB_URL = os.environ.get("DATABASE_URL_NC")

In [14]:
# Open the connection to the traffic_stops_nc database using the
# user/host/port values read from the environment.
conn = psycopg2.connect(
    database="traffic_stops_nc",
    user=pguser,
    host=pghost,
    port=pgport,
)

In [15]:
# Column labels for the stop data, in file order; passed to pandas.read_csv as `names`.
stop_column_names = [
    "stop_id", "agency_description", "date", "purpose", "action",
    # columns that stop_map_types reads as bool
    "driver_arrest", "passenger_arrest",
    "encounter_force", "engage_force",
    "officer_injury", "driver_injury", "passenger_injury",
    # columns that stop_map_types reads as str
    "officer_id", "stop_location", "stop_city",
]

# Column labels for the person data, in order.
person_column_names = [
    "person_id",
    "stop_id",
    "type",
    "age",
    "gender",
    "ethnicity",
    "race",
]

# dtype overrides for the person data: these three columns are kept as str.
person_map_types = dict.fromkeys(("gender", "ethnicity", "race"), str)


# dtype overrides for the stop data: seven columns are read as bool and the
# officer/location columns as str. Columns not listed get pandas' inferred dtype.
stop_map_types = {
    **dict.fromkeys(
        (
            "driver_arrest",
            "passenger_arrest",
            "encounter_force",
            "engage_force",
            "officer_injury",
            "driver_injury",
            "passenger_injury",
        ),
        bool,
    ),
    **dict.fromkeys(("officer_id", "stop_location", "stop_city"), str),
}

In [23]:
# Because chunksize is set, read_csv returns an iterator that yields DataFrames
# of up to 500,000 rows each rather than a single DataFrame.
stop_df = pandas.read_csv(
    Path("./ncdata/Stop.csv"),
    sep=",",
    header=0,  # row 0 is the file's own header; `names` replaces it
    names=stop_column_names,
    dtype=stop_map_types,
    na_filter=False,  # no NA detection: empty fields are not converted to NaN
    chunksize=500000,
)

In [40]:
# COPY statement that loads nc_stop from a CSV stream on STDIN. The pieces
# below are adjacent string literals that concatenate into one statement.
STOP_COPY = (
    "COPY nc_stop ("
    "stop_id, agency_description, date, purpose, action, "
    "driver_arrest, passenger_arrest, encounter_force, engage_force, "
    "officer_injury, driver_injury, passenger_injury, "
    "officer_id, stop_location, stop_city"
    ") FROM STDIN WITH DELIMITER ',' NULL AS '' CSV HEADER "
    "FORCE NOT NULL officer_id, stop_city, stop_location;"
)

In [44]:
# SQL script that empties the six nc_* tables listed below. Each statement
# uses RESTART IDENTITY and CASCADE. DESTRUCTIVE: executing it deletes all
# rows in those tables.
TRUNCATE = """
TRUNCATE "nc_stop" RESTART IDENTITY CASCADE; 
TRUNCATE "nc_person" RESTART IDENTITY CASCADE; 
TRUNCATE "nc_search" RESTART IDENTITY CASCADE;
TRUNCATE "nc_searchbasis" RESTART IDENTITY CASCADE;
TRUNCATE "nc_contraband" RESTART IDENTITY CASCADE;
TRUNCATE "nc_agency" RESTART IDENTITY CASCADE;
"""

In [45]:
def iter_csv(table_name, data_frame, insert_stmt, cur):
    """Bulk-load an iterable of DataFrame chunks into ``table_name`` with COPY.

    Args:
        table_name: Name of the target table, passed to ``cur.copy_from``.
        data_frame: Iterable yielding pandas DataFrames, e.g. the reader
            returned by ``pandas.read_csv(..., chunksize=...)``.
        insert_stmt: Unused; kept so existing calls keep working.
        cur: Cursor providing psycopg2's
            ``copy_from(file, table, sep=..., columns=...)``.

    After each chunk, prints the minutes elapsed since this call started.
    """
    # Time from the start of this call rather than from a notebook-level
    # variable, so the function does not depend on another cell having run.
    started = time.time()
    for chunk in data_frame:
        # The context manager closes the buffer even if the copy raises.
        with io.StringIO() as s_buf:
            # Write data rows only: a header line or the index column would
            # otherwise be sent to COPY as data.
            chunk.to_csv(s_buf, index=False, header=False)
            # to_csv leaves the position at the end of the buffer; rewind so
            # copy_from has something to read.
            s_buf.seek(0)
            # Use the chunk's own columns so this works for any table, not
            # only the stop table.
            # NOTE(review): copy_from sends PostgreSQL text format, not CSV,
            # so values that to_csv quotes (e.g. ones containing commas) will
            # not be parsed as intended - confirm the data has none, or switch
            # to copy_expert with a CSV COPY statement.
            cur.copy_from(s_buf, table_name, sep=",", columns=list(chunk.columns))
        print((time.time() - started) / 60)

def setup_column_names(data_frame, table_name, cols):
    """Build a parameterised INSERT statement template.

    Args:
        data_frame: Not used by this function.
        table_name: Table name placed after ``INSERT INTO``.
        cols: Column names; joined with commas for the column list, and
            iterated a second time to emit one ``%s`` placeholder each.

    Returns:
        A string of the form ``INSERT INTO <table> (<c1>,<c2>) VALUES(%s,%s)``.
    """
    column_list = ",".join(cols)
    # one '%s' placeholder per column
    placeholders = ",".join("%s" for _ in cols)
    return f"INSERT INTO {table_name} ({column_list}) VALUES({placeholders})"

In [46]:
# Time the table reset; elapsed minutes are printed once the block finishes.
start = time.time()
# A single `with` enters the connection context first, then the cursor context.
with conn, conn.cursor() as cur:
    cur.execute(TRUNCATE)
    # Load steps, currently disabled:
    #     with Path("./ncdata/Stop.csv").open() as fh:
    #         cur.copy_expert(STOP_COPY, fh)
    #     insert_stmt = setup_column_names(stop_df, "nc_stop", stop_column_names)
    #     iter_csv("nc_stop", stop_df, insert_stmt, cur)
print((time.time() - start) / 60)

0.0049084822336832685
