In [3]:
import datetime
import os
import shlex
import subprocess

import numpy as np
import pandas as pd
import psycopg2

In [4]:
def get_db_connex():
    """Connect to the local ``crosstown`` database and return a cursor.

    Only the cursor is handed back; the connection object it was created
    from is not returned to the caller.
    """
    return psycopg2.connect(host="localhost", database="crosstown").cursor()

In [5]:
# Column labels handed to pd.read_csv as `names=` by load_data. There are 11
# entries, one for each of the 11 column positions load_data selects.
columns = [
    'AD',
    'ED',
    'County',
    'EDAD Status',
    'Event',
    'Party/Independent Body',
    'Office/Position Title',
    'District Key',
    'VoteFor',
    'Unit Name',
    'Tally'
]

In [6]:
def load_data(filepath):
    """Read one raw results CSV into a DataFrame.

    Only the columns at positions 11 through 21 of the file are read, and
    they are labelled with the names in the notebook-level ``columns`` list.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    wanted_positions = [*range(11, 22)]
    frame = pd.read_csv(filepath, names=columns, usecols=wanted_positions)
    return frame

def data_to_db(df, con=None):
    """Write ``df`` to the ``public.results`` table with ``DataFrame.to_sql``.

    Args:
        df: DataFrame to store.
        con: Connection object forwarded to ``DataFrame.to_sql``. When it is
            omitted the notebook-level name ``conn`` is used, which is what
            this function always did before the parameter existed.

    NOTE(review): no notebook-level ``conn`` is defined in the visible cells
    (the one inside get_db_connex is local), so calling this without ``con``
    raises NameError unless ``conn`` is created elsewhere. Prefer passing
    ``con`` explicitly. pandas documents SQLAlchemy connectables and sqlite3
    connections for ``con``; whether a raw psycopg2 connection is accepted
    should be confirmed before relying on it.
    """
    if con is None:
        # Backward-compatible fallback to the global the original code used.
        con = conn
    df.to_sql(name='results', con=con, schema='public')
    
def data_from_db(cur, query):
    """Run ``query`` on the cursor ``cur`` and return every row it yields.

    Args:
        cur: Object exposing ``execute(query)`` and ``fetchall()``.
        query: Statement passed unchanged to ``cur.execute``.

    Returns:
        Whatever ``cur.fetchall()`` returns after the statement has run.
    """
    cur.execute(query)
    fetched_rows = cur.fetchall()
    return fetched_rows
    
def write_metadata(bod_id, df):
    """Insert one row describing a loaded file into the ``datasets`` table.

    The date, office and district are taken from ``df`` through get_date,
    get_office and get_district; the row count is ``len(df)``. The INSERT is
    executed by invoking ``psql -h localhost crosstown -c <sql>``.

    Args:
        bod_id: Identifier stored in the ``bod_id`` column.
        df: DataFrame whose contents were loaded for this identifier.

    Raises:
        subprocess.CalledProcessError: if ``psql`` exits with a non-zero
            status, so a failed insert is no longer silently ignored.
    """
    def _sql_literal(value):
        # Render value as a single-quoted SQL string literal, doubling any
        # embedded single quote so it cannot terminate the literal early.
        # NOTE(review): relies on the server using standard-conforming
        # strings (backslashes are then ordinary characters) - confirm.
        return "'" + str(value).replace("'", "''") + "'"

    date = get_date(df)
    office = get_office(df)
    district = get_district(df)
    rows = len(df)
    sql_query = (
        "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ("
        + _sql_literal(bod_id)
        + ", to_date(" + _sql_literal(date) + "::text, 'YYYY-MM-DD'),"
        + _sql_literal(office) + ","
        + _sql_literal(district) + ","
        + str(rows)
        + ");"
    )
    # An argv list bypasses the shell entirely, so double quotes, `$` or
    # backticks inside the SQL text are passed to psql verbatim.
    command = ['psql', '-h', 'localhost', 'crosstown', '-c', sql_query]
    print('** write metadata', ' '.join(shlex.quote(part) for part in command))
    subprocess.run(command, check=True)

def write_data(bod_id, df):
    """Save ``df`` as a CSV under ``./data/clean/`` and COPY it into ``results``.

    The file location below ``./data/clean/`` comes from get_path. The CSV is
    written with a header row and without the index, then streamed to
    ``psql`` on standard input for a ``COPY ... from stdin CSV HEADER``.

    Args:
        bod_id: Identifier used by get_path to name the CSV file.
        df: DataFrame to persist and load.

    Raises:
        subprocess.CalledProcessError: if ``psql`` exits with a non-zero
            status, so a failed COPY is no longer silently ignored.
    """
    data_path = get_path(bod_id, df)
    full_path = './data/clean/'+data_path
    # get_path places the file inside a sub-folder; create it on first use
    # so to_csv does not fail on a missing directory.
    os.makedirs(os.path.dirname(full_path), exist_ok=True)
    df.to_csv(full_path, index=False)
    copy_sql = 'COPY results(ad, ed, county, edad_status, event, party, office, district_key, vote_for, unit_name, tally) from stdin CSV HEADER'
    command = ['psql', '-h', 'localhost', 'crosstown', '-c', copy_sql]
    # Log the equivalent shell pipeline; the actual call below uses no shell.
    print(
        '** write data',
        'cat ' + shlex.quote(full_path) + ' | ' + ' '.join(shlex.quote(part) for part in command)
    )
    with open(full_path, 'rb') as csv_file:
        subprocess.run(command, stdin=csv_file, check=True)
    
def get_path(bod_id, df):
    """Build the relative CSV path ``<folder>/<bod_id>.csv`` for a dataset.

    The folder is the last four characters of the text that follows the
    first ``' - '`` in the first row's ``Event`` value.

    Args:
        bod_id: String used as the file name stem.
        df: DataFrame with an ``Event`` column of strings.

    Returns:
        The path as a string, with ``/`` as separator.
    """
    # .iloc[0] takes the first row by position; plain [0] is a label lookup
    # and raises KeyError when the index does not contain the label 0.
    first_event = df['Event'].iloc[0]
    suffix = first_event.split(' - ')[1]
    return suffix[-4:] + '/' + bod_id + '.csv'

def get_date(df):
    """Return the event date of a dataset as a ``YYYY-MM-DD`` string.

    The last ten characters of the text that follows the first ``' - '`` in
    the first row's ``Event`` value are parsed as ``%m/%d/%Y``.

    Args:
        df: DataFrame with an ``Event`` column of strings.

    Returns:
        ``str()`` of the parsed ``datetime.date``.

    Raises:
        ValueError: if those ten characters are not a ``%m/%d/%Y`` date.
    """
    # .iloc[0] takes the first row by position; plain [0] is a label lookup
    # and raises KeyError when the index does not contain the label 0.
    first_event = df['Event'].iloc[0]
    suffix = first_event.split(' - ')[1]
    return str(datetime.datetime.strptime(suffix[-10:], '%m/%d/%Y').date())

def get_office(df):
    """Return the ``Office/Position Title`` value of the first row of ``df``."""
    # Positional access: plain [0] is a label lookup and raises KeyError
    # when the index does not contain the label 0.
    return df["Office/Position Title"].iloc[0]

def get_district(df):
    """Return the ``District Key`` value of the first row of ``df``."""
    # Positional access: plain [0] is a label lookup and raises KeyError
    # when the index does not contain the label 0.
    return df["District Key"].iloc[0]

def control(raw_file):
    """Load one raw file: read it, store its rows, then record its metadata.

    Args:
        raw_file: File name inside ``./data/raw/``. Its first 11 characters
            are used as the dataset's ``bod_id``.
    """
    frame = load_data(f'./data/raw/{raw_file}')
    dataset_id = raw_file[:11]
    # Rows are written first, the bookkeeping row in `datasets` second.
    write_data(dataset_id, frame)
    write_metadata(dataset_id, frame)

In [7]:
## https://vote.nyc/page/election-results-summary
# Work out which raw files still need loading: a file counts as loaded when
# the first 11 characters of its name already appear as a bod_id in `datasets`.
cur = get_db_connex()
files = os.listdir('./data/raw')
# Select bod_id by name instead of `SELECT *` + position 0, so the result
# does not depend on the column order of the `datasets` table.
loaded_files_bod_ids = {row[0] for row in data_from_db(cur, """SELECT bod_id FROM datasets""")}
unloaded_files = [name for name in files if name[0:11] not in loaded_files_bod_ids]

In [8]:
# Number of raw files whose 11-character name prefix is not yet in `datasets`.
len(unloaded_files)

0

In [12]:
# Run the full load (read, write rows, write metadata) for every pending
# raw file, announcing each file name before it is processed.
for f in unloaded_files:
    print('* starting file ', f)
    control(f)

* starting file  00102300071New York Member of the Assembly 71st Assembly District EDLevel.csv
** skip write_data
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00102300071', to_date('2018-11-06'::text, 'YYYY-MM-DD'),'Member of the Assembly','71',823);"
* starting file  00102300068New York Member of the Assembly 68th Assembly District EDLevel.csv
** skip write_data
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00102300068', to_date('2018-11-06'::text, 'YYYY-MM-DD'),'Member of the Assembly','68',944);"
* starting file  00101200000New York Public Advocate Citywide EDLevel.csv
** skip write_data
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00101200000', to_date('2019-02-26'::text, 'YYYY-MM-DD'),'Public Advocate','NYC',18324);"
* starting file  00102100028New York S

** skip write_data
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00102300076', to_date('2018-11-06'::text, 'YYYY-MM-DD'),'Member of the Assembly','76',894);"
* starting file  00102100030New York State Senator 30th Senatorial District EDLevel.csv
** skip write_data
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00102100030', to_date('2018-11-06'::text, 'YYYY-MM-DD'),'State Senator','30',2026);"
* starting file  00102401006New York Judge of the Civil Court - District 6th Municipal Court District - New York EDLevel.csv
** skip write_data
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00102401006', to_date('2018-11-06'::text, 'YYYY-MM-DD'),'Judge of the Civil Court - District','1006',1219);"
* starting file  01102300072New York Democratic Member of the Assembly 72nd A

In [13]:
# get_db_connex returns only the cursor, so reach the connection through the
# cursor's `connection` attribute and close both; closing the cursor alone
# leaves the database connection open.
connection = cur.connection
cur.close()
connection.close()