In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import psycopg2

In [2]:
def get_db_connex():
    conn = psycopg2.connect(host="localhost", database="crosstown")
    return conn.cursor()

In [3]:
columns = [
    'AD',
    'ED',
    'County',
    'EDAD Status',
    'Event',
    'Party/Independent Body',
    'Office/Position Title',
    'District Key',
    'VoteFor',
    'Unit Name',
    'Tally'
]

In [31]:
def load_data(filepath, cols = list(range(11,22)), header = None):
    return pd.read_csv(
        filepath, 
        names = columns,
        usecols = cols,
        header = header
    )

def data_to_db(df):
    df.to_sql(name='results', con=conn, schema='public')
    
def data_from_db(cur, query):
    cur.execute(query)
    return cur.fetchall()
    
def write_metadata(bod_id, df):
    date = get_date(df)
    office = get_office(df)
    district = get_district(df)
    rows = len(df)
    sql_query = ("INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('"+
        bod_id+
        "', to_date('"+
        date+
        "'::text, 'YYYY-MM-DD'),'"+
        office+
        "','"+
        str(district)+
        "',"+
        str(rows)+
        ");"
    )
    query = 'psql -h localhost crosstown -c "' + sql_query + '"'
    print('** write metadata', query)
    res = os.system(query)
    print('** write metadata result', res)

def write_data(bod_id, df):
    data_path = get_path(bod_id, df)
    full_path = './data/clean/'+data_path
    df.to_csv(full_path, index=False)
#     print('** skip write_data')
    query = ('cat '+
        full_path+
        ' | psql -h localhost crosstown -c "COPY results(ad, ed, county, edad_status, event, party, office, district_key, vote_for, unit_name, tally) from stdin CSV HEADER"'
    )
    print('** write data', query)    
    res = os.system(query)
    print('** write data result', res)
    
def get_path(bod_id, df):
    event = df['Event'][0].split(' - ')
    return event[1][-4:] + '/' + bod_id + '.csv'

def get_date(df):
    event = df['Event'][0].split(' - ')
    return str(datetime.datetime.strptime(event[1][-10:], '%m/%d/%Y').date())

def get_office(df):
    return df["Office/Position Title"][0]

def get_district(df):
    return df["District Key"][0]

def control(raw_file, extraCols = False):
    bod_id = raw_file[0:11]
    
    if extraCols:
        cols = list(range(11,22))
        header = None
    else:
        cols = list(range(0, 11))
        header = 0

    data = load_data('./data/raw/'+raw_file, cols, header)
    if len(data) == 0:
        print("No data, exiting on ", bod_id)
        return
    date = get_date(data)
    if is_dataset_in_datasets(bod_id, date):
        print ("Already loaded, exiting on ", bod_id)
        return
    write_data(bod_id, data)
    write_metadata(bod_id, data)
    
def is_dataset_in_datasets(bod_id, date):
    res = data_from_db(
        cur, 
        """SELECT * FROM datasets WHERE bod_id = '"""+ bod_id +"""' and date = '"""+ date +"""' """
    )
    return len(res) > 0

In [33]:
## https://vote.nyc/page/election-results-summary
## CAUTION ID bod_id is unique by office, not by election date
## datasets is unique by bod_id / date
cur = get_db_connex()
files = list(filter(lambda x: x != '.DS_Store', os.listdir('./data/raw')))
loaded_files_bod_ids = map(lambda x: (x[0], str(x[1])), data_from_db(cur, """SELECT * FROM datasets"""))
files

['00050200000Citywide Permitting Electronic Distribution of State Legislative Bills Citywide EDLevel.csv']

In [34]:
for f in files:
    control(f, False)
# raw_file = files[fileIndex]
# extraCols = False
# bod_id = raw_file[0:11]
    
# if extraCols:
#     cols = list(range(11,22))
#     header = None
# else:
#     cols = list(range(0, 11))
#     header = 0

# data = load_data('./data/raw/'+raw_file, cols, header)
# date = get_date(data)
# is_dataset_in_datasets(bod_id, date)

** write data cat ./data/clean/2014/00050200000.csv | psql -h localhost crosstown -c "COPY results(ad, ed, county, edad_status, event, party, office, district_key, vote_for, unit_name, tally) from stdin CSV HEADER"
** write data result 256
** write metadata psql -h localhost crosstown -c "INSERT INTO datasets(bod_id, date, office, district_key, rows) VALUES ('00050200000', to_date('2014-11-04'::text, 'YYYY-MM-DD'),'Permitting Electronic Distribution of State Legislative Bills','NYC',37416);"
** write metadata result 256


In [303]:
files

['.DS_Store',
 '00102300065New York Member of the Assembly 65th Assembly District EDLevel.csv',
 '01102300065New York Democratic Member of the Assembly 65th Assembly District EDLevel.csv',
 '00150300000New York Authorizing the Use of Forest Preserve Land for Specified Purposes Citywide EDLevel.csv',
 "00150200000New York Allowing the complete or partial forfeiture of a public officer's pension if he or she is convicted of a certain type of felony Citywide EDLevel.csv"]

In [35]:
cur.close()