Fetching contributors…
Cannot retrieve contributors at this time
192 lines (162 sloc) 6.57 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Load clean CAL-ACCESS CSV file into a database model.
from __future__ import unicode_literals
# Files
import os
from csvkit import reader
# Django config
from django.apps import apps
from django.conf import settings
from django.utils.timezone import now
# Database
from django.db import connections, router
from calaccess_raw.models.tracking import RawDataFile
# Commands
from import CommandError
from import CalAccessCommand
class Command(CalAccessCommand):
Load clean CAL-ACCESS CSV file into a database model.
help = 'Load clean CAL-ACCESS CSV file into a database model'
def add_arguments(self, parser):
Adds custom arguments specific to this command.
super(Command, self).add_arguments(parser)
# positional (required) arg
help="Name of the model into which data will be loaded"
# keyword (optional) args
help="Path to comma-delimited file to be loaded. Defaults to one associated with model."
help="Keep clean CSV file after loading"
help="Name of Django app with models into which data will be imported (if other not calaccess_raw)"
def handle(self, *args, **options):
Make it happen.
super(Command, self).handle(*args, **options)
# set / compute any attributes that multiple class methods need
self.keep_file = options["keep_file"]
# get model based on strings of app_name and model_name
self.model = apps.get_model(options["app_name"], options['model_name'])
# load from provided csv or csv mapped to model
self.csv = options["csv"] or self.model.objects.get_csv_path()
# load into database suggested for model by router
self.database = router.db_for_write(model=self.model)
# get most recently cleaned RawDataFile
raw_file = RawDataFile.objects.filter(
except RawDataFile.DoesNotExist:
raise CommandError(
'No record of cleaning {0}.TSV (run `python '
'cleancalaccessrawfile {0}`).'.format(self.model._meta.db_table)
# raise exception if clean step did not finish
if not raw_file.clean_finish_datetime:
raise CommandError(
'Previous cleaning of {0}.TSV did not finish (run `python '
'cleancalaccessrawfile {0}`).'.format(self.model._meta.db_table)
# Get the row count from the source CSV
with open(self.csv, 'r') as infile:
self.csv_row_count = sum(1 for line in infile) - 1
# Quit if the CSV is empty.
if self.csv_row_count == 0:
if self.verbosity > 2:
self.failure("{} is empty.".format(self.csv))
# Get the headers from the source CSV
with open(self.csv, 'r') as infile:
csv_reader = reader(infile)
self.csv_headers = next(csv_reader)
except StopIteration:
self.csv_headers = []
# store the start time for the load
raw_file.load_start_datetime = now()
# reset the finish time for the load
raw_file.load_finish_datetime = None
# save here in case command doesn't finish
# Load table
if self.verbosity > 2:
self.log(" Loading {}".format(options['model_name']))
# add load counts to raw_file_record
raw_file.load_columns_count = len(self.model._meta.fields)
raw_file.load_records_count = self.model.objects.count()
# Log an error if the counts don't match
if self.verbosity > 2 and raw_file.load_records_count != self.csv_row_count:
msg = " Table record count doesn't match CSV. {} in the table vs. {} in the CSV."
self.failure(msg.format(raw_file.load_records_count, self.csv_row_count))
# if not keeping files, remove the csv file
if not self.keep_file:
# store the finish time for the load
raw_file.load_finish_datetime = now()
# and save the RawDataFile
def load(self):
Loads the source CSV for the provided model based on settings and database connections.
# if not using default db, make sure the database is set up in django's settings
if self.database:
engine = settings.DATABASES[self.database]['ENGINE']
except KeyError:
raise TypeError("{} not configured in DATABASES settings.".format(self.database))
# set up database connection
self.connection = connections[self.database]
self.cursor = self.connection.cursor()
# check the kind of database before calling db-specific load method
if engine in (
self.failure("Sorry your database engine is unsupported")
raise CommandError("Only PostgresSQL backends supported.")
def load_postgresql(self):
Load the file into a PostgreSQL database using COPY.
# Drop all the records from the target model's real table
sql = 'TRUNCATE TABLE "{}" RESTART IDENTITY CASCADE'.format(self.model._meta.db_table)
# Create a mapping between our django models and the CSV headers
model_mapping = dict(
(, f.db_column) for f in self.model._meta.fields
if f.db_column
# Load the data
self.model.objects.from_csv(self.csv, model_mapping, using=self.database)