In [17]:
import sqlalchemy as db
import os
import config
import pandas as pd
import numpy as np
pd.__version__

'0.24.2'

In [18]:
with open(os.path.expanduser("~/.pgpass"), 'r') as f:
    secrets = None
    for line in f:
        if config.wimbledon_config['host'] in line:
            secrets = line.strip().split(':')
            break
    
if secrets is None:
    raise ValueError('did not find '+config.wimbledon_config['host']+' in ~/.pgpass')

In [2]:
url = db.engine.url.URL(drivername=config.wimbledon_config['drivername'], 
                        username=secrets[-2], 
                        password=secrets[-1],
                        host=config.wimbledon_config['host'], 
                        database=config.wimbledon_config['database'])

engine = db.create_engine(url)

connection = engine.connect()
metadata = db.MetaData(schema='forecast')

In [16]:
def csv_to_sql(table_name, usecols, parse_dates, ints_with_nan, index_col='id'):
    csv = pd.read_csv('../data/forecast/'+table_name+'.csv',
                      usecols=usecols,
                      index_col=index_col,
                      parse_dates=parse_dates,
                      infer_datetime_format=True)
    
    for col in ints_with_nan:
        # Integer columns with NaN: Requires pandas 0.24 (otherwise ids end up as floats)
        csv[col] = csv[col].astype('Int64')
        
    if 'roles' in usecols:
        csv['roles'] = csv['roles'].str.replace(r"[\[\]\']", "", regex=True)
        csv.rename(columns={'roles':'role'},inplace=True)
        
        roles = pd.read_csv('../data/forecast/roles.csv')
        csv['role'] = pd.merge(csv['role'], 
                               roles[['name','id']], 
                               how='left', left_on='role', right_on='name')['id']
        
        csv['role'] = csv['role'].astype('Int64')
        
    csv.columns = csv.columns.str.replace('.','_')
        
    csv.to_sql(table_name, connection, schema='forecast', if_exists='append')
    
    table = db.Table(table_name, metadata, autoload=True, autoload_with=engine)
    results = connection.execute(db.select([table])).fetchall()
    df = pd.DataFrame(results)
    df.columns = results[0].keys()
    
    print(repr(metadata.tables['forecast.'+table_name]))
    display(df.head())

In [9]:
csv_to_sql('clients', 
           usecols=['id', 'name', 'harvest_id', 'archived'],
           parse_dates=False,
           ints_with_nan=['harvest_id'],
           index_col='id')

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "clients_pkey"
DETAIL:  Key (id)=(732424) already exists.
 [SQL: 'INSERT INTO forecast.clients (id, archived, harvest_id, name) VALUES (%(id)s, %(archived)s, %(harvest_id)s, %(name)s)'] [parameters: ({'id': 732424, 'archived': False, 'harvest_id': None, 'name': 'The Alan Turing Institute'}, {'id': 732425, 'archived': False, 'harvest_id': None, 'name': 'Intel'}, {'id': 744985, 'archived': False, 'harvest_id': None, 'name': 'Wrattler'}, {'id': 745536, 'archived': False, 'harvest_id': None, 'name': 'Wilfrid Kendall (Warwick University)'}, {'id': 745537, 'archived': False, 'harvest_id': None, 'name': 'HSBC'}, {'id': 745538, 'archived': False, 'harvest_id': None, 'name': 'TMF'}, {'id': 745539, 'archived': False, 'harvest_id': None, 'name': 'Turing'}, {'id': 745540, 'archived': False, 'harvest_id': 7236211, 'name': 'Hut 23'}  ... displaying 10 of 29 total bound parameter sets ...  {'id': 829222, 'archived': False, 'harvest_id': None, 'name': 'UNAVAILABLE'}, {'id': 833514, 'archived': False, 'harvest_id': 7889179, 'name': 'Wellcome Trust'})] (Background on this error at: http://sqlalche.me/e/gkpj)

In [17]:
csv_to_sql('projects', 
           usecols=['id', 'name', 'code', 'start_date', 'end_date', 'client_id', 'harvest_id', 'notes', 'archived'],
           parse_dates=['start_date', 'end_date'],
           ints_with_nan=['client_id','harvest_id'],
           index_col='id')

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "projects_pkey"
DETAIL:  Key (id)=(1684536) already exists.
 [SQL: 'INSERT INTO forecast.projects (id, archived, client_id, code, end_date, harvest_id, name, notes, start_date) VALUES (%(id)s, %(archived)s, %(client_id)s, %(code)s, %(end_date)s, %(harvest_id)s, %(name)s, %(notes)s, %(start_date)s)'] [parameters: ({'id': 1684536, 'archived': False, 'client_id': None, 'code': None, 'end_date': None, 'harvest_id': None, 'name': 'Time Off', 'notes': None, 'start_date': None}, {'id': 1684539, 'archived': False, 'client_id': 769477, 'code': 'R-INT-001', 'end_date': datetime.datetime(2018, 12, 31, 0, 0), 'harvest_id': 18266019, 'name': 'Sargasso (Intel Sparse)', 'notes': None, 'start_date': datetime.datetime(2018, 4, 1, 0, 0)}, {'id': 1723251, 'archived': False, 'client_id': 761645, 'code': None, 'end_date': datetime.datetime(2019, 3, 31, 0, 0), 'harvest_id': 18644291, 'name': 'AIDA', 'notes': None, 'start_date': datetime.datetime(2018, 4, 1, 0, 0)}, {'id': 1723252, 'archived': False, 'client_id': 761647, 'code': None, 'end_date': datetime.datetime(2019, 1, 31, 0, 0), 'harvest_id': 18644652, 'name': 'DetectorChecker', 'notes': None, 'start_date': datetime.datetime(2018, 8, 20, 0, 0)}, {'id': 1723253, 'archived': False, 'client_id': 769469, 'code': 'R-HSB-001', 'end_date': datetime.datetime(2018, 10, 31, 0, 0), 'harvest_id': 18644656, 'name': 'HSBC EDS', 'notes': None, 'start_date': datetime.datetime(2018, 4, 1, 0, 0)}, {'id': 1723254, 'archived': False, 'client_id': 769480, 'code': 'R-TMF-001', 'end_date': datetime.datetime(2019, 6, 2, 0, 0), 'harvest_id': 18644680, 'name': 'AI for City Planning', 'notes': None, 'start_date': datetime.datetime(2018, 8, 1, 0, 0)}, {'id': 1723255, 'archived': False, 'client_id': 761647, 'code': None, 'end_date': datetime.datetime(2018, 10, 31, 0, 0), 'harvest_id': 18644962, 'name': 'Reproducible Research', 'notes': None, 'start_date': datetime.datetime(2018, 4, 1, 0, 0)}, {'id': 1723256, 'archived': False, 'client_id': 745540, 'code': None, 'end_date': None, 'harvest_id': None, 'name': 'Training and Conferences', 'notes': None, 'start_date': None}  ... displaying 10 of 90 total bound parameter sets ...  {'id': 1969408, 'archived': False, 'client_id': 784816, 'code': None, 'end_date': datetime.datetime(2019, 12, 31, 0, 0), 'harvest_id': 20431477, 'name': 'Urban systems resilience', 'notes': None, 'start_date': datetime.datetime(2019, 10, 1, 0, 0)}, {'id': 2000710, 'archived': False, 'client_id': 745540, 'code': None, 'end_date': None, 'harvest_id': 20684616, 'name': 'Wimbledon Planner', 'notes': None, 'start_date': None})] (Background on this error at: http://sqlalche.me/e/gkpj)

In [6]:
csv_to_sql('roles', 
           usecols=['id', 'name', 'harvest_role_id'],
           parse_dates=False,
           ints_with_nan=['harvest_role_id'],
           index_col='id')

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "roles_pkey"
DETAIL:  Key (id)=(170095) already exists.
 [SQL: 'INSERT INTO forecast.roles (id, harvest_role_id, name) VALUES (%(id)s, %(harvest_role_id)s, %(name)s)'] [parameters: ({'id': 170095, 'harvest_role_id': 450792, 'name': 'Research Data Scientist'}, {'id': 170096, 'harvest_role_id': 450793, 'name': 'Senior Research Software Engineer'}, {'id': 170097, 'harvest_role_id': 450794, 'name': 'Senior Research Data Scientist'}, {'id': 170099, 'harvest_role_id': 450796, 'name': 'Principal Research Data Scientist'}, {'id': 170100, 'harvest_role_id': 450797, 'name': 'Director of Research Engineering'}, {'id': 170104, 'harvest_role_id': 450798, 'name': 'Principal Research Software Engineer'}, {'id': 177204, 'harvest_role_id': 463110, 'name': 'Research Software Engineer'}, {'id': 194820, 'harvest_role_id': 493301, 'name': 'Project Manager'}, {'id': 194821, 'harvest_role_id': 493302, 'name': 'Research Project Manager'}, {'id': 195815, 'harvest_role_id': 495088, 'name': 'example'})] (Background on this error at: http://sqlalche.me/e/gkpj)

In [18]:
csv_to_sql('people', 
           usecols=['id', 'first_name', 'last_name', 'email', 'roles', 'harvest_user_id', 'login', 'subscribed', 'admin',
                   'archived', 'weekly_capacity', 'working_days.monday', 'working_days.tuesday', 'working_days.wednesday',
                   'working_days.thursday', 'working_days.friday', 'working_days.saturday', 'working_days.sunday'],
           parse_dates=False,
           ints_with_nan=['harvest_user_id'],
           index_col='id')

Table('people', MetaData(bind=None), Column('id', INTEGER(), table=<people>, primary_key=True, nullable=False), Column('first_name', TEXT(), table=<people>), Column('last_name', TEXT(), table=<people>), Column('email', TEXT(), table=<people>), Column('role', INTEGER(), table=<people>), Column('harvest_user_id', INTEGER(), table=<people>), Column('login', TEXT(), table=<people>), Column('subscribed', BOOLEAN(), table=<people>), Column('admin', BOOLEAN(), table=<people>), Column('archived', BOOLEAN(), table=<people>), Column('weekly_capacity', INTEGER(), table=<people>), Column('working_days_monday', BOOLEAN(), table=<people>), Column('working_days_tuesday', BOOLEAN(), table=<people>), Column('working_days_wednesday', BOOLEAN(), table=<people>), Column('working_days_thursday', BOOLEAN(), table=<people>), Column('working_days_friday', BOOLEAN(), table=<people>), Column('working_days_saturday', BOOLEAN(), table=<people>), Column('working_days_sunday', BOOLEAN(), table=<people>), schema='fo

Unnamed: 0,id,first_name,last_name,email,role,harvest_user_id,login,subscribed,admin,archived,weekly_capacity,working_days_monday,working_days_tuesday,working_days_wednesday,working_days_thursday,working_days_friday,working_days_saturday,working_days_sunday
0,399979,Oliver,Strickson,ostrickson@turing.ac.uk,,2315537.0,enabled,True,True,False,,True,True,True,True,True,False,False
1,408178,Angus,Williams,awilliams@turing.ac.uk,,,disabled,False,False,False,,True,True,True,True,True,False,False
2,408179,David,Beavan,dbeavan@turing.ac.uk,,2360118.0,enabled,False,True,False,,True,True,True,True,True,False,False
3,408180,Evelina,Gabasova,egabasova@turing.ac.uk,,2360119.0,enabled,False,True,False,,True,True,True,True,True,False,False
4,408181,Giovanni,Colavizza,gcolavizza@turing.ac.uk,,2360121.0,enabled,False,True,False,,True,True,True,True,True,False,False


In [19]:
csv_to_sql('placeholders', 
           usecols=['id', 'name', 'roles', 'archived'],
           parse_dates=False,
           ints_with_nan=[],
           index_col='id')

Table('placeholders', MetaData(bind=None), Column('id', INTEGER(), table=<placeholders>, primary_key=True, nullable=False), Column('name', TEXT(), table=<placeholders>), Column('role', INTEGER(), table=<placeholders>), Column('archived', BOOLEAN(), table=<placeholders>), schema='forecast')


Unnamed: 0,id,name,role,archived
0,23092,Resource Required 1,,False
1,23093,Newcastle 01,,False
2,23094,Edinburgh 01,,False
3,24360,Birmingham 01,,False
4,24361,SPF RA 01,,False


In [20]:
csv_to_sql('milestones', 
           usecols=['id', 'date', 'project_id'],
           parse_dates=['date'],
           ints_with_nan=['project_id'],
           index_col='id')

Table('milestones', MetaData(bind=None), Column('id', INTEGER(), table=<milestones>, primary_key=True, nullable=False), Column('date', DATE(), table=<milestones>), Column('project_id', INTEGER(), ForeignKey('forecast.projects.id'), ForeignKey('forecast.projects.id'), table=<milestones>), schema='forecast')


Unnamed: 0,id,date,project_id
0,1727379,2018-08-30,1684539
1,1906966,2019-01-02,1823898
2,2088931,2019-03-19,1969208


In [21]:
csv_to_sql('assignments', 
           usecols=['id', 'person_id', 'placeholder_id', 'project_id', 'start_date','end_date','allocation','notes'],
           parse_dates=['start_date','end_date'],
           ints_with_nan=['person_id','placeholder_id','project_id'],
           index_col='id')

Table('assignments', MetaData(bind=None), Column('id', INTEGER(), table=<assignments>, primary_key=True, nullable=False), Column('person_id', INTEGER(), ForeignKey('forecast.people.id'), table=<assignments>), Column('placeholder_id', INTEGER(), ForeignKey('forecast.placeholders.id'), ForeignKey('forecast.placeholders.id'), ForeignKey('forecast.placeholders.id'), ForeignKey('forecast.placeholders.id'), table=<assignments>), Column('project_id', INTEGER(), ForeignKey('forecast.projects.id'), ForeignKey('forecast.projects.id'), table=<assignments>), Column('start_date', DATE(), table=<assignments>), Column('end_date', DATE(), table=<assignments>), Column('allocation', INTEGER(), table=<assignments>), Column('notes', TEXT(), table=<assignments>), schema='forecast')


Unnamed: 0,id,person_id,placeholder_id,project_id,start_date,end_date,allocation,notes
0,19314451,408190.0,,1723252,2018-11-01,2018-11-30,14400,
1,19314475,408190.0,,1723263,2018-11-01,2018-12-31,14400,
2,19314509,408187.0,,1723263,2018-11-01,2018-12-31,14400,
3,19314518,408187.0,,1723251,2018-11-01,2019-03-31,14400,
4,19314525,408186.0,,1723251,2018-11-01,2019-03-31,14400,
