In [3]:
from pathlib import Path
from datetime import datetime
from decimal import Decimal
from collections import namedtuple
import pandas as pd
import pytz

In [2]:
# Short alias for pytz's UTC timezone object; the timestamp converter uses it
# to attach a UTC timezone to naive datetimes parsed from the data file.
UTC = pytz.UTC

In [47]:
# Measurement columns. These are the fields that get a Decimal converter when
# the weather file is read.
DECIMAL_FIELDS = [
    'temperature',
    'dew_temperature',
    'relative_humidity',
    'wind_direction',
    'wind_speed',
    'one_hour_acc_precipitation',
    'visibility',
    'wind_gust_speed',
]

# Complete column order applied to the weather file: the two identifying
# columns first, followed by every measurement column.
WEATHER_COLS = ['station', 'record_datetime', *DECIMAL_FIELDS]

In [84]:
def to_na_decimal(text):
    """Convert one numeric field of the weather file into a ``Decimal``.

    The literal ``'M'`` is treated as the missing-value marker and becomes
    ``Decimal('NaN')``; anything else is handed to ``Decimal``.

    String input is stripped before the marker check. ``Decimal`` already
    ignores surrounding whitespace in numeric literals, so stripping makes the
    marker check equally tolerant (``' M'`` is missing, not an error) without
    changing the result for any value that parsed before.

    Parameters
    ----------
    text : str
        Raw field value. Non-string values are passed to ``Decimal`` untouched.

    Returns
    -------
    decimal.Decimal

    Raises
    ------
    decimal.InvalidOperation
        If the value is neither the marker nor a valid decimal literal.
    """
    if isinstance(text, str):
        text = text.strip()
    if text == 'M':
        return Decimal('NaN')
    return Decimal(text)


def to_utc_date(text):
    """Parse a ``YYYY-MM-DD HH:MM`` timestamp into a UTC-aware ``datetime``.

    The string is parsed as a naive datetime and then localized with the
    module-level ``UTC`` timezone, i.e. the wall-clock value is interpreted as
    UTC rather than converted from another zone.

    Raises
    ------
    ValueError
        If ``text`` does not match the ``'%Y-%m-%d %H:%M'`` format.
    """
    naive = datetime.strptime(text, '%Y-%m-%d %H:%M')
    return UTC.localize(naive)

def read_weather(csv_pth, chunksize=100000):
    """Open a weather data file for chunked reading with typed columns.

    The first five lines of the file are skipped, and the line after them is
    consumed as the header row but its names are replaced by ``WEATHER_COLS``.
    No separator is passed, so ``pd.read_table``'s default delimiter applies.

    Values are converted while parsing:

    * ``station`` is encoded to ``bytes`` via ``str.encode``,
    * ``record_datetime`` goes through ``to_utc_date``,
    * every column in ``DECIMAL_FIELDS`` goes through ``to_na_decimal``.

    Parameters
    ----------
    csv_pth : str or path-like
        Location of the weather file, passed straight to ``pd.read_table``.
    chunksize : int, default 100000
        Number of rows per chunk.

    Returns
    -------
    Whatever ``pd.read_table`` returns for the given ``chunksize``; with an
    integer chunk size this is an iterator yielding one ``DataFrame`` per
    chunk.
    """
    field_converters = {
        'station': str.encode,
        'record_datetime': to_utc_date,
    }
    # Every measurement column shares the same Decimal/NaN converter.
    field_converters.update(dict.fromkeys(DECIMAL_FIELDS, to_na_decimal))

    reader_options = dict(
        skiprows=5,
        header=0,
        names=WEATHER_COLS,
        parse_dates=False,
        index_col=False,
        converters=field_converters,
        chunksize=chunksize,
    )
    return pd.read_table(csv_pth, **reader_options)

In [54]:
import psycopg2
from pgcopy import CopyManager, Replace
from io import BytesIO, StringIO

In [86]:
# Open the PostgreSQL connection shared by the cells below. Only the database
# name is given here, so host, user and password are whatever psycopg2/libpq
# resolve by default in the current environment.
conn = psycopg2.connect(database='liang-bo.wang_project1')

In [87]:
# pgcopy bulk-load manager for the 'weather' table. The target column list is
# 'id' followed by the WEATHER_COLS names, so each record handed to mgr.copy
# must provide its values in exactly that order.
mgr = CopyManager(conn, 'weather', ['id', *WEATHER_COLS])

In [88]:
%%time
# Stream the weather file chunk by chunk and bulk-copy each chunk into the
# 'weather' table.
for df in read_weather('../raw_weather_data/ny_weather_2015-2016.csv'):
    # index=True puts the DataFrame index first in every tuple, which lines up
    # with the leading 'id' column in the CopyManager column list.
    records = list(df.itertuples(index=True, name='Weather'))
    # One transaction per chunk: per psycopg2's documented behaviour, `with conn`
    # commits on success and rolls back if the block raises (it does not close
    # the connection).
    with conn:
        # BytesIO is passed as the second argument to copy(); presumably pgcopy
        # uses it as the factory for its staging buffer -- confirm against the
        # pgcopy docs.
        mgr.copy(records, BytesIO)