# Splitting by time

In [8]:
import pandas as pd
import numpy as np
import pytz
from datetime import datetime

# Split boundaries, expressed as midnight US/Eastern on the given dates.
eastern = pytz.timezone('US/Eastern')
dev_split = eastern.localize(datetime.strptime('Nov 14 2016 12:00AM', '%b %d %Y %I:%M%p'))
test_split = eastern.localize(datetime.strptime('Nov 21 2016 12:00AM', '%b %d %Y %I:%M%p'))

# Convert the aware datetimes to Unix epoch seconds.
# datetime.timestamp() honours the attached tzinfo, so the result does not
# depend on the timezone of the machine running the notebook. The previous
# strftime("%s") is a platform-specific extension (absent on some platforms)
# that ignores tzinfo and interprets the fields in the system's local zone.
dev_ts = int(dev_split.timestamp())
test_ts = int(test_split.timestamp())

print('Dev: ', dev_split, dev_ts)
print('Test:', test_split, test_ts)

Dev:  2016-11-14 00:00:00-05:00 1479099600
Test: 2016-11-21 00:00:00-05:00 1479704400


In [7]:
from os import listdir
from os.path import isfile, join

# Directory holding one CSV per station.
data_path = 'per_station'

# Sorted names of the regular files in data_path, leaving out any whose name
# contains 'swp' (presumably editor swap files -- confirm).
station_files = sorted(
    name
    for name in listdir(data_path)
    if isfile(join(data_path, name)) and 'swp' not in name
)

print('# files:', len(station_files))
station_files[:10]

# files: 50


['128.csv',
 '151.csv',
 '161.csv',
 '174.csv',
 '229.csv',
 '284.csv',
 '285.csv',
 '297.csv',
 '3002.csv',
 '3141.csv']

In [12]:
from os import makedirs

# Output directories, one per split. Create them up front so to_csv does not
# fail when the notebook is run in a checkout where they do not exist yet.
train_dir = 'per_station_train'
dev_dir = 'per_station_dev'
test_dir = 'per_station_test'
for split_dir in (train_dir, dev_dir, test_dir):
    makedirs(split_dir, exist_ok=True)

for station_file in station_files:
    # Read from the same directory that was listed to build station_files.
    t_frame = pd.read_csv(join(data_path, station_file))
    reported = t_frame['last_reported']

    # Half-open intervals on last_reported:
    #   train: < dev_ts, dev: [dev_ts, test_ts), test: >= test_ts
    t_train = reported < dev_ts
    t_dev = (reported >= dev_ts) & (reported < test_ts)
    t_test = reported >= test_ts

    # Every row must land in exactly one split. A missing (NaN) timestamp
    # compares False against both bounds and would trip this check.
    assert t_train.sum() + t_dev.sum() + t_test.sum() == len(t_frame)

    train_frame = t_frame[t_train]
    dev_frame = t_frame[t_dev]
    test_frame = t_frame[t_test]

    # Skip stations that have no rows in at least one split; nothing is
    # written for them.
    if len(train_frame) == 0 or len(dev_frame) == 0 or len(test_frame) == 0:
        print(station_file, len(train_frame), len(dev_frame), len(test_frame), 'Zero!')
        continue

    print(station_file, len(train_frame), len(dev_frame), len(test_frame))

    # NOTE(review): to_csv keeps its default index=True, so the original row
    # index is written as a leading unnamed column. Left unchanged in case
    # downstream readers expect it -- confirm before switching to index=False.
    train_frame.to_csv(join(train_dir, station_file))
    dev_frame.to_csv(join(dev_dir, station_file))
    test_frame.to_csv(join(test_dir, station_file))

128.csv 4476 1056 932
151.csv 4892 1136 1072
161.csv 4517 1056 964
174.csv 4386 1046 849
229.csv 4606 1053 1027
284.csv 4923 1136 1147
285.csv 5244 1277 1310
297.csv 4508 1075 959
3002.csv 4333 1031 915
3141.csv 4409 1078 1043
320.csv 5894 1402 1531
3255.csv 4948 1207 1065
3263.csv 5808 1385 1344
368.csv 4745 1167 1132
379.csv 4469 1095 981
382.csv 4548 1065 973
383.csv 4365 975 868
402.csv 4781 1089 1122
426.csv 4614 1035 1000
432.csv 4537 1082 976
435.csv 5130 1267 1278
442.csv 4385 1009 939
444.csv 4439 1009 1004
446.csv 4696 1104 1081
450.csv 4298 1008 938
457.csv 4309 977 1045
459.csv 4232 0 890 Zero!
461.csv 4440 1024 925
470.csv 4498 1047 1042
472.csv 4445 1006 954
477.csv 5041 1224 1090
479.csv 4414 1082 991
482.csv 4469 1017 1018
483.csv 4635 1200 1006
490.csv 4966 1163 1135
491.csv 4655 1152 1032
492.csv 4454 1143 1056
494.csv 4494 1072 989
497.csv 4928 1218 1194
504.csv 4593 1164 1103
505.csv 4454 1067 824
507.csv 4989 1228 1212
509.csv 4431 1234 1213
511.csv 4359 1045 911
5