# In [37]:
from makeHistoricalData import makeHistoricalData
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sys import argv
from math import floor
import os




######################################################### split data to train, val, test
def splitData(numberOfCounties, main_data, target, offset, j_offset, days_per_county=None):
    """Collect the same window of days from every county's block of rows.

    The function assumes ``main_data`` and ``target`` are laid out as
    consecutive per-county blocks of equal length whose row labels are
    0..n-1, so that county ``i`` starts at label ``i * days_per_county``.

    Parameters
    ----------
    numberOfCounties : int
        Number of county blocks to read. Values <= 0 yield empty frames.
    main_data : pandas.DataFrame
        Feature rows.
    target : pandas.DataFrame
        Target rows, labelled like ``main_data``.
    offset : int
        Number of consecutive rows to take from each county block.
    j_offset : int
        Position inside each county block where the window starts.
    days_per_county : int, optional
        Length of one county block. When omitted, the module-level
        ``numberOfDays`` (set by ``clean_data``) is used, which is the
        original behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The selected feature rows and target rows, keeping their original
        row labels.
    """
    if days_per_county is None:
        # Backward-compatible default: fall back to the global set by clean_data().
        days_per_county = numberOfDays

    X_parts = []
    y_parts = []
    # Exactly one window per county; iterating numberOfCounties + 1 times
    # would pull in an extra county whenever the frame holds more rows.
    for i in range(numberOfCounties):
        j = i * days_per_county + j_offset
        # .loc slices by label and includes both end points, hence the "- 1".
        X_parts.append(main_data.loc[j:j + offset - 1])
        y_parts.append(target.loc[j:j + offset - 1])

    if not X_parts:
        # pd.concat refuses an empty list; keep returning empty frames instead.
        return pd.DataFrame(), pd.DataFrame()

    # A single concat replaces the removed DataFrame.append and avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(X_parts), pd.concat(y_parts)


########################################################### clean data
def clean_data(data, numberOfSelectedCounties):
    """Keep the first N counties and strip the identifier columns.

    The rows are ordered by county and date, restricted to the counties whose
    ``county_fips`` does not exceed the N-th distinct value, and re-labelled
    0..n-1. As a side effect the module-level ``numberOfDays`` is set to the
    number of distinct dates in the selected rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'county_fips', 'state_fips', 'state_name',
        'county_name' and 'date of day t'.
    numberOfSelectedCounties : int
        How many counties to keep; -1 keeps all of them.

    Returns
    -------
    pandas.DataFrame
        The selected rows without the five identifier columns.
    """
    global numberOfDays

    ordered = data.sort_values(by=['county_fips', 'date of day t'])
    fips_values = ordered['county_fips'].unique()

    # -1 is the "use every county" sentinel
    if numberOfSelectedCounties == -1:
        numberOfSelectedCounties = len(fips_values)

    # highest county code that is still part of the selection
    last_fips = fips_values[numberOfSelectedCounties - 1]
    using_data = ordered[ordered['county_fips'] <= last_fips].reset_index(drop=True)

    identifier_columns = ['county_fips', 'state_fips', 'state_name', 'county_name', 'date of day t']
    main_data = using_data.drop(identifier_columns, axis=1)

    # published for the splitting code, which reads it as a global
    numberOfDays = len(using_data['date of day t'].unique())

    return main_data


########################################################### preprocess
def preprocess(main_data, validationFlag):
    """Split cleaned data into train / (validation) / test features and targets.

    Within every county block the last 14 rows form the test set; of the
    remaining rows the first 75% (rounded down) are training rows and the
    rest are validation rows. Without a validation set, training and
    validation rows are merged into one training set.

    Reads the module-level ``numberOfDays`` (set by ``clean_data``) and
    ``numberOfSelectedCounties``.

    Parameters
    ----------
    main_data : pandas.DataFrame
        Output of ``clean_data``; must contain a 'Target' column.
    validationFlag : bool or int
        Truthy to produce a separate validation set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_train, X_train_val, X_test, y_train_train, y_train_val,
        y_test)`` when ``validationFlag`` is truthy, otherwise
        ``(X_train, X_test, y_train, y_test)``. The X frames are re-labelled
        0..n-1; the y frames keep the row labels they had in ``main_data``.
    """
    target = pd.DataFrame(main_data['Target'])
    main_data = main_data.drop(['Target'], axis=1)

    # specify the size of train, validation and test sets
    test_offset = 14
    train_offset = floor(0.75 * (numberOfDays - test_offset))
    val_offset = numberOfDays - (train_offset + test_offset)

    # The global may still hold the "all counties" sentinel (-1), which would
    # make the split loop run zero times. Derive the real count from the data,
    # relying on the same equal-sized-block layout the split itself assumes.
    county_count = numberOfSelectedCounties
    if county_count == -1:
        county_count = len(main_data) // numberOfDays if numberOfDays else 0

    def take(window, start):
        # splitData already returns an (X, y) pair of DataFrames; wrapping it
        # in np.array() fails for frames of different shapes and would strip
        # the DataFrame API needed for reset_index.
        X_part, y_part = splitData(county_count, main_data, target, window, start)
        return X_part.reset_index(drop=True), y_part

    if validationFlag:     # validationFlag is 1 if we want to have a validation set and 0 otherwise
        X_train_train, y_train_train = take(train_offset, 0)
        X_train_val, y_train_val = take(val_offset, train_offset)
        X_test, y_test = take(test_offset, train_offset + val_offset)

        return X_train_train, X_train_val, X_test, y_train_train, y_train_val, y_test

    X_train, y_train = take(train_offset + val_offset, 0)
    X_test, y_test = take(test_offset, train_offset + val_offset)

    return X_train, X_test, y_train, y_test


# In [38]:
# Arguments handed to makeHistoricalData, and the number of counties to keep.
h = 1
r = 14
target = 'confirmed'
numberOfSelectedCounties = 500  # set to -1 for all the counties

# Build the historical data set, then sort/filter it and drop identifier columns.
data = clean_data(makeHistoricalData(h, r, target), numberOfSelectedCounties)


def _save_split(features, labels, path):
    # Put features and target side by side (both re-labelled from 0 so the
    # rows line up), write them to `path` without the index, and hand back
    # the combined frame.
    frame = pd.concat(
        [features.reset_index(drop=True), labels.reset_index(drop=True)],
        axis=1,
    )
    frame.to_csv(path, index=False)
    return frame


################################### with validation

(X_train_train, X_train_val, X_test,
 y_train_train, y_train_val, y_test) = preprocess(data, 1)  # with validation

# save to csv
train_train = _save_split(X_train_train, y_train_train, 'train_train.csv')
train_val = _save_split(X_train_val, y_train_val, 'train_val.csv')
test = _save_split(X_test, y_test, 'test.csv')

################################# without validation

# X_train, X_test, y_train, y_test = preprocess(data, 0)

# save to csv
# train = _save_split(X_train, y_train, 'train.csv')
# test = _save_split(X_test, y_test, 'test.csv')