In [29]:
import pandas as pd
import numpy as np

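# h is the number of consecutive days, ending at day (t), whose values are used as features
# r is how many days after day (t) the target is taken --> target-day = day(t+r)
# target is the name of the column to predict (e.g. the number of deaths or the number of confirmed cases)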
def makeHistoricalData(h, r, target, fixed_data_path='fixed-data.csv',
                       temporal_data_path='new-temporal-data.csv'):
    """Build a table with ``h`` days of history per time-dependent covariate.

    The two CSV files are merged on ``county_fips`` and sorted by
    ``date`` then ``county_fips``.  Each output row describes one county on
    one "day t" and contains:

    * ``county_fips`` and ``date of day t``;
    * ``Target``: the value of ``target`` on day ``t + r``;
    * for every column of the temporal file (except ``date`` and
      ``county_fips``): ``h`` columns named ``'<name> t-<lag>'`` for
      ``lag = h-1 .. 1`` plus ``'<name> t'`` for day t itself;
    * every other column, copied once.

    Numeric columns are ordered by absolute correlation with ``target``;
    ``county_name``, ``state_name`` and ``date`` are appended after them.

    The row arithmetic assumes every date has exactly one row per county
    (a balanced panel) -- this is not checked here.

    Parameters
    ----------
    h : int
        Number of days of history.  ``0`` is treated as ``1`` (day t only).
    r : int
        Number of days between day t and the target day.
    target : str
        Name of the numeric column to predict.
    fixed_data_path, temporal_data_path : str, optional
        Locations of the time-independent and time-dependent CSV files.
        The defaults are the file names that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``h`` is negative, or if ``h + r`` exceeds the number of
        available days plus one (the sample window would be negative).
    """
    # h == 0 and h == 1 both mean "no history": one column per covariate.
    if h == 0:
        h = 1
    if h < 0:
        # The original `while threshold != h` loop never terminated here.
        raise ValueError('h must be non-negative, got %r' % (h,))

    fixed_data = pd.read_csv(fixed_data_path)
    temporal_data = pd.read_csv(temporal_data_path)

    all_data = pd.merge(fixed_data, temporal_data, on='county_fips')
    all_data = all_data.sort_values(by=['date', 'county_fips']).reset_index(drop=True)

    # These columns are not numerical and are not part of the correlation
    # matrix; they are re-attached after the numeric columns are ordered.
    non_numeric = all_data[['county_name', 'state_name', 'date']]

    # Restrict to numeric columns explicitly: DataFrame.corr() no longer
    # drops string columns silently on pandas >= 2.0.
    correlations = all_data.select_dtypes(include=['number', 'bool']).corr().abs()
    ordered = correlations.sort_values(target, ascending=False).index
    all_data = pd.concat([all_data.loc[:, ordered], non_numeric], axis=1)

    temporal_names = set(temporal_data.columns)
    county_count = all_data['county_fips'].nunique(dropna=False)
    day_count = all_data['date'].nunique(dropna=False)

    # Number of "day t" values for which both h days of history and the
    # target at t + r exist.
    window_days = day_count - h - r + 1
    if window_days < 0:
        # head()/tail() with a negative count would silently return the
        # wrong rows instead of failing.
        raise ValueError(
            'h=%r and r=%r need more than the %d available days' % (h, r, day_count))
    window_rows = window_days * county_count

    # Collect every output column first and concatenate once; growing the
    # frame inside the loop is quadratic in the number of columns.
    pieces = []
    for name in all_data.columns:
        if name in ('date', 'county_fips'):
            continue
        column = all_data[name]
        if name in temporal_names:
            # Oldest lag first.  Skipping one block of `county_count` rows
            # moves the window forward by one day.
            for lag in range(h - 1, -1, -1):
                start = (h - 1 - lag) * county_count
                piece = column.iloc[start:start + window_rows].reset_index(drop=True)
                label = name + ' t' if lag == 0 else name + ' t-' + str(lag)
                pieces.append(piece.rename(label))
        else:
            pieces.append(column.iloc[:window_rows].reset_index(drop=True))

    result = pd.concat(pieces, axis=1) if pieces else pd.DataFrame()

    county_ids = all_data['county_fips'].iloc[:window_rows].reset_index(drop=True)
    result.insert(0, 'county_fips', county_ids)

    # Day t is the last day of the history window, i.e. h - 1 days after
    # the first row of each sample.
    day_t = all_data['date'].iloc[county_count * (h - 1):].head(window_rows)
    result.insert(1, 'date of day t', day_t.reset_index(drop=True))

    # The target of the last sample is the last available day, so the
    # targets are exactly the final `window_rows` rows.
    target_values = all_data[target].tail(window_rows).reset_index(drop=True)
    result.insert(1, 'Target', target_values)

    return result


def main():
    """Script entry point; currently a no-op.

    Only binds the default settings.  The calls that would build the
    historical table and write it to CSV are left commented out.
    """
    h, r, target = 0, 14, 'confirmed'
    # result = makeHistoricalData(h, r, target)
    # Storing the result in a csv file
    # result.to_csv('dataset_h=' + str(h) + '.csv', mode='w', index=False)


if __name__ == "__main__":
    main()

In [30]:
#from makeHistoricalData import makeHistoricalData
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sys import argv
from math import floor
import os




######################################################### split data to train, val, test
def splitData(numberOfCounties, main_data, target, offset, j_offset, days_per_county=None):
    """Take the same window of days from each county's block of rows.

    Both frames are expected to hold one contiguous block of
    ``days_per_county`` rows per county, labelled with the default
    ``0..n-1`` integer index (this is how ``clean_data`` lays the data out).
    For county ``i`` the rows labelled ``i * days_per_county + j_offset``
    through ``... + offset - 1`` are selected; ``.loc`` slicing includes
    both end labels, hence the ``- 1``.

    Parameters
    ----------
    numberOfCounties : int
        Number of county blocks to read.
    main_data : pandas.DataFrame
        Feature rows.
    target : pandas.DataFrame
        Target rows, indexed like ``main_data``.
    offset : int
        Number of days to take from each county.
    j_offset : int
        Index, inside each county block, of the first day to take.
    days_per_county : int, optional
        Rows per county.  When omitted, the module-level ``numberOfDays``
        (set by ``clean_data``) is used, as before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Selected feature rows and target rows, keeping their original
        index labels.  Both are empty when ``numberOfCounties`` is not
        positive.
    """
    if days_per_county is None:
        days_per_county = numberOfDays  # global published by clean_data

    feature_parts = []
    target_parts = []
    # Exactly `numberOfCounties` blocks: the previous `+ 1` read one block
    # too many, which only went unnoticed because that slice was empty
    # whenever the data held exactly `numberOfCounties` counties.
    for county in range(numberOfCounties):
        first = county * days_per_county + j_offset
        last = first + offset - 1
        feature_parts.append(main_data.loc[first:last])
        target_parts.append(target.loc[first:last])

    if not feature_parts:
        # pd.concat rejects an empty list; keep returning empty frames.
        return pd.DataFrame(), pd.DataFrame()

    # One concat instead of DataFrame.append per county: append was
    # removed in pandas 2.0 and copied the accumulated frame every time.
    return pd.concat(feature_parts), pd.concat(target_parts)


########################################################### clean data
def clean_data(data, numberOfSelectedCounties):
    """Keep the first counties by ``county_fips`` and drop identifier columns.

    The rows are sorted by ``county_fips`` then ``date of day t``, the
    ``numberOfSelectedCounties`` smallest county codes are kept, and the
    index is reset so each county occupies one contiguous block of rows.

    Side effect: sets the module-level ``numberOfDays`` to the number of
    distinct ``date of day t`` values in the kept rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``county_fips``, ``state_fips``, ``state_name``,
        ``county_name`` and ``date of day t``.
    numberOfSelectedCounties : int
        Number of counties to keep; ``-1`` keeps all of them.  A value
        larger than the number of available counties keeps all of them.

    Returns
    -------
    pandas.DataFrame
        The kept rows without the five identifier columns listed above.

    Raises
    ------
    ValueError
        If ``numberOfSelectedCounties`` is ``0`` or less than ``-1``.
    """
    global numberOfDays

    data = data.sort_values(by=['county_fips', 'date of day t'])
    # unique() keeps order of first appearance, so after the sort the codes
    # are in ascending order.
    county_ids = data['county_fips'].unique()

    if numberOfSelectedCounties == -1:
        numberOfSelectedCounties = len(county_ids)
    elif numberOfSelectedCounties < 1:
        # Previously 0 indexed unique()[-1] and silently selected everything.
        raise ValueError(
            'numberOfSelectedCounties must be -1 or a positive integer, got %r'
            % (numberOfSelectedCounties,))
    # Previously a request for more counties than exist raised IndexError.
    numberOfSelectedCounties = min(numberOfSelectedCounties, len(county_ids))

    selected_ids = county_ids[:numberOfSelectedCounties]
    using_data = data[data['county_fips'].isin(selected_ids)].reset_index(drop=True)
    main_data = using_data.drop(
        ['county_fips', 'state_fips', 'state_name', 'county_name', 'date of day t'],
        axis=1)

    numberOfDays = len(using_data['date of day t'].unique())

    return main_data


########################################################### preprocess
def preprocess(main_data, validationFlag):
    """Split the cleaned table into train / (validation) / test sets.

    Within each county's block of ``numberOfDays`` rows, the last 14 days
    form the test set.  Of the days before that, the first 75% (rounded
    down) are training days and the rest are validation days.  Without
    validation, the training and validation days together form the
    training set.

    Reads the module-level ``numberOfDays`` set by ``clean_data``.  The
    number of counties is derived from the data as
    ``len(main_data) // numberOfDays``, which assumes every county has
    exactly ``numberOfDays`` rows (``splitData`` relies on the same layout).

    Parameters
    ----------
    main_data : pandas.DataFrame
        Output of ``clean_data``; must contain a ``Target`` column.
    validationFlag : bool or int
        Truthy to return a separate validation set.

    Returns
    -------
    tuple
        With validation: ``(X_train_train, X_train_val, X_test,
        y_train_train, y_train_val, y_test)``.
        Without: ``(X_train, X_test, y_train, y_test)``.
        Each ``X`` is a DataFrame with a fresh index; each ``y`` is a
        one-dimensional numpy array.

    Raises
    ------
    ValueError
        If there are fewer days per county than the test set needs.
    """
    target = pd.DataFrame(main_data['Target'])
    features = main_data.drop(['Target'], axis=1)

    # specify the size of train, validation and test sets
    test_offset = 14
    if numberOfDays < test_offset:
        raise ValueError(
            'need at least %d days per county for the test set, got %d'
            % (test_offset, numberOfDays))
    train_offset = floor(0.75 * (numberOfDays - test_offset))
    val_offset = numberOfDays - (train_offset + test_offset)

    # Derive the county count from the data instead of reading the global
    # `numberOfSelectedCounties`, which may hold the -1 "all counties"
    # sentinel and would then produce empty splits.
    county_count = len(features) // numberOfDays if numberOfDays else 0

    def _split(offset, j_offset):
        """Return (X with a fresh index, y as a flat array) for one window."""
        X, y = splitData(county_count, features, target, offset, j_offset)
        return X.reset_index(drop=True), np.array(y).reshape(-1)

    X_test, y_test = _split(test_offset, train_offset + val_offset)

    if validationFlag:
        # The validation and test splits were previously wrapped in
        # np.array(...), which converted the (X, y) tuple itself and broke
        # the DataFrame calls that followed.
        X_train_train, y_train_train = _split(train_offset, 0)
        X_train_val, y_train_val = _split(val_offset, train_offset)
        return X_train_train, X_train_val, X_test, y_train_train, y_train_val, y_test

    X_train, y_train = _split(train_offset + val_offset, 0)
    return X_train, X_test, y_train, y_test


In [31]:
# Pipeline settings.
# h = 1 means no history: makeHistoricalData keeps one column per covariate.
h = 1
# The target is taken r days after day t.
r = 14
# Name of the column used as the prediction target.
target = 'confirmed'
numberOfSelectedCounties = 500  # set to -1 for all the counties

# Build the historical table, then keep the selected counties and drop the
# identifier columns. Note that `data` is rebound from the raw table to the
# cleaned one.
data = makeHistoricalData(h, r, target)
data = clean_data(data, numberOfSelectedCounties)

X_train_train, X_train_val, X_test, y_train_train, y_train_val, y_test = preprocess(data, 1) # with validation

# This second call rebinds X_test and y_test from the call above.
X_train, X_test, y_train, y_test = preprocess(data, 0) # without validation