In [1]:
#timeSeriesDataColumns = sorted([c for c in rawDF.columns if c not in ["fips", "date", "score", "T2M", "T2MDEW", "T2M_MIN", "TS", "WS10M", "WS10M_MAX", "WS50M", "WS50M_MAX", "WS50M_MIN", "WS50M_RANGE"]])

# Loading the unprocessed dataset into Pandas dataframes.

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so prefer the new name whenever it is available
# and fall back to the legacy one on older installations.
posterStyle = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster'
plt.style.use(posterStyle)

from scipy.interpolate import interp1d
from sklearn.preprocessing import RobustScaler

In [3]:
# Maps each split name to the path of a file whose name contains that split name.
# When several files match the same split, the one visited last by os.walk wins.
datasetsFolder = {}
datasetRoot = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\\'
splitNames = ('train', 'valid', 'test')

for currentDir, _, fileList in os.walk(datasetRoot):
    for fileName in fileList:
        fullPath = os.path.join(currentDir, fileName)
        # The checks are independent: a single file name may match more than one split.
        for splitName in splitNames:
            if splitName in fileName:
                datasetsFolder[splitName] = fullPath

In [4]:
# Read every discovered split into a DataFrame indexed by the (fips, date) pair.
allDatasets = {}
for splitName, csvPath in datasetsFolder.items():
    allDatasets[splitName] = pd.read_csv(csvPath).set_index(['fips', 'date'])

# Pre-processing data to create feature and response matrices.

In [5]:
def interpolate_NaN_values(sourceArray, interpolationKind='linear'):
    """Replace the non-finite entries of a 1-D array by interpolating over positions.

    The finite entries are used as knots (x = position in the array, y = value);
    every position is then evaluated, extrapolating before the first and after
    the last finite entry.

    Parameters
    ----------
    sourceArray : 1-D array-like of numbers
        Values to fill; NaN and +/-inf are treated as missing.
    interpolationKind : str, default 'linear'
        Passed to ``scipy.interpolate.interp1d`` as ``kind``.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``sourceArray``.
        * no finite entry at all  -> all NaN (nothing to interpolate from)
        * exactly one finite entry -> that value repeated everywhere
        * otherwise               -> the interpolated / extrapolated values
    """
    sourceArray = np.asarray(sourceArray, dtype=float)
    sampleCount = sourceArray.shape[0]
    allIndexes = np.arange(sampleCount)
    allGoodIndexes, = np.where(np.isfinite(sourceArray))

    # interp1d needs at least two knots; handle the degenerate windows explicitly
    # instead of letting it raise.
    if allGoodIndexes.size == 0:
        return np.full(sampleCount, np.nan)
    if allGoodIndexes.size == 1:
        return np.full(sampleCount, sourceArray[allGoodIndexes[0]])

    f = interp1d(allGoodIndexes,
                 sourceArray[allGoodIndexes],
                 bounds_error=False,
                 copy=False,
                 fill_value='extrapolate',
                 kind=interpolationKind)
    return f(allIndexes)

In [6]:
def loadXYMatrices(
    df,
    randomState = 42, # seed applied to NumPy's global RNG (42 by convention); None skips seeding
    windowSize = 30, # number of consecutive days of history in each output sample
    targetSize = 1, # number of weekly drought scores returned for the weeks that follow the window
    soilDataPath = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\soil_data.csv', # CSV with the static soil indicators
    ):
    """Cut one dataset split into fixed-length windows of features and drought scores.

    For every county (first index level of the split) that has at least one
    non-null ``score``, the daily rows are walked in steps of ``windowSize``
    starting at row 1. Each step yields one sample:

    * the ``windowSize`` daily rows of the selected meteorological columns,
    * the ``score`` values of those same days with gaps filled by
      ``interpolate_NaN_values``,
    * the first ``targetSize`` non-NaN scores found in the ``targetSize * 7``
      days immediately after the window.

    Windows whose look-ahead span holds fewer than ``targetSize`` valid scores
    are skipped (previously they raised a broadcast error, or were silently
    broadcast when ``targetSize`` > 1).

    Parameters
    ----------
    df : str
        Key into the module-level ``allDatasets`` dict (e.g. "train").
    randomState : int or None
        Seeds ``np.random`` when not None. No random draws are made in this
        function at present, so it only affects the global RNG state.
    windowSize : int
        Days per sample.
    targetSize : int
        Number of future scores per sample.
    soilDataPath : str
        Location of the soil CSV; it must have a ``fips`` column.

    Returns
    -------
    list of numpy.ndarray
        ``[X_time, y_target, y_past]`` with shapes
        ``(n, windowSize, n_time_columns)``, ``(n, targetSize)`` and
        ``(n, windowSize)``, where ``n`` is the number of samples produced.

    Raises
    ------
    KeyError
        If ``df`` is not a key of ``allDatasets``.
    IndexError
        If a county has no row in the soil CSV.
    """
    rawDF = allDatasets[df]
    soilDF = pd.read_csv(soilDataPath)

    # Meteorological columns kept as time-series features. Per the original notes, the
    # excluded indicators were judged redundant from the correlation heatmap produced in
    # 'Investigate_if_Scaling_Required_or_Not_+_Feature_Selection_Required_or_Not'.
    excludedTimeColumns = ("fips", "date", "score", "T2M", "T2MDEW", "T2M_MIN", "TS", "WS10M",
                           "WS10M_MAX", "WS50M", "WS50M_MAX", "WS50M_MIN", "WS50M_RANGE")
    timeSeriesDataColumns = sorted(c for c in rawDF.columns if c not in excludedTimeColumns)
    print('\ntimeSeriesDataColumns: \n')
    for columnName in timeSeriesDataColumns:
        print(columnName, '\n')

    # Static soil indicators: every soil column except the identifier and coordinates.
    staticDataColumns = sorted(c for c in soilDF.columns if c not in ("fips", "lat", "lon"))

    scoreDF = rawDF.dropna(subset=["score"])

    # Pre-allocate for the largest possible number of windows; only the first `count`
    # rows are filled and returned.
    maxSamples = len(rawDF) // windowSize
    X_static = np.empty((maxSamples, len(staticDataColumns)))

    X_time = np.empty((maxSamples, windowSize, len(timeSeriesDataColumns)))
    print('X_time Shape: ', X_time.shape)

    y_past = np.empty((maxSamples, windowSize))
    print('y_past Shape: ', y_past.shape)

    y_target = np.empty((maxSamples, targetSize))
    print('y_target Shape: ', y_target.shape)

    if randomState is not None:
        np.random.seed(randomState)

    # Row positions of every county, gathered in a single pass over the index. The
    # previous per-county boolean mask re-scanned the whole frame for each county.
    rowPositionsByFIPS = rawDF.groupby(level=0, sort=False).indices

    targetSpan = targetSize * 7 # days searched after the window for the future scores
    count = 0

    for uniqueFIPScode in tqdm(scoreDF.index.get_level_values(0).unique()): # every county that has a score
        startingPoint = 1

        fipsDF = rawDF.iloc[rowPositionsByFIPS[uniqueFIPScode]] # this county's rows, in their original order
        X = fipsDF[timeSeriesDataColumns].values
        y = fipsDF["score"].values
        # First soil row recorded for this county.
        XStat = soilDF[soilDF["fips"] == uniqueFIPScode][staticDataColumns].values[0]

        for i in range(startingPoint, len(y) - (windowSize + targetSpan), windowSize):
            futureScores = y[i + windowSize : i + windowSize + targetSpan]
            validTargets = futureScores[~np.isnan(futureScores)][:targetSize]
            if validTargets.shape[0] < targetSize:
                # Not enough scores after this window to build a target: skip the sample.
                continue

            X_time[count, :, : len(timeSeriesDataColumns)] = X[i : i + windowSize]
            y_past[count] = interpolate_NaN_values(y[i : i + windowSize])
            y_target[count] = validTargets
            X_static[count] = XStat
            count += 1

    print(f"\n\n-----------------------------------\nLoaded {count} samples successfully.\n-----------------------------------\n\n")
    matrices = [X_time[:count], y_target[:count]]

    # To include the soil data when training a model, uncomment the following line:
    #matrices.append(X_static[:count])

    # Interpolated past drought scores covering the window (remove this line if not needed):
    matrices.append(y_past[:count])

    return matrices

# Defining function to scale features using RobustScaler.

In [7]:
# Fitted scalers, keyed by feature position on the last axis; filled by scaleFeatures(fit=True).
scalerDict = {}

def scaleFeatures(sourceArray, fit=False):
    """Scale every feature (last axis) of ``sourceArray`` with its own RobustScaler.

    Parameters
    ----------
    sourceArray : numpy.ndarray
        Array whose last axis enumerates the features, e.g. ``(samples, features)``
        or ``(samples, days, features)``. It is modified in place.
    fit : bool, default False
        When True, a new RobustScaler is fitted per feature and stored in the
        module-level ``scalerDict`` before transforming. When False, the scalers
        already stored there are reused.

    Returns
    -------
    numpy.ndarray
        The same ``sourceArray`` object, now holding the scaled values.

    Raises
    ------
    KeyError
        If ``fit`` is False and no scaler has been stored for a feature.
    """
    leadingShape = sourceArray.shape[:-1]
    for feature in tqdm(range(sourceArray.shape[-1])): # progress bar over the features
        # Index the LAST axis so the slice matches the loop range for any number of
        # dimensions; for 2-D input this is exactly sourceArray[:, feature].
        featureColumn = sourceArray[..., feature].reshape(-1, 1)
        if fit:
            scalerDict[feature] = RobustScaler().fit(featureColumn)
        sourceArray[..., feature] = scalerDict[feature].transform(featureColumn).reshape(leadingShape)
    return sourceArray

# Creating the Feature and Response matrices.

In [8]:
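# There are 3108 counties in the training dataset.

# For each county, there are 6210 daily observations, i.e. one observation per day over a 17-year span.
# NOTE(review): 2001-2017 inclusive is only 6209 days; 6210 days matches a span with five leap days (e.g. 2000-2016) - confirm the date range against the data.

# In total there are 3108 x 6210 = 19,300,680 observations in the training dataset.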

In [None]:
# Build the windowed matrices for each split. loadXYMatrices returns a list in the
# order [time-series features, target scores, interpolated past scores], which is
# the order the names are unpacked in below.
X_train_unscaled, y_target_train, y_past_train = loadXYMatrices("train")
X_valid_unscaled, y_target_valid, y_past_valid = loadXYMatrices("valid")
X_test_unscaled, y_target_test, y_past_test = loadXYMatrices("test")


timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2MWET 

T2M_MAX 

T2M_RANGE 

WS10M_MIN 

WS10M_RANGE 

X_time Shape:  (643356, 30, 8)
y_past Shape:  (643356, 30)
y_target Shape:  (643356, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]

In [None]:
# Report the shapes of the three matrices of every split; splits are separated
# by a block of blank lines.
shapeReport = (
    ('train', X_train_unscaled, y_target_train, y_past_train),
    ('valid', X_valid_unscaled, y_target_valid, y_past_valid),
    ('test', X_test_unscaled, y_target_test, y_past_test),
)

for reportPosition, (splitLabel, featureMatrix, targetMatrix, pastMatrix) in enumerate(shapeReport):
    if reportPosition > 0:
        print('\n\n\n')
    print(f'\nX_{splitLabel}_unscaled.shape: ', featureMatrix.shape)
    print(f'\ny_target_{splitLabel}.shape: ', targetMatrix.shape)
    print(f'\ny_past_{splitLabel}.shape: ', pastMatrix.shape)

In [None]:
# Collapse every training window to one row: the mean of each feature over the
# days of the window (axis 1 of the samples x days x features matrix).
# Vectorized replacement for the former per-window Python loop.
X_train = X_train_unscaled.mean(axis=1)
print(X_train.shape[0]) # number of windows processed

In [None]:
# A bare expression in the middle of a cell is not displayed, so print the shape explicitly.
print(X_train.shape)

# Preview the first row only.
count = 0
for count, previewRow in enumerate(X_train[:1], start=1):
    print(previewRow, '\n\n')

In [None]:
# Collapse every validation window to one row: the mean of each feature over the
# days of the window (axis 1 of the samples x days x features matrix).
# Vectorized replacement for the former per-window Python loop.
X_valid = X_valid_unscaled.mean(axis=1)
print(X_valid.shape[0]) # number of windows processed

In [None]:
# A bare expression in the middle of a cell is not displayed, so print the shape explicitly.
print(X_valid.shape)

# Preview the first row only.
count = 0
for count, previewRow in enumerate(X_valid[:1], start=1):
    print(previewRow, '\n\n')

In [None]:
# Collapse every test window to one row: the mean of each feature over the
# days of the window (axis 1 of the samples x days x features matrix).
# Vectorized replacement for the former per-window Python loop.
X_test = X_test_unscaled.mean(axis=1)
print(X_test.shape[0]) # number of windows processed

In [None]:
# A bare expression in the middle of a cell is not displayed, so print the shape explicitly.
print(X_test.shape)

# Preview the first row only.
count = 0
for count, previewRow in enumerate(X_test[:1], start=1):
    print(previewRow, '\n\n')

# Scaling Feature values

In [None]:
# Fit one scaler per feature on the training matrix only (fit=True), then reuse
# those fitted scalers for the validation and test matrices.
# NOTE: scaleFeatures writes the scaled values back into the array it receives,
# so re-running this cell scales the already-scaled matrices a second time
# (and refits the scalers on scaled training data).
X_train = scaleFeatures(X_train, fit=True)
X_valid = scaleFeatures(X_valid)
X_test = scaleFeatures(X_test)

In [None]:
# Preview the first 30 scaled training rows, each followed by a block of blank lines.
count = 0
for count, scaledRow in enumerate(X_train[:30], start=1):
    print(scaledRow)
    print('\n\n\n')

In [None]:
# Preview the first 10 training targets.
count = 0
for count, targetRow in enumerate(y_target_train[:10], start=1):
    print(targetRow, '\n')

# Fusing Past Drought values

## For X_train

In [None]:
# Keep only column 0 of the past-score matrix, i.e. the interpolated score of the
# first (oldest) day of each window, as an independent (samples x 1) array.
# NOTE(review): confirm the oldest day, rather than the most recent one, is the value wanted.
y_past_train_trimmed = y_past_train[:, :1].copy()

In [None]:
# Expected: (number of training samples, 1) - a single retained past-score column.
y_past_train_trimmed.shape

In [None]:
# Append the retained past drought score as one extra column to the right of the scaled features.
X_train_concatenatedWithPastDroughtValues = np.hstack((X_train, y_past_train_trimmed))

In [None]:
# Expected: one more column than X_train (the appended past drought score).
X_train_concatenatedWithPastDroughtValues.shape

## For X_valid

In [None]:
# Keep only column 0 of the past-score matrix, i.e. the interpolated score of the
# first (oldest) day of each window, as an independent (samples x 1) array.
# NOTE(review): confirm the oldest day, rather than the most recent one, is the value wanted.
y_past_valid_trimmed = y_past_valid[:, :1].copy()

In [None]:
# Expected: (number of validation samples, 1) - a single retained past-score column.
y_past_valid_trimmed.shape

In [None]:
# Append the retained past drought score as one extra column to the right of the scaled features.
X_valid_concatenatedWithPastDroughtValues = np.hstack((X_valid, y_past_valid_trimmed))

In [None]:
# Expected: one more column than X_valid (the appended past drought score).
X_valid_concatenatedWithPastDroughtValues.shape

## For X_test

In [None]:
# Keep only column 0 of the past-score matrix, i.e. the interpolated score of the
# first (oldest) day of each window, as an independent (samples x 1) array.
# NOTE(review): confirm the oldest day, rather than the most recent one, is the value wanted.
y_past_test_trimmed = y_past_test[:, :1].copy()

In [None]:
# Expected: (number of test samples, 1) - a single retained past-score column.
y_past_test_trimmed.shape

In [None]:
# Append the retained past drought score as one extra column to the right of the scaled features.
X_test_concatenatedWithPastDroughtValues = np.hstack((X_test, y_past_test_trimmed))

In [None]:
# Expected: one more column than X_test (the appended past drought score).
X_test_concatenatedWithPastDroughtValues.shape

# Saving Feature and Response matrices for Training, Validation and Testing datasets respectively to file for future ease of use.

In [None]:
# DISABLED alternative export: 30-day window + scaled features fused with the past drought value.
# It was previously kept inside a string literal, which is still evaluated and echoed as
# cell output; it is now plain comments. Uncomment the np.savetxt lines to write the
# *_concatenatedWithPastDroughtValues matrices. Paths use raw strings so the backslashes
# are taken literally.
#
# # Training Files
# np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\X_train_30_day_window_scaled_+_pastDroughtValues_+_FeatureSelection.csv', X_train_concatenatedWithPastDroughtValues, delimiter=',', newline='\n')
# np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\y_target_train_30_day_window.csv', y_target_train, delimiter=',', newline='\n')
#
# # Validation Files
# np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\X_valid_30_day_window_scaled_+_pastDroughtValues_+_FeatureSelection.csv', X_valid_concatenatedWithPastDroughtValues, delimiter=',', newline='\n')
# np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\y_target_valid_30_day_window.csv', y_target_valid, delimiter=',', newline='\n')
#
# # Testing Files
# np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\X_test_30_day_window_scaled_+_pastDroughtValues_+_FeatureSelection.csv', X_test_concatenatedWithPastDroughtValues, delimiter=',', newline='\n')
# np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\y_target_test_30_day_window.csv', y_target_test, delimiter=',', newline='\n')


In [None]:
#30-Day Window + Scaled

# Raw string: the Windows path is full of backslashes, and sequences such as '\R' or '\P'
# in a normal string literal are invalid escapes (a warning today, an error in future Pythons).
preprocessedFolder = r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files'

# (file name, matrix) pairs in the order they are written: training, validation, testing.
matricesToSave = [
    ('X_train_30_day_window_scaled_+_FeatureSelection.csv', X_train),
    ('y_target_train_30_day_window.csv', y_target_train),
    ('X_valid_30_day_window_scaled_+_FeatureSelection.csv', X_valid),
    ('y_target_valid_30_day_window.csv', y_target_valid),
    ('X_test_30_day_window_scaled_+_FeatureSelection.csv', X_test),
    ('y_target_test_30_day_window.csv', y_target_test),
]

for outputName, outputMatrix in matricesToSave:
    # Explicit backslash join keeps the resulting paths identical to the original literals.
    np.savetxt(preprocessedFolder + '\\' + outputName, outputMatrix, delimiter=',', newline='\n')
