# Loading the unprocessed dataset into Pandas dataframes.

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy.interpolate import interp1d
from sklearn.preprocessing import RobustScaler

# Maps each split name ('train' / 'valid' / 'test') to the path of the file
# whose name contains that word. If several files match the same split, the
# one visited last by os.walk wins.
datasetsFolder = {}

datasetRoot = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\\'

for dirname, _, filenames in os.walk(datasetRoot):
    for filename in filenames:
        # A single file name may match more than one split keyword; every
        # match is recorded, in the order train -> valid -> test.
        for splitName in ('train', 'valid', 'test'):
            if splitName in filename:
                datasetsFolder[splitName] = os.path.join(dirname, filename)

In [2]:
# Read every discovered split from disk and index it by (fips, date).
allDatasets = {
    splitKey: pd.read_csv(csvPath).set_index(['fips', 'date'])
    for splitKey, csvPath in datasetsFolder.items()
}

# Pre-processing data to create feature and response matrices.

In [3]:
def interpolate_NaN_values(sourceArray, interpolationKind='linear'):
    """Fill the non-finite entries of a 1-D array by interpolation.

    The finite entries are used as knots (x = their position in the array,
    y = their value); every position of the array is then evaluated on the
    resulting interpolant. Positions before the first / after the last finite
    entry are extrapolated.

    Parameters
    ----------
    sourceArray : 1-D numpy array of floats (may contain NaN / inf).
    interpolationKind : passed to ``scipy.interpolate.interp1d`` as ``kind``.

    Returns
    -------
    numpy array with the same length as ``sourceArray``.
    """
    # Tuple-unpacking enforces that the input is one-dimensional.
    (finitePositions,) = np.nonzero(np.isfinite(sourceArray))
    interpolator = interp1d(
        finitePositions,
        sourceArray[finitePositions],
        kind=interpolationKind,
        fill_value='extrapolate',
        bounds_error=False,
        copy=False,
    )
    everyPosition = np.arange(sourceArray.shape[0])
    return interpolator(everyPosition)

In [4]:
def loadXYMatrices(
    df,
    randomState = 42, # random state is kept at 42 as per convention
    windowSize = 90, # number of consecutive rows (days) of features in each output sample
    targetSize = 1, # number of drought scores returned per sample, taken from the targetSize * 7 rows that follow the window
    soilDataPath = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\soil_data.csv', # CSV holding the static per-county soil columns
    ):
    """Slice one dataset split into fixed-length feature windows and targets.

    For every county (first index level, ``fips``) that has at least one
    non-NaN ``score``, the county's rows are cut into consecutive,
    non-overlapping windows of ``windowSize`` rows. Each window becomes one
    sample; its target is the first ``targetSize`` non-NaN scores found in the
    ``targetSize * 7`` rows immediately after the window.

    Parameters
    ----------
    df : key into the module-level ``allDatasets`` dict (e.g. ``"train"``).
        Despite the name this is NOT a DataFrame.
    randomState : seed for numpy's global RNG. When not ``None`` the first
        window of each county starts at a random offset in
        ``[1, windowSize)``; when ``None`` the offset is always 1.
    windowSize : rows per sample.
    targetSize : scores per sample.
    soilDataPath : location of the soil CSV; it must contain a ``fips``
        column with a row for every county being processed.

    Returns
    -------
    list ``[X_time, y_target]`` where ``X_time`` has shape
    ``(samples, windowSize, n_time_series_columns)`` and ``y_target`` has
    shape ``(samples, targetSize)``.

    Notes
    -----
    * Seeds and consumes numpy's *global* random state.
    * Windows whose look-ahead span holds fewer than ``targetSize`` valid
      scores are skipped. Previously such a window either raised a broadcast
      error (no valid score) or silently repeated a score (too few scores).
    * The time-series features are copied as-is; NaNs in them are not
      interpolated here.
    """
    rawDF = allDatasets[df]
    soilDF = pd.read_csv(soilDataPath)

    # Time-series features: every column of the split except the identifiers
    # and the response.
    timeSeriesDataColumns = sorted(c for c in rawDF.columns if c not in ["fips", "date", "score"])
    print('\ntimeSeriesDataColumns: \n')
    for columnName in timeSeriesDataColumns:
        print(columnName, '\n')

    # Static features: every soil column except the identifier and coordinates.
    staticDataColumns = sorted(c for c in soilDF.columns if c not in ["fips", "lat", "lon"])

    scoreDF = rawDF.dropna(subset=["score"])

    # Upper bound on the number of samples; the arrays are trimmed to the
    # real count before returning. np.empty leaves the rows uninitialised.
    maxSamples = len(rawDF) // windowSize
    X_static = np.empty((maxSamples, len(staticDataColumns)))
    X_time = np.empty((maxSamples, windowSize, len(timeSeriesDataColumns)))
    print('X_time Shape: ', X_time.shape)
    y_past = np.empty((maxSamples, windowSize))
    y_target = np.empty((maxSamples, targetSize))
    print('y_target Shape: ', y_target.shape)

    if randomState is not None:
        np.random.seed(randomState)

    # One pass over the index yields the row positions of every county, so the
    # loop below no longer rebuilds a full-length boolean mask per county.
    rowPositionsByFIPS = rawDF.groupby(level=0, sort=False).indices

    count = 0
    for uniqueFIPScode in tqdm(scoreDF.index.get_level_values(0).unique()): # every county that has a score
        if randomState is not None and windowSize > 1:
            # random offset in [1, windowSize); randint's upper bound is exclusive
            startingPoint = np.random.randint(1, windowSize)
        else:
            startingPoint = 1

        fipsDF = rawDF.iloc[rowPositionsByFIPS[uniqueFIPScode]] # this county's rows, in file order
        X = fipsDF[timeSeriesDataColumns].values
        y = fipsDF["score"].values
        # First soil row matching this county; raises IndexError if there is none.
        XStat = soilDF[soilDF["fips"] == uniqueFIPScode][staticDataColumns].values[0]

        for i in range(startingPoint, len(y) - (windowSize + targetSize * 7), windowSize):
            windowEnd = i + windowSize
            futureScores = y[windowEnd : windowEnd + targetSize * 7]
            validTargets = futureScores[~np.isnan(futureScores)][:targetSize]
            if len(validTargets) < targetSize:
                # Not enough scores after this window to form a full target.
                continue

            X_time[count] = X[i:windowEnd]
            y_past[count] = interpolate_NaN_values(y[i:windowEnd])
            y_target[count] = validTargets
            X_static[count] = XStat
            count += 1

    print(f"\n\n-----------------------------------\nLoaded {count} samples successfully.\n-----------------------------------\n\n")
    matrices = [X_time[:count], y_target[:count]]
    # To also train on the soil data, uncomment the following line:
    #matrices.append(X_static[:count])
    # To also return the interpolated past drought scores of every window, uncomment the following line:
    #matrices.append(y_past[:count])
    return matrices

# Defining function to scale features using RobustScaler.

In [5]:
# One fitted RobustScaler per feature index, shared by every call to
# scaleFeatures so that validation / test data reuse the training statistics.
scalerDict = {}

def scaleFeatures(sourceArray, fit=False):
    """Scale each feature (last axis) of a 3-D array with its own RobustScaler.

    Parameters
    ----------
    sourceArray : array indexed as [sample, step, feature]. It is modified
        IN PLACE and also returned.
    fit : when True a new scaler is fitted on ``sourceArray`` for every
        feature and stored in ``scalerDict``; when False the previously
        stored scalers are used (KeyError if none was fitted for a feature).

    Returns
    -------
    The same array object, now holding the scaled values.
    """
    stepsPerSample = sourceArray.shape[-2]
    featureCount = sourceArray.shape[-1]

    for featureIdx in tqdm(range(featureCount)): # progress bar over the features
        # Flatten all samples and steps of this feature into a single column,
        # the 2-D layout the scaler expects.
        flatColumn = sourceArray[:, :, featureIdx].reshape(-1, 1)
        if fit:
            scalerDict[featureIdx] = RobustScaler().fit(flatColumn)
        scaledColumn = scalerDict[featureIdx].transform(flatColumn)
        sourceArray[:, :, featureIdx] = scaledColumn.reshape(-1, stepsPerSample)

    return sourceArray

# Creating the Feature and Response matrices.

In [6]:
# There are 3108 counties in the training dataset.

# For each county, there are 6210 observations, i.e. one observation for every day of 2000-2016 (17 years, 5 of them leap years).

# Meaning there are a total of 3108 x 6210 = 19,300,680 observations in the training dataset.

In [7]:
# Build the (features, targets) pair of every split, in the order
# train -> valid -> test.
(X_train, y_target_train), (X_valid, y_target_valid), (X_test, y_target_test) = (
    loadXYMatrices(splitName) for splitName in ("train", "valid", "test")
)

# Normalizing values: the scalers are fitted on the training split only and
# then re-used, unchanged, for the validation and test splits.
X_train = scaleFeatures(X_train, fit=True)
X_valid = scaleFeatures(X_valid, fit=False)
X_test = scaleFeatures(X_test, fit=False)


timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (214452, 90, 18)
y_target Shape:  (214452, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 211099 samples successfully.
-----------------------------------



timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (25209, 90, 18)
y_target Shape:  (25209, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 21839 samples successfully.
-----------------------------------



timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (25243, 90, 18)
y_target Shape:  (25243, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 21881 samples successfully.
-----------------------------------




  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [8]:
# Report the feature / target shapes of every split, with a blank block
# between consecutive splits.
shapeReport = [
    ('\nX_train.shape: ', X_train.shape, '\ny_target_train.shape: ', y_target_train.shape),
    ('\nX_valid.shape: ', X_valid.shape, '\ny_target_valid.shape: ', y_target_valid.shape),
    ('\nX_test.shape: ', X_test.shape, '\ny_target_test.shape: ', y_target_test.shape),
]

for position, (featureLabel, featureShape, targetLabel, targetShape) in enumerate(shapeReport):
    if position > 0:
        print('\n\n\n')
    print(featureLabel, featureShape)
    print(targetLabel, targetShape)



X_train.shape:  (211099, 90, 18)

y_target_train.shape:  (211099, 1)





X_valid.shape:  (21839, 90, 18)

y_target_valid.shape:  (21839, 1)





X_test.shape:  (21881, 90, 18)

y_target_test.shape:  (21881, 1)


In [9]:
# Collapse every training window to its per-feature mean in one vectorised
# call: (samples, steps, features) -> (samples, features).
X_train_averaged = np.mean(X_train, axis=1)

# Number of windows that were averaged.
i = X_train_averaged.shape[0]
print(i)

211099


In [10]:
X_train_averaged.shape

# Show only the first averaged training sample.
count = 0
for k in X_train_averaged[:1]:
    print(k, '\n\n')
    count += 1

(211099, 18)

[ 1.33608563  0.47864078  0.26165205  0.15968382  0.23115031  0.23127497
  0.19923766  0.11435653  0.47204554  0.14949609 -0.31955736 -0.35677992
 -0.20740741 -0.30873342 -0.16587302 -0.21788977 -0.06900188 -0.13193916] 




In [11]:
# Collapse every validation window to its per-feature mean in one vectorised
# call: (samples, steps, features) -> (samples, features).
X_valid_averaged = np.mean(X_valid, axis=1)

# Number of windows that were averaged.
i = X_valid_averaged.shape[0]
print(i)

21839


In [12]:
X_valid_averaged.shape

# Show only the first averaged validation sample.
count = 0
for k in X_valid_averaged[:1]:
    print(k, '\n\n')
    count += 1

(21839, 18)

[ 1.78353721  0.47783172  0.41049708  0.19097542  0.32567586  0.32600718
  0.18630964  0.1908454   0.20510404  0.17612749 -0.34426378 -0.40116315
 -0.20137174 -0.378738   -0.18993506 -0.2418214  -0.05984934 -0.17667934] 




In [13]:
# Collapse every test window to its per-feature mean in one vectorised call:
# (samples, steps, features) -> (samples, features).
X_test_averaged = np.mean(X_test, axis=1)

# Number of windows that were averaged.
i = X_test_averaged.shape[0]
print(i)

21881


In [14]:
X_test_averaged.shape

# Show only the first averaged test sample.
count = 0
for k in X_test_averaged[:1]:
    print(k, '\n\n')
    count += 1

(21881, 18)

[ 1.62477064  0.47918015  0.39273392  0.14125704  0.31647067  0.30780689
  0.12392478  0.15945535  0.10547703  0.12952885 -0.37913279 -0.40951944
 -0.24993141 -0.35861911 -0.23787879 -0.28408488 -0.1320904  -0.15589354] 




In [15]:
# Print every row of the first (scaled) training window, one row per line.
count = 0
for k in X_train[:1]:
    #print(k)
    for j in k:
        print(j)
    count += 1
    print('\n\n\n')

[ 0.08256881  0.73786408 -0.24868421 -0.24135945 -0.22749392 -0.22790981
 -0.11492281 -0.34027365  0.85159011 -0.23639456 -0.34146341 -0.50137741
  0.06790123 -0.70781893 -0.0487013  -0.04244032 -0.03389831  0.07984791]
[-0.08256881  0.68446602  0.         -0.04435484  0.03710462  0.03717246
  0.0485992  -0.09577632  0.62897527 -0.04365079 -0.04471545 -0.12396694
  0.32098765 -0.3127572   0.23051948 -0.15384615  0.52881356 -0.71102662]
[-0.08256881  0.65291262  0.21710526  0.06509217  0.24391727  0.24375381
  0.16923957  0.02201071  0.65547703  0.06122449 -0.28861789 -0.30853994
  0.01851852 -0.38683128 -0.00649351 -0.21750663  0.12881356 -0.35361217]
[-0.08256881  0.59466019  0.26315789  0.11059908  0.28223844  0.28214503
  0.20468839  0.03926234  0.71024735  0.1031746  -0.17073171 -0.11570248
  0.04938272 -0.11934156  0.17857143 -0.12732095  0.25084746 -0.36121673]
[ 0.90825688  0.5631068   0.36973684  0.14400922  0.35583942  0.35588056
  0.18582047  0.11659726  0.42402827  0.138322 

In [16]:
# Print the first ten training targets.
count = 0
for k in y_target_train[:10]:
    print(k, '\n\n')
    count += 1

[2.039] 


[5.] 


[4.] 


[2.] 


[0.] 


[0.] 


[0.9784] 


[0.] 


[2.] 


[1.4361] 




# Saving Feature and Response matrices for Training, Validation and Testing datasets respectively to file for future ease of use.

In [17]:
# 90-day window + scaled. The feature files hold the per-window feature
# averages (the *_averaged arrays), not the full windows.

# Raw string: the backslashes of the Windows path are kept literally instead
# of relying on Python passing unknown escape sequences such as '\R' through.
outputFolder = r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files'

# File name -> matrix, written in the order training, validation, testing.
matricesToSave = {
    # Training Files
    'X_train_90_day_window_scaled.csv': X_train_averaged,
    'y_target_train_90_day_window_scaled.csv': y_target_train,
    # Validation Files
    'X_valid_90_day_window_scaled.csv': X_valid_averaged,
    'y_target_valid_90_day_window_scaled.csv': y_target_valid,
    # Testing Files
    'X_test_90_day_window_scaled.csv': X_test_averaged,
    'y_target_test_90_day_window_scaled.csv': y_target_test,
}

for fileName, matrix in matricesToSave.items():
    np.savetxt(outputFolder + '\\' + fileName, matrix, delimiter=',', newline='\n')
