# Loading the unprocessed dataset into Pandas dataframes.

In [1]:
# Loading the unprocessed dataset into Pandas dataframes.
import os

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from sklearn.preprocessing import RobustScaler
from tqdm.auto import tqdm

# Map each split name to the path of a file whose name contains that split
# name; when several files match the same split, the last one visited wins.
datasetsFolder = {}
splitNames = ('train', 'valid', 'test')
rawDataRoot = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\\'

for folderPath, _subfolders, fileNames in os.walk(rawDataRoot):
    for fileName in fileNames:
        # A single file name may match more than one split; every match is recorded.
        for splitName in splitNames:
            if splitName in fileName:
                datasetsFolder[splitName] = os.path.join(folderPath, fileName)

In [2]:
# Read every discovered split file as CSV and index its rows by (fips, date).
allDatasets = {
    splitName: pd.read_csv(csvPath).set_index(['fips', 'date'])
    for splitName, csvPath in datasetsFolder.items()
}

# Pre-processing data to create feature and response matrices.

In [3]:
def interpolate_NaN_values(sourceArray, interpolationKind='linear'):
    """Fill the non-finite entries of a 1-D array by interpolating over index position.

    An interpolator is built from the finite entries only (index -> value) and
    then evaluated at every index of the array, extrapolating past the first
    and last finite entries.

    Parameters
    ----------
    sourceArray : numpy.ndarray
        One-dimensional array that may contain NaN/inf entries.
    interpolationKind : str
        Passed through as the ``kind`` argument of ``scipy.interpolate.interp1d``.

    Returns
    -------
    numpy.ndarray
        The interpolator evaluated at indices ``0 .. len(sourceArray) - 1``.
    """
    # The one-element unpacking only succeeds for 1-D input.
    (finitePositions,) = np.where(np.isfinite(sourceArray))
    interpolator = interp1d(
        finitePositions,
        sourceArray[finitePositions],
        kind=interpolationKind,
        fill_value='extrapolate',
        bounds_error=False,
        copy=False,
    )
    everyPosition = np.arange(sourceArray.shape[0])
    return interpolator(everyPosition)

In [4]:
def loadXYMatrices(
    df,
    randomState = 42, # seed for NumPy's global RNG; None disables seeding and the random per-county start offset
    windowSize = 30, # number of consecutive rows (one row per day, per this notebook's notes) in each input window
    targetSize = 1, # number of score values returned per sample, searched for in the targetSize * 7 rows right after the window
    ):
        """Build windowed feature and target arrays for one dataset split.

        For every FIPS code that has at least one non-NaN ``score``, the
        county's rows are cut into consecutive, non-overlapping windows of
        ``windowSize`` rows. Each window becomes one sample whose target is the
        first ``targetSize`` non-NaN scores found in the ``targetSize * 7`` rows
        that follow the window.

        Parameters
        ----------
        df
            Key into the module-level ``allDatasets`` dict (this notebook calls
            it with "train", "valid" and "test").
        randomState
            If not None, ``np.random.seed(randomState)`` is called and each
            county's first window starts at a random row in ``[1, windowSize)``.
            If None, every county starts at row 1.
        windowSize
            Number of rows per window.
        targetSize
            Number of score values per sample.

        Returns
        -------
        list
            ``[X_time[:count], y_target[:count]]`` where ``X_time`` has shape
            (count, windowSize, number of time-series columns) and ``y_target``
            has shape (count, targetSize). ``X_static`` and ``y_past`` are also
            filled but only returned if the commented-out ``append`` lines at
            the end are enabled.

        Side effects: reads ``soil_data.csv`` from a hard-coded path on every
        call, prints the time-series column names, array shapes and the sample
        count, and reseeds NumPy's global RNG when ``randomState`` is not None.
        """
        rawDF = allDatasets[df]

        soilDF = pd.read_csv(r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\soil_data.csv')

        timeSeriesDataColumns = sorted([c for c in rawDF.columns if c not in ["fips", "date", "score"]])
        # timeSeriesDataColumns: every column of the split except the score (18 meteorological indicators in the recorded runs)
        print('\ntimeSeriesDataColumns: \n')
        for i_1 in timeSeriesDataColumns:
            print(i_1, '\n')

        # staticDataColumns: every soil_data.csv column except fips, lat and lon
        staticDataColumns = sorted([c for c in soilDF.columns if c not in ["fips", "lat", "lon"]])
        '''print('\nstaticDataColumns: ')
        for i_2 in staticDataColumns:
            print(i_2, '\n')'''

        count = 0
        # scoreDF is only used below to enumerate the FIPS codes that have at least one non-NaN score
        scoreDF = rawDF.dropna(subset=["score"])

        # The arrays below are preallocated (uninitialized) with len(rawDF) // windowSize rows,
        # an upper bound on the number of windows; only the first `count` rows are filled and returned.
        X_static = np.empty((len(rawDF) // windowSize, len(staticDataColumns)))
        # shape: (len(rawDF) // windowSize, len(staticDataColumns))
        '''print('X_static Shape: ', X_static.shape)'''

        X_time = np.empty((len(rawDF) // windowSize, windowSize, len(timeSeriesDataColumns))) 
        # shape: (len(rawDF) // windowSize, windowSize, len(timeSeriesDataColumns))
        print('X_time Shape: ', X_time.shape)

        y_past = np.empty((len(rawDF) // windowSize, windowSize))
        # shape: (len(rawDF) // windowSize, windowSize)
        '''print('y_past Shape: ', y_past.shape)'''

        y_target = np.empty((len(rawDF) // windowSize, targetSize))
        # shape: (len(rawDF) // windowSize, targetSize)
        print('y_target Shape: ', y_target.shape)

        if randomState is not None:
            np.random.seed(randomState)
            
        for uniqueFIPScode in tqdm(scoreDF.index.get_level_values(0).unique()): # one iteration per FIPS code that has a score
            
            if randomState is not None:
                startingPoint = np.random.randint(1, windowSize) # random start row in [1, windowSize) (upper bound excluded)
            else:
                startingPoint = 1
            
            fipsDF = rawDF[(rawDF.index.get_level_values(0) == uniqueFIPScode)] # all rows of the current county (boolean mask over the whole split)
            X = fipsDF[timeSeriesDataColumns].values # time-series columns of this county as an ndarray
            y = fipsDF["score"].values # score column of this county as an ndarray (may contain NaN)
            XStat = soilDF[soilDF["fips"] == uniqueFIPScode][staticDataColumns].values[0] # first matching soil row; IndexError if the county is absent from soil_data.csv
            
            # Step through the county in non-overlapping windows, leaving room for the target rows after the last window.
            for i in range(startingPoint, len(y) - (windowSize + targetSize * 7), windowSize):
                X_time[count, :, : len(timeSeriesDataColumns)] = X[i : i + windowSize]
                # scores inside the window, with NaN entries filled by interpolation
                y_past[count] = interpolate_NaN_values(y[i : i + windowSize])
                # candidate target rows: the targetSize * 7 rows immediately after the window
                tempY = y[i + windowSize : i + windowSize + targetSize * 7]
                # keep the first targetSize non-NaN scores
                # NOTE(review): this assignment relies on at least targetSize non-NaN scores being present in tempY - confirm for all splits
                y_target[count] = np.array(tempY[~np.isnan(tempY)][:targetSize])
                X_static[count] = XStat
                count += 1
        
        print(f"\n\n-----------------------------------\nLoaded {count} samples successfully.\n-----------------------------------\n\n")
        matrices = [X_time[:count], y_target[:count]]
        # To include the soil data in model training, uncomment the following line so the soil data array is returned as well:
        #matrices.append(X_static[:count])
        # To also return the interpolated past drought scores of every window, uncomment the following line:
        #matrices.append(y_past[:count])
        return matrices

# Defining function to scale features using RobustScaler.

In [5]:
# Defining function to scale features using RobustScaler.
# feature index -> RobustScaler fitted by the most recent scaleFeatures(..., fit=True) call
scalerDict = {}

def scaleFeatures(sourceArray, fit=False):
    """Scale every feature of a 3-D array in place with a per-feature RobustScaler.

    Parameters
    ----------
    sourceArray : numpy.ndarray
        Array indexed as ``[sample, step, feature]``. It is modified in place.
    fit : bool
        When True, a new ``RobustScaler`` is fitted for each feature on all of
        that feature's values (flattened to one column) and stored in the
        module-level ``scalerDict``. When False, the previously stored scalers
        are reused.

    Returns
    -------
    numpy.ndarray
        The same ``sourceArray`` object, after scaling.

    Raises
    ------
    KeyError
        If ``fit`` is False and no scaler has been fitted for a feature index.
    """
    windowLength = sourceArray.shape[-2]
    for index in tqdm(range(sourceArray.shape[-1])): # progress bar over the feature columns
        # All values of this feature as a single column, the layout the scaler expects.
        flattenedFeature = sourceArray[:, :, index].reshape(-1, 1)
        if fit:
            scalerDict[index] = RobustScaler().fit(flattenedFeature)
        elif index not in scalerDict:
            raise KeyError(f"no fitted scaler for feature index {index}; call scaleFeatures(..., fit=True) on the training data first")
        sourceArray[:, :, index] = scalerDict[index].transform(flattenedFeature).reshape(-1, windowLength)
    return sourceArray

# Creating the Feature and Response matrices.

In [6]:
# There are 3108 counties in the training dataset.

# For each county, there are 6210 observations i.e. there is one observation for every day of 2000-2016 (17 years, including 5 leap days).

# Meaning there are a total of 3108 x 6210 = 19,300,680 observations in the training dataset.

In [7]:
# Creating the Feature and Response matrices.
X_train, y_target_train = loadXYMatrices("train")
X_valid, y_target_valid = loadXYMatrices("valid")
X_test, y_target_test = loadXYMatrices("test")

# Normalizing values: the scalers are fitted on the training split only and then
# reused unchanged for the validation and test splits.
X_train = scaleFeatures(X_train, fit=True)
X_valid = scaleFeatures(X_valid)
X_test = scaleFeatures(X_test)


timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (643356, 30, 18)
y_target Shape:  (643356, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 639476 samples successfully.
-----------------------------------



timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (75628, 30, 18)
y_target Shape:  (75628, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 71721 samples successfully.
-----------------------------------



timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (75731, 30, 18)
y_target Shape:  (75731, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 71841 samples successfully.
-----------------------------------




  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [8]:
# Report the feature/target array shapes of each split, with a blank spacer
# printed between consecutive splits.
shapeReport = (
    (('\nX_train.shape: ', X_train), ('\ny_target_train.shape: ', y_target_train)),
    (('\nX_valid.shape: ', X_valid), ('\ny_target_valid.shape: ', y_target_valid)),
    (('\nX_test.shape: ', X_test), ('\ny_target_test.shape: ', y_target_test)),
)

for position, splitEntries in enumerate(shapeReport):
    if position > 0:
        print('\n\n\n')
    for shapeLabel, matrix in splitEntries:
        print(shapeLabel, matrix.shape)



X_train.shape:  (639476, 30, 18)

y_target_train.shape:  (639476, 1)





X_valid.shape:  (71721, 30, 18)

y_target_valid.shape:  (71721, 1)





X_test.shape:  (71841, 30, 18)

y_target_test.shape:  (71841, 1)


In [19]:
# Collapse every window to the per-feature mean over its rows, turning
# (samples, windowSize, features) into (samples, features). A single vectorized
# reduction replaces the Python loop over individual windows.
X_train_averaged = X_train.mean(axis=1)
print(X_train_averaged.shape[0])

639476


In [20]:
X_train_averaged.shape

# Print only the first averaged training sample as a quick sanity check.
for averagedRow in X_train_averaged[:1]:
    print(averagedRow, '\n\n')

(639476, 18)

[ 1.39800307  0.60154096 -0.26569793 -0.48020713 -0.36121581 -0.35925249
 -0.4826484  -0.47684524  0.12108363 -0.48367308 -0.26233062 -0.30578512
 -0.09485597 -0.30713306 -0.11753247 -0.17701149  0.0299435  -0.18042929] 




In [21]:
# Collapse every window to the per-feature mean over its rows, turning
# (samples, windowSize, features) into (samples, features). A single vectorized
# reduction replaces the Python loop over individual windows.
X_valid_averaged = X_valid.mean(axis=1)
print(X_valid_averaged.shape[0])

71721


In [22]:
X_valid_averaged.shape

# Print only the first averaged validation sample as a quick sanity check.
for averagedRow in X_valid_averaged[:1]:
    print(averagedRow, '\n\n')

(71721, 18)

[ 1.91843318  0.52879157  0.10510788 -0.17437668  0.05560284  0.05575868
 -0.18786149 -0.14210317  0.0401649  -0.19205361 -0.33211382 -0.32488522
 -0.13106996 -0.31179698 -0.16168831 -0.17214854 -0.01446328 -0.12348485] 




In [23]:
# Collapse every window to the per-feature mean over its rows, turning
# (samples, windowSize, features) into (samples, features). A single vectorized
# reduction replaces the Python loop over individual windows.
X_test_averaged = X_test.mean(axis=1)
print(X_test_averaged.shape[0])

71841


In [24]:
X_test_averaged.shape

# Print only the first averaged test sample as a quick sanity check.
for averagedRow in X_test_averaged[:1]:
    print(averagedRow, '\n\n')

(71841, 18)

[ 1.48310292  0.562206   -0.14161162 -0.3888761  -0.17649443 -0.18740605
 -0.40340563 -0.37599206  0.06690224 -0.40079275 -0.32723577 -0.37098255
 -0.15041152 -0.36762689 -0.20227273 -0.24137931 -0.02960452 -0.20530303] 




In [25]:
# Print every row of the first (scaled) training window, then a blank spacer.
for firstWindow in X_train[:1]:
    for dayFeatures in firstWindow:
        print(dayFeatures)
    print('\n\n\n')

[ 0.38709677  0.60097324 -0.15323646 -0.31127733 -0.14042553 -0.14076782
 -0.26883562 -0.31964286  0.31625442 -0.30464326 -0.44308943 -0.48209366
 -0.09876543 -0.57201646 -0.30519481 -0.41644562 -0.25762712 -0.20075758]
[ 7.37788018  0.38442822  0.56935271  0.09378596  0.47477204  0.47531993
 -0.05079909  0.13035714 -0.34275618  0.08607022 -0.04471545 -0.18732782
  0.36419753 -0.43621399  0.27922078  0.03713528  0.73898305 -0.66666667]
[ 9.70046083  0.36009732  0.11889036 -0.02819333  0.11854103  0.11882998
 -0.16210046 -0.02261905 -0.23498233 -0.04020385 -0.01626016  0.25895317
  0.03703704  0.44855967  0.3961039   0.72413793  0.46779661  0.61742424]
[ 0.52995392  0.51094891 -0.0317041  -0.15074799  0.01215805  0.01157831
 -0.09189498 -0.1422619   0.33745583 -0.20951302 -0.56097561 -0.60055096
 -0.37654321 -0.55967078 -0.50649351 -0.24403183 -0.35932203  0.15909091]
[-0.00460829  0.55717762  0.2007926  -0.06214039  0.22613982  0.22608166
  0.01426941 -0.1         0.54063604 -0.0690826

In [26]:
# Print the first ten training targets, one per line with spacing in between.
for targetScore in y_target_train[:10]:
    print(targetScore, '\n\n')

[1.] 


[1.4905] 


[0.929] 


[1.9995] 


[2.0391] 


[3.9795] 


[5.] 


[5.] 


[4.7206] 


[4.] 




# Saving Feature and Response matrices for Training, Validation and Testing datasets respectively to file for future ease of use.

In [27]:
#30-Day Window + Scaled

# Saving Feature and Response matrices for Training, Validation and Testing datasets respectively to file for future ease of use.
# The Windows paths are raw strings so that no backslash is ever parsed as an
# escape sequence; the resulting file names are unchanged.

# Training Files

# The training feature matrix was not being written, although the validation and
# test ones were; it is saved here with the same naming scheme.
np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\X_train_30_day_window_scaled.csv', X_train_averaged, delimiter=',', newline='\n')

np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\y_target_train_30_day_window_scaled.csv', y_target_train, delimiter=',', newline='\n')



# Validation Files

np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\X_valid_30_day_window_scaled.csv', X_valid_averaged, delimiter=',', newline='\n')

np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\y_target_valid_30_day_window_scaled.csv', y_target_valid, delimiter=',', newline='\n')



# Testing Files

np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\X_test_30_day_window_scaled.csv', X_test_averaged, delimiter=',', newline='\n')

np.savetxt(r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files\y_target_test_30_day_window_scaled.csv', y_target_test, delimiter=',', newline='\n')
