# Loading the unprocessed dataset into Pandas dataframes.

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('seaborn-poster')

from scipy.interpolate import interp1d
from sklearn.preprocessing import RobustScaler

In [2]:
# Maps each split name ('train', 'valid', 'test') to the path of a file whose
# name contains that word. If several files match a split, the last one
# visited by os.walk wins.
datasetsFolder = {}

datasetRoot = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\\'
for dirname, _, filenames in os.walk(datasetRoot):
    for filename in filenames:
        # A filename is tested against every split name independently.
        for splitName in ('train', 'valid', 'test'):
            if splitName in filename:
                datasetsFolder[splitName] = os.path.join(dirname, filename)

In [3]:
# Read every discovered split into a DataFrame indexed by (fips, date).
allDatasets = {}
for splitName, csvPath in datasetsFolder.items():
    allDatasets[splitName] = pd.read_csv(csvPath).set_index(['fips', 'date'])

# Pre-processing data to create feature and response matrices.

In [4]:
def interpolate_NaN_values(sourceArray, interpolationKind='linear'):
    """Fill the non-finite entries of a 1-D array by interpolation.

    The finite entries are used as knots; every position (including leading
    and trailing gaps) is then evaluated, extrapolating beyond the outermost
    finite values.

    Parameters
    ----------
    sourceArray : 1-D numpy array of floats, possibly containing NaN/inf.
    interpolationKind : `kind` argument forwarded to scipy's interp1d.

    Returns
    -------
    A new float array of the same length with no gaps.

    Raises
    ------
    ValueError
        If sourceArray contains no finite value at all, since there is
        nothing to interpolate from.
    """
    allIndexes = np.arange(sourceArray.shape[0])
    allGoodIndexes, = np.where(np.isfinite(sourceArray))

    if allGoodIndexes.size == 0:
        raise ValueError("interpolate_NaN_values needs at least one finite value in sourceArray")

    if allGoodIndexes.size == 1:
        # interp1d cannot be built from a single knot; the only sensible
        # fill is that one known value everywhere.
        return np.full(sourceArray.shape[0], sourceArray[allGoodIndexes[0]], dtype=float)

    f = interp1d(allGoodIndexes,
                 sourceArray[allGoodIndexes],
                 bounds_error=False,
                 copy=False,
                 fill_value='extrapolate',
                 kind=interpolationKind)
    return f(allIndexes)

In [5]:
def loadXYMatrices(
    df,
    randomState = 42, # random state is kept at 42 as per convention
    windowSize = 30, # number of consecutive rows (days) in each input sample
    targetSize = 1, # number of drought scores returned per sample, taken from the targetSize * 7 rows that follow the window
    soilDataPath = r'R:\Ryerson\Misc\Datasets\Predict Droughts using Weather & Soil Data\soil_data.csv', # CSV holding the static soil data
    ):
    """Slice one dataset split into fixed-length windows and their targets.

    For every county (first index level, "fips") that has at least one
    non-missing score, the county's rows are cut into non-overlapping
    windows of `windowSize` rows, starting at row 1 of that county. For
    each window the function stores:

    * the time-series feature values of the window,
    * the window's own scores with gaps filled by interpolate_NaN_values,
    * the first `targetSize` non-missing scores found in the
      `targetSize * 7` rows immediately after the window,
    * the county's static soil values.

    Windows whose look-ahead span holds fewer than `targetSize` non-missing
    scores are skipped, because they cannot supply a complete target.

    Parameters
    ----------
    df : key into the module-level `allDatasets` dict (e.g. "train").
    randomState : if not None, seeds numpy's global random generator. No
        random draw happens inside this function.
    windowSize : rows per window.
    targetSize : number of target scores per window.
    soilDataPath : path of the soil data CSV; must contain a "fips" column.

    Returns
    -------
    list of numpy arrays `[X_time, y_target, y_past]` with shapes
    (samples, windowSize, features), (samples, targetSize) and
    (samples, windowSize).

    Raises
    ------
    KeyError if `df` is not in `allDatasets`; IndexError if a county has no
    row in the soil data.
    """
    rawDF = allDatasets[df]

    soilDF = pd.read_csv(soilDataPath)

    # Every column that is neither an identifier nor the target is a
    # time-series feature.
    timeSeriesDataColumns = sorted([c for c in rawDF.columns if c not in ["fips", "date", "score"]])
    print('\ntimeSeriesDataColumns: \n')
    for columnName in timeSeriesDataColumns:
        print(columnName, '\n')

    # Static (per-county) soil features: everything except the identifiers.
    staticDataColumns = sorted([c for c in soilDF.columns if c not in ["fips", "lat", "lon"]])

    # Upper bound on the number of windows; the arrays are trimmed to the
    # number actually filled before returning.
    maxSamples = len(rawDF) // windowSize

    X_static = np.empty((maxSamples, len(staticDataColumns)))

    X_time = np.empty((maxSamples, windowSize, len(timeSeriesDataColumns)))
    print('X_time Shape: ', X_time.shape)

    y_past = np.empty((maxSamples, windowSize))
    print('y_past Shape: ', y_past.shape)

    y_target = np.empty((maxSamples, targetSize))
    print('y_target Shape: ', y_target.shape)

    if randomState is not None:
        np.random.seed(randomState)

    # Counties with at least one recorded score, in order of first appearance.
    fipsLevel = rawDF.index.get_level_values(0)
    scoredFIPScodes = fipsLevel[rawDF["score"].notna().values].unique()

    # Group the rows once so each county is fetched by position instead of
    # re-scanning the whole frame with a boolean mask for every county.
    rowsByFIPS = rawDF.groupby(level=0, sort=False)

    count = 0
    for uniqueFIPScode in tqdm(scoredFIPScodes): #for every unique FIPS county code

        startingPoint = 1 # row 0 of each county is never part of a window

        fipsDF = rowsByFIPS.get_group(uniqueFIPScode) #all rows of the current county, in their original order
        X = fipsDF[timeSeriesDataColumns].values #time-series feature values of the county
        y = fipsDF["score"].values #scores of the county as ndarray (NaN where no score was recorded)
        XStat = soilDF[soilDF["fips"] == uniqueFIPScode][staticDataColumns].values[0] #static soil values of the county

        for i in range(startingPoint, len(y) - (windowSize + targetSize * 7), windowSize):
            tempY = y[i + windowSize : i + windowSize + targetSize * 7]
            validTargets = tempY[~np.isnan(tempY)][:targetSize]
            if len(validTargets) < targetSize:
                # Not enough recorded scores after this window. Assigning a
                # shorter array would either fail or be silently broadcast.
                continue

            X_time[count, :, : len(timeSeriesDataColumns)] = X[i : i + windowSize]
            y_past[count] = interpolate_NaN_values(y[i : i + windowSize])
            y_target[count] = validTargets
            X_static[count] = XStat
            count += 1

    print(f"\n\n-----------------------------------\nLoaded {count} samples successfully.\n-----------------------------------\n\n")
    matrices = [X_time[:count], y_target[:count]]

    #To include the soil data in model training, uncomment the following line of code:
    #matrices.append(X_static[:count])

    #The interpolated past drought values covering the window are returned as the third matrix:
    matrices.append(y_past[:count])

    return matrices

# Defining function to scale features using RobustScaler.

In [6]:
# One fitted RobustScaler per feature position, filled by scaleFeatures(fit=True).
scalerDict = {}

def scaleFeatures(sourceArray, fit=False):
    """Scale every feature (last axis) of sourceArray with its own RobustScaler.

    Parameters
    ----------
    sourceArray : numpy array whose last axis enumerates the features. It is
        modified in place.
    fit : when True, a new scaler is fitted on each feature and stored in the
        module-level `scalerDict` before transforming. When False, the
        scalers already stored there are reused.

    Returns
    -------
    The same array object, now holding the scaled values.

    Raises
    ------
    KeyError if fit is False and no scaler has been stored for a feature.
    """
    for feature in tqdm(range(sourceArray.shape[-1])): #printing a progress bar over the features
        # Index along the last axis so that it matches the axis the feature
        # count was taken from; for 2-D input this is the same as [:, feature].
        featureValues = sourceArray[..., feature]
        featureColumn = featureValues.reshape(-1, 1)
        if fit:
            scalerDict[feature] = RobustScaler().fit(featureColumn)
        sourceArray[..., feature] = scalerDict[feature].transform(featureColumn).reshape(featureValues.shape)
    return sourceArray

# Creating the Feature and Response matrices.

In [7]:
# There are 3108 counties in the training dataset.

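# For each county, there are 6210 observations, i.e. one observation per day over 17 years. (Note: 6210 days matches a 17-year span containing five leap years, such as 2000-2016; 2001-2017 would give 6209 days, so the exact date range should be verified against the data.)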

# Meaning there are a total of 3108 x 6210 = 19,300,680 observations in the training dataset.

In [8]:
# Build the windowed matrices for each split with the default 30-row window and
# a single target score. loadXYMatrices returns [X_time, y_target, y_past] in
# that order, which is what the three names on each line unpack.
X_train_unscaled, y_target_train, y_past_train = loadXYMatrices("train")
X_valid_unscaled, y_target_valid, y_past_valid = loadXYMatrices("valid")
X_test_unscaled, y_target_test, y_past_test = loadXYMatrices("test")


timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (643356, 30, 18)
y_past Shape:  (643356, 30)
y_target Shape:  (643356, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 640248 samples successfully.
-----------------------------------



timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (75628, 30, 18)
y_past Shape:  (75628, 30)
y_target Shape:  (75628, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 74592 samples successfully.
-----------------------------------



timeSeriesDataColumns: 

PRECTOT 

PS 

QV2M 

T2M 

T2MDEW 

T2MWET 

T2M_MAX 

T2M_MIN 

T2M_RANGE 

TS 

WS10M 

WS10M_MAX 

WS10M_MIN 

WS10M_RANGE 

WS50M 

WS50M_MAX 

WS50M_MIN 

WS50M_RANGE 

X_time Shape:  (75731, 30, 18)
y_past Shape:  (75731, 30)
y_target Shape:  (75731, 1)


  0%|          | 0/3108 [00:00<?, ?it/s]



-----------------------------------
Loaded 74592 samples successfully.
-----------------------------------




In [10]:
# Report the shape of every matrix, one group per split, with a blank
# separator printed between consecutive groups.
shapeReport = (
    (
        ('\nX_train_unscaled.shape: ', X_train_unscaled),
        ('\ny_target_train.shape: ', y_target_train),
        ('\ny_past_train.shape: ', y_past_train),
    ),
    (
        ('\nX_valid_unscaled.shape: ', X_valid_unscaled),
        ('\ny_target_valid.shape: ', y_target_valid),
        ('\ny_past_valid.shape: ', y_past_valid),
    ),
    (
        ('\nX_test_unscaled.shape: ', X_test_unscaled),
        ('\ny_target_test.shape: ', y_target_test),
        ('\ny_past_test.shape: ', y_past_test),
    ),
)

for groupNumber, splitGroup in enumerate(shapeReport):
    if groupNumber > 0:
        print('\n\n\n')
    for label, matrix in splitGroup:
        print(label, matrix.shape)


X_train_unscaled.shape:  (640248, 30, 18)

y_target_train.shape:  (640248, 1)

y_past_train.shape:  (640248, 30)





X_valid_unscaled.shape:  (74592, 30, 18)

y_target_valid.shape:  (74592, 1)

y_past_valid.shape:  (74592, 30)





X_test_unscaled.shape:  (74592, 30, 18)

y_target_test.shape:  (74592, 1)

y_past_test.shape:  (74592, 30)


In [11]:
# Collapse every window to the per-feature mean over its rows (axis 1) in one
# vectorized call instead of a Python loop over each window.
X_train = np.mean(X_train_unscaled, axis=1)
i = X_train.shape[0]  # number of windows processed, kept under the original name
print(i)

640248


In [12]:
X_train.shape

# Spot check: print only the first aggregated training row.
count = 0
for count, k in enumerate(X_train[:1], start=1):
    print(k, '\n\n')

(640248, 18)

[  3.87433333 100.728        5.518        7.20866667   3.30933333
   3.35966667  13.22333333   1.674       11.54933333   7.04033333
   2.57666667   3.658        1.55733333   2.10066667   4.931
   6.79066667   3.02666667   3.76333333] 




In [13]:
# Collapse every window to the per-feature mean over its rows (axis 1) in one
# vectorized call instead of a Python loop over each window.
X_valid = np.mean(X_valid_unscaled, axis=1)
i = X_valid.shape[0]  # number of windows processed, kept under the original name
print(i)

74592


In [14]:
X_valid.shape

# Spot check: print only the first aggregated validation row.
count = 0
for count, k in enumerate(X_valid[:1], start=1):
    print(k, '\n\n')

(74592, 18)

[  7.23266667 100.37433333   7.52633333  10.39333333   7.88666667
   7.901       16.18866667   5.25066667  10.93933333  10.03666667
   2.459        3.66233333   1.52666667   2.13433333   4.815
   6.813        2.87466667   3.939     ] 




In [15]:
# Collapse every window to the per-feature mean over its rows (axis 1) in one
# vectorized call instead of a Python loop over each window.
X_test = np.mean(X_test_unscaled, axis=1)
i = X_test.shape[0]  # number of windows processed, kept under the original name
print(i)

74592


In [16]:
X_test.shape

# Spot check: print only the first aggregated test row.
count = 0
for count, k in enumerate(X_test[:1], start=1):
    print(k, '\n\n')

(74592, 18)

[  4.68166667 100.58033333   5.438        6.848        4.14733333
   3.96933333  12.769        1.37733333  11.39233333   6.516
   2.37033333   3.42366667   1.386        2.03733333   4.45066667
   6.45333333   2.67933333   3.77433333] 




# Scaling Feature values

In [17]:
# The scalers are fitted on the training matrix only (fit=True) and then reused
# unchanged for the validation and test matrices, so the training call must
# run first. scaleFeatures modifies its argument in place, so re-running this
# cell without rebuilding the matrices would scale already-scaled values.
X_train = scaleFeatures(X_train, fit=True)
X_valid = scaleFeatures(X_valid)
X_test = scaleFeatures(X_test)

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [18]:
# Print the first 30 scaled training rows, each followed by blank lines.
count = 0
for count, k in enumerate(X_train[:30], start=1):
    print(k)
    print('\n\n\n')

[ 0.6430508   0.59958472 -0.22079382 -0.39863242 -0.26808073 -0.26667869
 -0.413065   -0.39376374  0.0503905  -0.40338093 -0.38979257 -0.42962663
 -0.27108612 -0.47108434 -0.20118177 -0.25959251 -0.03774864 -0.38050089]




[-0.36836473  0.61013289 -0.06471796 -0.17432537 -0.05455536 -0.05458481
 -0.09967469 -0.24031639  0.93848294 -0.19138032 -0.46992986 -0.51543054
 -0.34980764 -0.55813253 -0.29783343 -0.32081911 -0.25293852 -0.31520572]




[ 7.33661530e-01  4.74418605e-01  1.57884446e-01  5.15894013e-02
  1.87281276e-01  1.87292855e-01  1.02301429e-01  1.70331454e-04
  7.99293891e-01  3.30323300e-02 -5.06193106e-01 -5.46190428e-01
 -4.47173720e-01 -5.55271084e-01 -3.76617895e-01 -3.75530044e-01
 -2.91817360e-01 -3.79427549e-01]




[ 0.51919235  0.46171096  0.22558412  0.12932964  0.25104903  0.25114423
  0.13913641  0.08335596  0.57665561  0.11541393 -0.43321892 -0.43944147
 -0.54098846 -0.34894578 -0.27940349 -0.29599752 -0.44077758 -0.1236136 ]




[-0.4868008   0.44244186  0.68

In [21]:
# Print the first 10 training targets.
count = 0
for count, k in enumerate(y_target_train[:10], start=1):
    print(k, '\n')

[1.] 

[1.] 

[1.] 

[1.0481] 

[2.0391] 

[3.] 

[5.] 

[5.] 

[4.7207] 

[5.] 



# Fusing Past Drought values

## For X_train

In [27]:
# Keep only column 0 of the past scores, i.e. the interpolated score for the
# first row of each window, as an independent (n, 1) array.
# NOTE(review): confirm that the first day of the window, rather than the most
# recent one, is the intended past-drought feature.
y_past_train_trimmed = y_past_train[:, :1].copy()

In [35]:
# Sanity check: every column except the first was removed, so one column should remain.
y_past_train_trimmed.shape

(640248, 1)

In [36]:
# Append the retained past-drought column to the scaled training features as
# one extra trailing column.
X_train_concatenatedWithPastDroughtValues = np.concatenate(
    [X_train, y_past_train_trimmed],
    axis=-1,
)

In [37]:
# Sanity check: the column count should be that of X_train plus one.
X_train_concatenatedWithPastDroughtValues.shape

(640248, 19)

## For X_valid

In [39]:
# Keep only column 0 of the past scores, i.e. the interpolated score for the
# first row of each window, as an independent (n, 1) array.
# NOTE(review): confirm that the first day of the window, rather than the most
# recent one, is the intended past-drought feature.
y_past_valid_trimmed = y_past_valid[:, :1].copy()

In [40]:
# Sanity check: every column except the first was removed, so one column should remain.
y_past_valid_trimmed.shape

(74592, 1)

In [41]:
# Append the retained past-drought column to the scaled validation features as
# one extra trailing column.
X_valid_concatenatedWithPastDroughtValues = np.concatenate(
    [X_valid, y_past_valid_trimmed],
    axis=-1,
)

In [42]:
# Sanity check: the column count should be that of X_valid plus one.
X_valid_concatenatedWithPastDroughtValues.shape

(74592, 19)

## For X_test

In [43]:
# Keep only column 0 of the past scores, i.e. the interpolated score for the
# first row of each window, as an independent (n, 1) array.
# NOTE(review): confirm that the first day of the window, rather than the most
# recent one, is the intended past-drought feature.
y_past_test_trimmed = y_past_test[:, :1].copy()

In [44]:
# Sanity check: every column except the first was removed, so one column should remain.
y_past_test_trimmed.shape

(74592, 1)

In [45]:
# Append the retained past-drought column to the scaled test features as one
# extra trailing column.
X_test_concatenatedWithPastDroughtValues = np.concatenate(
    [X_test, y_past_test_trimmed],
    axis=-1,
)

In [46]:
# Sanity check: the column count should be that of X_test plus one.
X_test_concatenatedWithPastDroughtValues.shape

(74592, 19)

# Saving Feature and Response matrices for Training, Validation and Testing datasets respectively to file for future ease of use.

In [51]:
#30-Day Window + Scaled

# Raw string: the Windows path contains backslash sequences such as \R, \M and
# \P, which are invalid escapes in a normal string literal (they trigger a
# warning today and are slated to become an error in a future Python version).
outputFolder = r'R:\Ryerson\Misc\Datasets\Preprocessed Data Files'

filesToSave = [
    # Training Files
    ('X_train_30_day_window_scaled_+_pastDroughtValues.csv', X_train_concatenatedWithPastDroughtValues),
    ('y_target_train_30_day_window.csv', y_target_train),

    # Validation Files
    ('X_valid_30_day_window_scaled_+_pastDroughtValues.csv', X_valid_concatenatedWithPastDroughtValues),
    ('y_target_valid_30_day_window.csv', y_target_valid),

    # Testing Files
    ('X_test_30_day_window_scaled_+_pastDroughtValues.csv', X_test_concatenatedWithPastDroughtValues),
    ('y_target_test_30_day_window.csv', y_target_test),
]

# Write each matrix as comma-separated text, one row per line.
for fileName, matrix in filesToSave:
    np.savetxt(outputFolder + '\\' + fileName, matrix, delimiter=',', newline='\n')
