In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Constants

In [2]:
# Sub-folder name interpolated into the data/processed paths that the
# later cells read from and write to.
FOLDER = 'fixed_0-00146'

# Columns coming from the weather CSVs; scaled together with S_COLUMNS.
M_COLUMNS = [
    'tempmax', 'tempmin', 'temp', 'dew', 'humidity',
    'precip', 'precipprob', 'precipcover',
    'windspeed', 'winddir', 'sealevelpressure',
    'cloudcover', 'visibility',
    'solarradiation', 'solarenergy', 'uvindex',
    'moonphase',
]

# Columns of the *_vi.csv files (presumably vegetation indices -- confirm).
S_COLUMNS = [
    'ndvi', 'savi', 'evi', 'rep',
    'osavi', 'rdvi', 'mtvi1', 'lswi',
]

# Columns of the raw train/test frames that get scaled; 'D' and 'T' are the
# indicator columns produced from the Rice Crop Intensity field.
G_COLUMNS = [
    'D',
    'T',
    'Field size (ha)',
]

# Retrieve all weather data

In [3]:
# Folder holding one weather CSV per district.
weather_path = '../../data/raw/weather'
chau_phu = pd.read_csv(f'{weather_path}/Chau Phu.csv')
chau_thanh = pd.read_csv(f'{weather_path}/Chau Thanh.csv')
thoai_son = pd.read_csv(f'{weather_path}/Thoai Son.csv')

# ignore_index=True gives the stacked frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
weather_df = pd.concat([chau_phu, chau_thanh, thoai_son], ignore_index=True)

# remove NULL columns
null_cols = ['preciptype', 'snow', 'snowdepth', 'windgust', 'severerisk']

# remove useless columns
useless_cols = ['conditions', 'description', 'icon', 'stations']

weather_df = weather_df.drop(columns=null_cols + useless_cols)

# preprocess some columns
# Spaces in the location name become underscores; 'name' is later matched
# against the 'District' column when the weather data is merged.
weather_df['name'] = weather_df['name'].str.replace(' ', '_', regex=False)

# Swap the first and third dash-separated parts of the date string
# (e.g. '2022-01-31' -> '31-01-2022'); 'datetime' is later matched against
# the 'date' column when the weather data is merged.
date_parts = weather_df['datetime'].str.split('-')
weather_df['datetime'] = date_parts.str[2] + '-' + date_parts.str[1] + '-' + date_parts.str[0]

# Retrieve train data

In [4]:
# get VIs data
train_path = f'../../data/processed/{FOLDER}/train_vi.csv'
train_df = pd.read_csv(train_path)

# join weather data
# Inner join: rows without a weather record for the same district and date
# are dropped from the result.
join_train_df = train_df.merge(weather_df, how='inner', left_on=['District', 'date'], right_on=['name', 'datetime'])

# get raw data
raw_train_path = '../../data/raw/train.csv'
raw_train_df = pd.read_csv(raw_train_path)

# preprocess Rice Crop Intensity
intensity_col = 'Rice Crop Intensity(D=Double, T=Triple)'
season_col = 'Season(SA = Summer Autumn, WS = Winter Spring)'
intensity = pd.get_dummies(raw_train_df[intensity_col])

# get_dummies only emits a column for levels present in this file, but the
# scaling cell selects both 'D' and 'T' (via G_COLUMNS). Add an all-zero
# indicator for any level that is absent so that selection cannot fail.
for level in ('D', 'T'):
    if level not in intensity.columns:
        intensity[level] = 0

# Replace the two categorical columns with the indicator columns.
raw_train_df = raw_train_df.drop(columns=[intensity_col, season_col])
raw_train_df = pd.concat([raw_train_df, intensity], axis=1)

# Retrieve test data

In [5]:
# get VIs data
test_path = f'../../data/processed/{FOLDER}/test_vi.csv'
test_df = pd.read_csv(test_path)

# join weather data
# Inner join: rows without a weather record for the same district and date
# are dropped from the result.
join_test_df = test_df.merge(weather_df, how='inner', left_on=['District', 'date'], right_on=['name', 'datetime'])

# get raw data
raw_test_path = '../../data/raw/test.csv'
raw_test_df = pd.read_csv(raw_test_path)

# preprocess Rice Crop Intensity
intensity_col = 'Rice Crop Intensity(D=Double, T=Triple)'
season_col = 'Season(SA = Summer Autumn, WS = Winter Spring)'
intensity = pd.get_dummies(raw_test_df[intensity_col])

# The test file is encoded independently of the train file, so a level that
# never occurs here would leave its indicator column missing and break the
# scaling cell, which selects both 'D' and 'T' (via G_COLUMNS). Add an
# all-zero indicator for any absent level.
for level in ('D', 'T'):
    if level not in intensity.columns:
        intensity[level] = 0

# Replace the two categorical columns with the indicator columns.
raw_test_df = raw_test_df.drop(columns=[intensity_col, season_col])
raw_test_df = pd.concat([raw_test_df, intensity], axis=1)

# Normalization

In [6]:
raw_cols = G_COLUMNS
join_cols = S_COLUMNS + M_COLUMNS


def _swap_in_scaled(df, scaler, cols):
    """Return a copy of ``df`` whose ``cols`` are replaced by scaled values.

    Args:
        df: Frame containing every column listed in ``cols``.
        scaler: Already-fitted scaler exposing ``transform``.
        cols: Names of the columns to scale.

    Returns:
        A new frame: the untouched columns of ``df`` first, followed by the
        scaled ``cols`` in the order given.
    """
    # Reusing df.index keeps the scaled rows aligned with their source rows
    # when concatenating column-wise, whatever index df carries.
    scaled = pd.DataFrame(scaler.transform(df[cols]), columns=cols, index=df.index)
    return pd.concat([df.drop(columns=cols), scaled], axis=1)


# RAW DATA #########################################################
# Fit on the train frame only; the same fitted scaler is applied to test.
raw_scaler = StandardScaler()
raw_scaler.fit(raw_train_df[raw_cols])

raw_train_df = _swap_in_scaled(raw_train_df, raw_scaler, raw_cols)
raw_test_df = _swap_in_scaled(raw_test_df, raw_scaler, raw_cols)

# JOIN DATA #########################################################
# Fit on the train frame only; the same fitted scaler is applied to test.
join_scaler = StandardScaler()
join_scaler.fit(join_train_df[join_cols])

join_train_df = _swap_in_scaled(join_train_df, join_scaler, join_cols)
join_test_df = _swap_in_scaled(join_test_df, join_scaler, join_cols)

# Scale labels

In [7]:
label_col = ['Rice Yield (kg/ha)']
label_scaler = MinMaxScaler()
label_scaler.fit(raw_train_df[label_col])

# Reusing raw_train_df.index keeps the scaled labels aligned with their rows
# when they are concatenated back column-wise.
label_train_df = pd.DataFrame(
    label_scaler.transform(raw_train_df[label_col]),
    columns=label_col,
    index=raw_train_df.index,
)
raw_train_df = raw_train_df.drop(columns=label_col)
raw_train_df = pd.concat([raw_train_df, label_train_df], axis=1)

# SAVE THE SCALER TO INVERT TRANSFORM
# Create the output folder first so the dump does not fail on a fresh checkout.
lstm_dir = f'../../data/processed/lstm/{FOLDER}'
os.makedirs(lstm_dir, exist_ok=True)
joblib.dump(label_scaler, f'{lstm_dir}/label_scaler.joblib')

['../../data/processed/lstm/fixed_0-00146/label_scaler.joblib']

# Save data as CSV files

In [8]:
# Output folder for the four processed frames; created if it does not exist
# so the export does not fail on a fresh checkout.
lstm_dir = f'../../data/processed/lstm/{FOLDER}'
os.makedirs(lstm_dir, exist_ok=True)

# File stem -> frame to export. Each frame is written without its index.
frames_to_save = {
    'raw_train': raw_train_df,
    'join_train': join_train_df,
    'raw_test': raw_test_df,
    'join_test': join_test_df,
}
for stem, frame in frames_to_save.items():
    frame.to_csv(f'{lstm_dir}/{stem}.csv', index=False)