In [5]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import xarray as xr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

In [2]:
FOLDER = 'augment_10_5'

# Create m inputs

In [3]:
weather_path = f'../../data/raw/weather'
chau_phu = pd.read_csv(f'{weather_path}/Chau Phu.csv')
chau_thanh = pd.read_csv(f'{weather_path}/Chau Thanh.csv')
thoai_son = pd.read_csv(f'{weather_path}/Thoai Son.csv')
weather_df = pd.concat([chau_phu, chau_thanh, thoai_son])

# remove NULL columns
null_cols = ['preciptype', 'snow', 'snowdepth', 'windgust', 'severerisk']
weather_df = weather_df.drop(columns=null_cols)

# remove useless columns 
useless_cols = ['conditions', 'description', 'icon', 'stations', 'sunrise', 'sunset'] 
weather_df = weather_df.drop(columns=useless_cols)

# preprocess come columns
weather_df['name'] = weather_df['name'].apply(lambda x: x.replace(' ', '_'))
weather_df['datetime'] = weather_df['datetime'].apply(lambda x: f'{x.split("-")[2]}-{x.split("-")[1]}-{x.split("-")[0]}')

m_columns = [e for e in weather_df.columns if e not in ['name', 'datetime']]
m_scaler = StandardScaler()
m_scaler.fit(weather_df[m_columns])
weather_df[m_columns] = m_scaler.transform(weather_df[m_columns])

# Create s inputs

In [30]:
train_vi_path = f'../../data/processed/{FOLDER}/train_filter_vi_fill.nc'
train_xdf = xr.open_dataset(train_vi_path)
train_xdf = train_xdf.drop(['time', 'Season(SA = Summer Autumn, WS = Winter Spring)'])
train_vi_df = train_xdf.to_dataframe().reset_index()
train_vi_df = train_vi_df.drop(columns='ts_id')

useless_cols = ['Rice Crop Intensity(D=Double, T=Triple)', 'Field size (ha)', 'Rice Yield (kg/ha)']
train_vi_df = train_vi_df.drop(columns=useless_cols)

In [31]:
train_vi_df.max()

state_dev                    23
District              Thoai_Son
Date of Harvest      28-07-2022
Latitude              10.676956
Longitude            105.424059
ndvi                   0.925801
savi                   1.462852
evi                 133458.4375
rep                87961.242188
osavi                   0.92577
rdvi                  73.268044
mtvi1               9480.698242
lswi                   0.982012
dtype: object

In [25]:
test_vi_path = f'../../data/processed/{FOLDER}/test_filter_vi_fill_smooth.nc'
test_xdf = xr.open_dataset(test_vi_path)
test_xdf = test_xdf.drop(['time', 'Season(SA = Summer Autumn, WS = Winter Spring)'])
test_vi_df = test_xdf.to_dataframe().reset_index()
test_vi_df = test_vi_df.drop(columns='ts_id')

useless_cols = ['Rice Crop Intensity(D=Double, T=Triple)', 'Field size (ha)', 'Predicted Rice Yield (kg/ha)']
test_vi_df = test_vi_df.drop(columns=useless_cols)

In [27]:
remove_columns = ['District', 'Latitude', 'Longitude', 'Date of Harvest', 'date']
s_columns = [e for e in train_vi_df.columns if e not in remove_columns]

s_scaler = StandardScaler()
s_scaler.fit(train_vi_df[s_columns])
train_vi_df[s_columns] = s_scaler.transform(train_vi_df[s_columns])
test_vi_df[s_columns] = s_scaler.transform(test_vi_df[s_columns])

ValueError: Input X contains infinity or a value too large for dtype('float64').

# Create g inputs + label

In [5]:
train_path = '../../data/raw/train.csv'
test_path = '../../data/raw/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

crop_intensity = 'Rice Crop Intensity(D=Double, T=Triple)'
train_df[crop_intensity] = train_df[crop_intensity].replace('D', 0)
train_df[crop_intensity] = train_df[crop_intensity].replace('T', 1)
test_df[crop_intensity] = test_df[crop_intensity].replace('D', 0)
test_df[crop_intensity] = test_df[crop_intensity].replace('T', 1)

g_columns = [crop_intensity, 'Field size (ha)']
g_scaler = StandardScaler()
g_scaler.fit(train_df[g_columns])
train_df[g_columns] = g_scaler.transform(train_df[g_columns])
test_df[g_columns] = g_scaler.transform(test_df[g_columns])

label_scaler = MinMaxScaler()
train_df['Rice Yield (kg/ha)'] = label_scaler.fit_transform(train_df[['Rice Yield (kg/ha)']])

# Save all 

In [6]:
weather_df.to_csv(f'../../data/processed/lstm/{FOLDER}/weather_df.csv', index=False)

train_df.to_csv(f'../../data/processed/lstm/{FOLDER}/train_df.csv', index=False)
train_vi_df.to_csv(f'../../data/processed/lstm/{FOLDER}/train_vi.csv', index=False)

test_df.to_csv(f'../../data/processed/lstm/{FOLDER}/test_df.csv', index=False)
test_vi_df.to_csv(f'../../data/processed/lstm/{FOLDER}/test_vi.csv', index=False)

joblib.dump(label_scaler, f'../../data/processed/lstm/{FOLDER}/label_scaler.joblib')

['../../data/processed/lstm/fixed_0-00146/label_scaler.joblib']