In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch

import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# for the rest of the kernel session is hidden, not only the noisy ones.
warnings.filterwarnings("ignore")

# Constants

In [2]:
# Weather features (columns of the per-district weather CSVs) fed to the model.
# The order matters: it fixes the row order of the `m_inputs` tensor.
m_columns = [
    'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob',
    'precipcover', 'windspeed', 'winddir', 'sealevelpressure',
    'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex',
    'sunrise', 'sunset', 'moonphase',
]

# Vegetation-index features read from the processed *_vi.csv files.
s_columns = ['ndvi', 'savi', 'evi', 'rep', 'osavi', 'rdvi', 'mtvi1', 'lswi']

# Per-field features taken from the raw train/test frames
# ('D' / 'T' are the one-hot rice-crop-intensity columns).
g_columns = ['D', 'T', 'Field size (ha)']

# Columns that identify one field/harvest; used to match raw rows to joined rows.
j_columns = ['District', 'Latitude', 'Longitude', 'Date of Harvest']

# Target column. It used to be 'Field size (ha)', which is also an input in
# `g_columns`, so the label was just a copy of one of the features.
# NOTE(review): confirm this column exists in every frame `l_column` is used on.
l_column = 'Rice Yield (kg/ha)'

# Retrieve all weather data

In [3]:
weather_path = '../../data/raw/weather'

# One CSV per district; the individual frames stay available under their own names.
chau_phu = pd.read_csv(f'{weather_path}/Chau Phu.csv')
chau_thanh = pd.read_csv(f'{weather_path}/Chau Thanh.csv')
thoai_son = pd.read_csv(f'{weather_path}/Thoai Son.csv')

# ignore_index=True gives the stacked frame a single unique 0..n-1 index instead
# of repeating each file's own row labels three times.
weather_df = pd.concat([chau_phu, chau_thanh, thoai_son], ignore_index=True)

# remove NULL columns and useless columns
null_cols = ['preciptype', 'snow', 'snowdepth', 'windgust', 'severerisk']
useless_cols = ['conditions', 'description', 'icon', 'stations']
weather_df = weather_df.drop(columns=null_cols + useless_cols)

# preprocess some columns

# 'name' is the right-hand merge key matched against 'District', so spaces are
# replaced by underscores. The .str accessor leaves missing values as NaN
# instead of raising.
weather_df['name'] = weather_df['name'].str.replace(' ', '_', regex=False)


def _reverse_date_parts(date_str):
    """Swap the first and third '-'-separated fields, e.g. '2022-07-15' -> '15-07-2022'."""
    parts = date_str.split('-')
    return f'{parts[2]}-{parts[1]}-{parts[0]}'


# 'datetime' is the other merge key (matched against 'date').
weather_df['datetime'] = weather_df['datetime'].apply(_reverse_date_parts)

# sunrise / sunset timestamps -> minutes since midnight
for clock_col in ('sunrise', 'sunset'):
    timestamps = pd.to_datetime(weather_df[clock_col])
    weather_df[clock_col] = 60 * timestamps.dt.hour + timestamps.dt.minute

# Retrieve train data

In [4]:
# get VIs data
folder = 'adaptative'
train_path = f'../../data/processed/{folder}/train_vi.csv'
train_df = pd.read_csv(train_path)

# join weather data: an inner join keeps only the VI rows that have a weather
# record for the same district and date
join_train_df = train_df.merge(
    weather_df,
    how='inner',
    left_on=['District', 'date'],
    right_on=['name', 'datetime'],
)

# get raw data
raw_train_path = '../../data/raw/train.csv'
raw_train_df = pd.read_csv(raw_train_path)

# preprocess Rice Crop Intensity: one-hot encode it and drop the two categorical columns
intensity_col = 'Rice Crop Intensity(D=Double, T=Triple)'
season_col = 'Season(SA = Summer Autumn, WS = Winter Spring)'
intensity = pd.get_dummies(raw_train_df[intensity_col])

# get_dummies only creates columns for values that occur in the data; make sure
# both levels exist so downstream selections of 'D' and 'T' never hit a KeyError.
for level in ('D', 'T'):
    if level not in intensity.columns:
        intensity[level] = 0

raw_train_df = pd.concat(
    [raw_train_df.drop(columns=[intensity_col, season_col]), intensity],
    axis=1,
)

# Retrieve test data

In [5]:
# get VIs data
folder = 'adaptative'
test_path = f'../../data/processed/{folder}/test_vi.csv'
test_df = pd.read_csv(test_path)

# join weather data: an inner join keeps only the VI rows that have a weather
# record for the same district and date
join_test_df = test_df.merge(
    weather_df,
    how='inner',
    left_on=['District', 'date'],
    right_on=['name', 'datetime'],
)

# get raw data
raw_test_path = '../../data/raw/test.csv'
raw_test_df = pd.read_csv(raw_test_path)

# preprocess Rice Crop Intensity: one-hot encode it and drop the two categorical columns
intensity_col = 'Rice Crop Intensity(D=Double, T=Triple)'
season_col = 'Season(SA = Summer Autumn, WS = Winter Spring)'
intensity = pd.get_dummies(raw_test_df[intensity_col])

# get_dummies only creates columns for values that occur in the data. The test
# file may not contain both levels, so add any missing one as an all-zero column;
# otherwise selecting 'D' and 'T' for scaling would raise a KeyError.
for level in ('D', 'T'):
    if level not in intensity.columns:
        intensity[level] = 0

raw_test_df = pd.concat(
    [raw_test_df.drop(columns=[intensity_col, season_col]), intensity],
    axis=1,
)

# Normalization

In [6]:
raw_cols = g_columns
join_cols = s_columns + m_columns


def _scaled_frame(scaler, frame, cols):
    """Return `frame[cols]` transformed by the already-fitted `scaler` as a DataFrame.

    The result keeps `frame`'s index. The scaled columns are re-attached with a
    column-wise `pd.concat`, which aligns on index labels; a fresh 0..n-1 index
    would silently misalign rows whenever `frame` is not indexed 0..n-1.
    """
    return pd.DataFrame(scaler.transform(frame[cols]), columns=cols, index=frame.index)


# NOTE: this cell rebinds raw_*/join_* to their scaled versions, so the scaled
# columns end up after the untouched ones.

# RAW DATA #########################################################
# The scaler is fitted on the training frame only and then applied to both splits.
raw_scaler = MinMaxScaler()
raw_scaler.fit(raw_train_df[raw_cols])

scaled_raw_train_df = _scaled_frame(raw_scaler, raw_train_df, raw_cols)
raw_train_df = pd.concat([raw_train_df.drop(columns=raw_cols), scaled_raw_train_df], axis=1)

scaled_raw_test_df = _scaled_frame(raw_scaler, raw_test_df, raw_cols)
raw_test_df = pd.concat([raw_test_df.drop(columns=raw_cols), scaled_raw_test_df], axis=1)

# JOIN DATA #########################################################
join_scaler = MinMaxScaler()
join_scaler.fit(join_train_df[join_cols])

scaled_join_train_df = _scaled_frame(join_scaler, join_train_df, join_cols)
join_train_df = pd.concat([join_train_df.drop(columns=join_cols), scaled_join_train_df], axis=1)

scaled_join_test_df = _scaled_frame(join_scaler, join_test_df, join_cols)
join_test_df = pd.concat([join_test_df.drop(columns=join_cols), scaled_join_test_df], axis=1)

In [18]:
# Display the dtype of every column currently in raw_train_df.
raw_train_df.dtypes

District               object
Latitude              float64
Longitude             float64
Date of Harvest        object
Rice Yield (kg/ha)      int64
D                     float64
T                     float64
Field size (ha)       float64
dtype: object

In [28]:
# Assemble one example sample from a single row of the raw training frame.
row = raw_train_df.iloc[1]

district = row['District']
latitude = row['Latitude']
longitude = row['Longitude']
date_of_harvest = row['Date of Harvest']
label = row[l_column]

# Rows of the joined frame that belong to the same field and harvest as `row`.
same_field = (
    (join_train_df['District'] == district)
    & (join_train_df['Latitude'] == latitude)
    & (join_train_df['Longitude'] == longitude)
    & (join_train_df['Date of Harvest'] == date_of_harvest)
)

# .assign returns a new frame, so the parsed dates are never written into a
# filtered selection of join_train_df (which triggers SettingWithCopyWarning).
# Dates are parsed so the rows sort chronologically rather than as strings.
inputs = (
    join_train_df[same_field]
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d-%m-%Y'))
    .sort_values('date')
    .reset_index(drop=True)
)

# Transposed so each tensor is (n_features, n_rows): one row per feature column.
s_inputs = torch.tensor(inputs[s_columns].T.values)
m_inputs = torch.tensor(inputs[m_columns].T.values)
# `row` is a mixed-dtype Series, so cast the selected values to float explicitly.
g_inputs = torch.tensor(row[g_columns].astype('float64').values)

data = {
    'district': district,
    'latitude': latitude,
    'longitude': longitude,
    'date_of_harvest': date_of_harvest,
    's_inputs': s_inputs,
    'm_inputs': m_inputs,
    'g_inputs': g_inputs,
    'label': label
}
        

In [29]:
# Display the sample dict assembled in the previous cell.
data

{'district': 'Chau_Phu',
 'latitude': 10.50915,
 'longitude': 105.265098,
 'date_of_harvest': '15-07-2022',
 's_inputs': tensor([[0.1650, 0.3425, 0.5260, 0.4947, 0.3522, 0.2833, 0.2301, 0.2814, 0.2991,
          0.2539, 0.1898, 0.4081, 0.5066, 0.5700, 0.3689, 0.3933, 0.5591, 0.5829,
          0.1939, 0.5844, 0.3217, 0.2996, 0.5512, 0.4725],
         [0.1650, 0.3425, 0.5260, 0.4947, 0.3522, 0.2833, 0.2300, 0.2814, 0.2990,
          0.2538, 0.1897, 0.4080, 0.5066, 0.5700, 0.3688, 0.3933, 0.5591, 0.5829,
          0.1939, 0.5844, 0.3217, 0.2996, 0.5512, 0.4724],
         [0.7830, 0.7829, 0.7830, 0.7830, 0.7830, 0.7829, 0.7830, 0.7830, 0.7830,
          0.7830, 0.7830, 0.7845, 0.7831, 0.7830, 0.7830, 0.7829, 0.7830, 0.7830,
          0.7830, 0.7830, 0.7830, 0.7830, 0.7830, 0.7824],
         [0.1272, 0.1271, 0.1271, 0.1271, 0.1271, 0.1270, 0.1275, 0.1271, 0.1271,
          0.1271, 0.1274, 0.1272, 0.1272, 0.1272, 0.1272, 0.1272, 0.1272, 0.1272,
          0.1273, 0.1272, 0.1271, 0.1270, 0.127