# Library

In [77]:
# Native library
import math
import json
import joblib

import os
import sys
path = os.path.join(os.pardir, os.pardir) # 'crop-forecasting'
sys.path.append(path)

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xarray as xr

# Data prepocessing
from src.features.datascaler import DatasetScaler
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px

# Model
from sklearn.cluster import KMeans
import hdbscan
from umap import UMAP

# Constant

In [20]:
# Target variable
TARGET = "Rice Yield (kg/ha)"
TARGET_TEST = 'Predicted Rice Yield (kg/ha)'
S_COLUMNS = ['ndvi', 'savi', 'evi', 'rep', 'osavi', 'rdvi', 'mtvi1', 'lswi']
G_COLUMNS = ['Field size (ha)', 'Rice Crop Intensity(D=Double, T=Triple)']
M_COLUMNS = ['tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover', 'windspeed', 'winddir', 
             'sealevelpressure', 'cloudcover', 'solarradiation', 'solarenergy', 'uvindex', 'moonphase', 'solarexposure']
FOLDER = 'augment_10_5'

INTERRIM_DIR = os.path.join(os.pardir, os.pardir, 'data', 'interim', FOLDER)
DATA_DIR = os.path.join(os.pardir, os.pardir, 'data', 'processed', FOLDER)
DATA_RAW_DIR = os.path.join(os.pardir, os.pardir, 'data', 'raw')

# Load Data

In [21]:
xdf = xr.open_dataset(os.path.join(DATA_DIR, 'train.nc'))
xdf_test = xr.open_dataset(os.path.join(DATA_DIR, 'test.nc'))
xdf

# Format Data

## Inverse Transform

In [22]:
def inverse_transform(xdf: xr.Dataset, target: str)->xr.Dataset:
    scaler: DatasetScaler = joblib.load(os.path.join(DATA_DIR, 'scaler_dataset.joblib'))
    xdf = scaler.inverse_transform(xdf, target)
    xdf = xr.merge([xdf[G_COLUMNS], xdf[M_COLUMNS].sel(datetime=xdf['time'], name=xdf['District']), xdf[S_COLUMNS], xdf[[target]]])
    # xdf = xdf.mean(dim="ts_aug", skipna=True)
    xdf = xdf.drop(['name', 'datetime'])
    return xdf
 
xdf = inverse_transform(xdf, TARGET)
xdf_test = inverse_transform(xdf_test, TARGET_TEST)
xdf

## Dataset TO DataFrame

In [23]:
def ds2df(xdf: xr.Dataset, target: str)->pd.DataFrame:
    df = xdf.to_dataframe()
    df.set_index(G_COLUMNS + [target], append=True, inplace=True)
    df.reset_index('state_dev', inplace=True)
    df['state_dev'] = df['state_dev'].astype(str)
    df = df.pivot(columns='state_dev')
    df.columns = df.columns.map('_'.join).str.strip('_')
    df.reset_index(G_COLUMNS, inplace=True)
    return df

df = ds2df(xdf, TARGET)
df_test = ds2df(xdf_test, TARGET_TEST)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Field size (ha),"Rice Crop Intensity(D=Double, T=Triple)",tempmax_0,tempmax_1,tempmax_10,tempmax_11,tempmax_12,tempmax_13,tempmax_14,tempmax_15,...,lswi_21,lswi_22,lswi_23,lswi_3,lswi_4,lswi_5,lswi_6,lswi_7,lswi_8,lswi_9
ts_obs,ts_aug,Rice Yield (kg/ha),Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.163010,0.112218,0.098251,0.158630,0.166488,0.135827,0.104435,0.099455,0.116929,0.134190
0,1,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.168801,0.118794,0.093320,0.136186,0.139432,0.103631,0.074214,0.073422,0.105093,0.129285
0,2,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.162941,0.116879,0.096622,0.153793,0.153721,0.112736,0.078648,0.078435,0.111598,0.136364
0,3,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.176617,0.121834,0.089324,0.146655,0.134005,0.109313,0.094775,0.110206,0.139783,0.164329
0,4,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.169015,0.120149,0.092581,0.120360,0.110486,0.073596,0.053147,0.065832,0.110391,0.144500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,5,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.047220,0.106940,0.157007,0.128061,0.090837,0.123718,0.228515,0.263948,0.250422,0.268180
556,6,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.053781,0.115790,0.149186,0.111277,0.094897,0.104187,0.149136,0.175526,0.206703,0.228541
556,7,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.044404,0.084949,0.109818,0.165097,0.184881,0.192924,0.208597,0.226370,0.251077,0.253910
556,8,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.043711,0.100860,0.145523,0.075746,0.038714,0.073615,0.194912,0.244096,0.244995,0.245447


## Add Raw Data

In [24]:
def add_raw(df: pd.DataFrame, test: bool)->pd.DataFrame:
    if test:
        df_raw = pd.read_csv(os.path.join(DATA_RAW_DIR, 'test.csv'))
    else:
        df_raw = pd.read_csv(os.path.join(DATA_RAW_DIR, 'train.csv'))
    df = pd.merge(df, df_raw[['District', 'Latitude', 'Longitude', 'Date of Harvest']], right_index=True, left_on='ts_obs')
    return df

df = add_raw(df, False)
df_test = add_raw(df_test, True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Field size (ha),"Rice Crop Intensity(D=Double, T=Triple)",tempmax_0,tempmax_1,tempmax_10,tempmax_11,tempmax_12,tempmax_13,tempmax_14,tempmax_15,...,lswi_4,lswi_5,lswi_6,lswi_7,lswi_8,lswi_9,District,Latitude,Longitude,Date of Harvest
ts_obs,ts_aug,Rice Yield (kg/ha),Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.166488,0.135827,0.104435,0.099455,0.116929,0.134190,Chau_Phu,10.510542,105.248554,15-07-2022
0,1,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.139432,0.103631,0.074214,0.073422,0.105093,0.129285,Chau_Phu,10.510542,105.248554,15-07-2022
0,2,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.153721,0.112736,0.078648,0.078435,0.111598,0.136364,Chau_Phu,10.510542,105.248554,15-07-2022
0,3,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.134005,0.109313,0.094775,0.110206,0.139783,0.164329,Chau_Phu,10.510542,105.248554,15-07-2022
0,4,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.110486,0.073596,0.053147,0.065832,0.110391,0.144500,Chau_Phu,10.510542,105.248554,15-07-2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,5,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.090837,0.123718,0.228515,0.263948,0.250422,0.268180,Thoai_Son,10.304295,105.270460,13-04-2022
556,6,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.094897,0.104187,0.149136,0.175526,0.206703,0.228541,Thoai_Son,10.304295,105.270460,13-04-2022
556,7,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.184881,0.192924,0.208597,0.226370,0.251077,0.253910,Thoai_Son,10.304295,105.270460,13-04-2022
556,8,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.038714,0.073615,0.194912,0.244096,0.244995,0.245447,Thoai_Son,10.304295,105.270460,13-04-2022


## Categorical Encoding

In [25]:
def encode_categorical(df: pd.DataFrame)->pd.DataFrame:
    df['Date of Harvest'] = pd.to_datetime(df['Date of Harvest']).dt.month
    df['Date of Harvest'] = MinMaxScaler((0, 2 * math.pi)).fit_transform(df['Date of Harvest'].to_numpy().reshape(-1, 1)).reshape(-1)
    df['Sin Date of Harvest'] = np.sin(df['Date of Harvest'])
    df['Cos Date of Harvest'] = np.cos(df['Date of Harvest'])
    df.drop(columns=['Date of Harvest'], inplace=True)
    df = pd.get_dummies(df, columns=['District'], drop_first=True)
    return df

df = encode_categorical(df)
df_test = encode_categorical(df_test)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Field size (ha),"Rice Crop Intensity(D=Double, T=Triple)",tempmax_0,tempmax_1,tempmax_10,tempmax_11,tempmax_12,tempmax_13,tempmax_14,tempmax_15,...,lswi_6,lswi_7,lswi_8,lswi_9,Latitude,Longitude,Sin Date of Harvest,Cos Date of Harvest,District_Chau_Thanh,District_Thoai_Son
ts_obs,ts_aug,Rice Yield (kg/ha),Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.104435,0.099455,0.116929,0.134190,10.510542,105.248554,-0.281733,-0.959493,0,0
0,1,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.074214,0.073422,0.105093,0.129285,10.510542,105.248554,-0.281733,-0.959493,0,0
0,2,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.078648,0.078435,0.111598,0.136364,10.510542,105.248554,-0.281733,-0.959493,0,0
0,3,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.094775,0.110206,0.139783,0.164329,10.510542,105.248554,-0.281733,-0.959493,0,0
0,4,5500.0,3.4,3.0,32.0,32.0,30.6,33.0,32.0,32.0,31.0,30.1,...,0.053147,0.065832,0.110391,0.144500,10.510542,105.248554,-0.281733,-0.959493,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,5,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.228515,0.263948,0.250422,0.268180,10.304295,105.270460,0.989821,-0.142315,0,1
556,6,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.149136,0.175526,0.206703,0.228541,10.304295,105.270460,0.989821,-0.142315,0,1
556,7,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.208597,0.226370,0.251077,0.253910,10.304295,105.270460,0.989821,-0.142315,0,1
556,8,7200.0,2.3,3.0,29.3,29.0,32.0,32.0,33.0,32.3,30.0,33.0,...,0.194912,0.244096,0.244995,0.245447,10.304295,105.270460,0.989821,-0.142315,0,1


## Concat Data

In [26]:
df = pd.concat([df, df_test], axis='index')

# Clustering

## KMeans

In [73]:
kmeans = KMeans(n_clusters=6)
pred = kmeans.fit_predict(df)

In [74]:
df_cluster = pd.DataFrame(pred.T, index=df.index, columns=['Cluster'])
df_cluster.reset_index(inplace=True)
test_bool = df_cluster[TARGET].isna()
df_cluster['Dataset'] = 'Train'
df_cluster['Dataset'][test_bool] = 'Test'
df_cluster['Cluster'] = df_cluster['Cluster'].astype(str)
fig = px.histogram(df_cluster, x='Cluster', color='Dataset', histnorm='probability', text_auto=True)
fig.update_layout(
    title='Repartition into cluster of Train/Test Dataset in percentage'
)

In [76]:
df_cluster_train = df_cluster[(df_cluster['Dataset'] == 'Train') & (df_cluster['Cluster'] == '5')].copy(deep=True)
df_cluster_train['ts_obs'] = df_cluster_train['ts_obs'].astype(str)

fig = px.histogram(df_cluster_train, x='ts_obs')
fig.update_layout(
    title='Repartition into Observation ID of the Cluster number 1'
)
fig.show()

df_cluster_train['ts_obs'] = df_cluster_train['ts_obs'].astype(int)

## UMAP

In [68]:
reducer = UMAP(n_components=3)
embedding = reducer.fit_transform(df)
embedding.shape

(6570, 3)

In [69]:
df_cluster = pd.DataFrame(embedding, index=df.index, columns=['x', 'y', 'z'])
df_cluster.reset_index(inplace=True)
test_bool = df_cluster[TARGET].isna()
df_cluster['Dataset'] = 'Train'
df_cluster['Dataset'][test_bool] = 'Test'

fig = px.scatter_3d(df_cluster, x='x', y='y', z='z', color='Dataset')
fig.update_traces(marker_size = 4)
fig.update_layout(title='Data representation after UMAP application on 3 Dimensions')
fig.show()

In [70]:
labels = hdbscan.HDBSCAN(
    min_samples=10,
    min_cluster_size=500,
).fit_predict(embedding)

Cluster
 4         1440
 2         1289
 1          951
 3          870
 5          840
 0          700
-1          480
dtype: int64

In [88]:
cluster_pred = pd.DataFrame(labels.T, index=df.index, columns=['Cluster'])
df_cluster.reset_index(inplace=True)
test_bool = df_cluster[TARGET].isna()
df_cluster['Dataset'] = 'Train'
df_cluster['Dataset'][test_bool] = 'Test'
df_cluster['Cluster'] = df_cluster['Cluster'].astype(str)
fig = px.histogram(df_cluster, x='Cluster', color='Dataset', histnorm='probability', text_auto=True)
fig.update_layout(
    title='Repartition into cluster of Train/Test Dataset in percentage'
)

In [94]:
df_cluster_train = df_cluster[(df_cluster['Dataset'] == 'Train') & (df_cluster['Cluster'] == '5')].copy(deep=True)
df_cluster_train['ts_obs'] = df_cluster_train['ts_obs'].astype(str)

fig = px.histogram(df_cluster_train, x='ts_obs')
fig.update_layout(
    title='Repartition into Observation ID of the Cluster number 1'
)
fig.show()

df_cluster_train['ts_obs'] = df_cluster_train['ts_obs'].astype(int)

In [95]:
ts_obs_0 = df_cluster_train['ts_obs'].unique()
len(ts_obs_0)

95

In [96]:
df_cluster_train = df_cluster[(df_cluster['Dataset'] == 'Train') & (df_cluster['Cluster'] == '3')].copy(deep=True)
df_cluster_train['ts_obs'] = df_cluster_train['ts_obs'].astype(str)

fig = px.histogram(df_cluster_train, x='ts_obs')
fig.update_layout(
    title='Repartition into Observation ID of the Cluster number 1'
)
fig.show()

df_cluster_train['ts_obs'] = df_cluster_train['ts_obs'].astype(int)

In [97]:
ts_obs_1 = df_cluster_train['ts_obs'].unique()
len(ts_obs_1)

73

In [102]:
ts_obs = list(df_cluster_train['ts_obs'])
index = {
    'dimensionality_reduction': 'UMAP',
    'clustering': 'HDBSCAN',
    'clusters': [
        {
            'train': {'ts_obs': df_cluster[(df_cluster['Dataset'] == 'Train') & (df_cluster['Cluster'] == '5')]['ts_obs'].unique().tolist()},
            'test': {'ts_obs': df_cluster[(df_cluster['Dataset'] == 'Test') & (df_cluster['Cluster'] == '5')]['ts_obs'].unique().tolist()}
        },
        {
            'train': {'ts_obs': df_cluster[(df_cluster['Dataset'] == 'Train') & (df_cluster['Cluster'] == '3')]['ts_obs'].unique().tolist()},
            'test': {'ts_obs': df_cluster[(df_cluster['Dataset'] == 'Test') & (df_cluster['Cluster'] == '3')]['ts_obs'].unique().tolist()}
        }
    ]
}

with open(os.path.join(DATA_DIR, 'index.json'), 'w') as f:
    json.dump(index, f)
with open(os.path.join(INTERRIM_DIR, 'index.json'), 'w') as f:
    json.dump(index, f)