In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.cross_decomposition import PLSRegression

In [None]:
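# Scratch helper: uncomment to list every string accepted by the `scoring`
# argument of cross_val_score.
# import sklearn
# sklearn.metrics.get_scorer_names()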

In [None]:
# Model selection: choose the vegetation index (VI) and the number of PLS
# components that minimise the leave-one-out cross-validated RMSE.
#
# Each field is identified by (Latitude, Longitude, Date of Harvest, yield).
# Its observations are ranked chronologically and pivoted so that one row
# holds the whole time series of a single VI (columns = rank 1, 2, ...).
folder = 'adaptative'
vis = ['ndvi', 'savi', 'evi', 'rep', 'osavi', 'rdvi', 'mtvi1', 'lswi']

# Start from +inf instead of an arbitrary cap: with a fixed cap, a run whose
# scores are all above it would never report a best configuration.
min_score = np.inf
best_config = None  # (vi, n_components) that produced `min_score`

key_cols = ['Latitude', 'Longitude', 'Date of Harvest', 'Rice Yield (kg/ha)']

# The file, the parsed dates and the per-field ranks do not depend on the VI
# being evaluated, so they are computed once rather than once per VI.
train_path = f'../../data/processed/{folder}/train_vi.csv'
train_long = pd.read_csv(train_path)
train_long['date'] = pd.to_datetime(train_long['date'], format='%d-%m-%Y')
# cumcount() is evaluated on the date-sorted frame; the assignment aligns the
# result back to the original rows through the index.
train_long['rank'] = train_long.sort_values(['date']).groupby(key_cols).cumcount() + 1

loo = LeaveOneOut()

for vi in vis:
    train_df = train_long.pivot_table(vi, key_cols, 'rank').reset_index()
    y_train = train_df['Rice Yield (kg/ha)'].to_numpy()
    train_df = train_df.drop(columns=key_cols)
    X_train = train_df.to_numpy()

    # Every leave-one-out fold is fitted on n_samples - 1 rows, and PLS cannot
    # extract more components than min(n_samples, n_features) of the data it
    # is fitted on, so the search range is capped accordingly.
    max_components = min(X_train.shape[1], X_train.shape[0] - 1)

    for i in range(1, max_components + 1):
        model = PLSRegression(n_components=i)
        # Each leave-one-out fold contains a single sample, so a per-fold RMSE
        # is just |error| and its mean would be the MAE. Collect the squared
        # errors instead and take the root once, over all folds.
        squared_errors = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=loo, n_jobs=-1)
        score = np.sqrt(np.mean(squared_errors))

        if score < min_score:
            min_score = score
            best_config = (vi, i)
            print(f'{vi} / n_components = {i} / score = {score:.3f} <<<')
        else:
            print(f'{vi} / n_components = {i} / score = {score:.3f}')

if best_config is not None:
    print(f'best: {best_config[0]} / n_components = {best_config[1]} / score = {min_score:.3f}')


KeyboardInterrupt



In [5]:
# Final model: fit PLS on the chosen vegetation index and write predictions
# for the test fields to submission.csv.
# `folder` is defined in the model-selection cell above.
best_vi = 'lswi'
n_components = 2


def _pivot_vi_series(csv_path, vi, key_cols):
    """Load a long-format VI file and return one row per field.

    Parameters
    ----------
    csv_path : str
        CSV with a 'date' column (format %d-%m-%Y), the `key_cols`
        columns and a column named `vi`.
    vi : str
        Name of the vegetation-index column to pivot.
    key_cols : list of str
        Columns that together identify one field.

    Returns
    -------
    pandas.DataFrame
        `key_cols` followed by one column per chronological rank (1, 2, ...)
        holding the `vi` value of that observation. Rows come out ordered by
        `key_cols` because pivot_table sorts its index.
    """
    long_df = pd.read_csv(csv_path)
    long_df['date'] = pd.to_datetime(long_df['date'], format='%d-%m-%Y')
    # Rank the observations of each field chronologically; the assignment
    # aligns on the index, so the temporary sort does not reorder `long_df`.
    long_df['rank'] = long_df.sort_values(['date']).groupby(key_cols).cumcount() + 1
    return long_df.pivot_table(vi, key_cols, 'rank').reset_index()


train_keys = ['Latitude', 'Longitude', 'Date of Harvest', 'Rice Yield (kg/ha)']
test_keys = ['Latitude', 'Longitude', 'Date of Harvest']

train_path = f'../../data/processed/{folder}/train_vi.csv'
train_df = _pivot_vi_series(train_path, best_vi, train_keys)
y_train = train_df['Rice Yield (kg/ha)'].to_numpy()
train_df = train_df.drop(columns=train_keys)
X_train = train_df.to_numpy()

test_path = f'../../data/processed/{folder}/test_vi.csv'
test_df = _pivot_vi_series(test_path, best_vi, test_keys)
# Explicit sort: the predictions are matched to the raw test rows below purely
# by this ordering.
test_df = test_df.sort_values(test_keys)
test_df = test_df.drop(columns=test_keys)
X_test = test_df.to_numpy()

if X_test.shape[1] != X_train.shape[1]:
    raise ValueError(
        f'train has {X_train.shape[1]} observations per field but test has '
        f'{X_test.shape[1]}; the model cannot be applied to the test set'
    )

model = PLSRegression(n_components=n_components)
model.fit(X_train, y_train)
# Depending on the scikit-learn version, predict() returns shape (n,) or
# (n, 1) for a single target; flatten so it can be assigned as one column.
y_pred = np.ravel(model.predict(X_test))

test_path = '../../data/raw/test.csv'
test_df = pd.read_csv(test_path)
# NOTE(review): alignment assumes the raw file has exactly one row per
# (Latitude, Longitude, Date of Harvest) and that these values match the
# processed file - confirm. The length check catches the duplicate-key case.
test_df = test_df.sort_values(test_keys)
if len(test_df) != len(y_pred):
    raise ValueError(
        f'{len(y_pred)} predictions for {len(test_df)} rows in {test_path}; '
        'cannot align predictions with the raw test rows'
    )
test_df['Predicted Rice Yield (kg/ha)'] = y_pred
# sort_values kept the original row labels, so sorting on the index restores
# the row order of the raw test file for the submission.
test_df = test_df.sort_index()
test_df.to_csv('submission.csv', index=False)