## Load Packages

In [1]:
from pyarrow import fs
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

## Read Data

In [2]:
train_data  = f"./data/lcv_pasture_classif.matrix.train_2000..2020_brazil.eumap_summer.school.2022.pq"
val_data = f"./data/lcv_pasture_classif.matrix.val_2000..2020_brazil.eumap_summer.school.2022.pq"
test_data = f"./data/lcv_pasture_classif.matrix.test_2000..2020_brazil.eumap_summer.school.2022.pq"

## Convert to numpy

In [3]:
train_data = pq.ParquetDataset(train_data).read().to_pandas()
val_data = pq.ParquetDataset(val_data).read().to_pandas()
test_data = pq.ParquetDataset(test_data).read().to_pandas()

In [4]:
target_col = 'class'
label_col = 'class_label'

In [5]:
cov_idx = (list(train_data.columns).index(label_col) + 1)
covs = train_data.columns[cov_idx:]
print(f'There are {len(covs)} features available to the model')

There are 364 features available to the model


In [6]:
X = train_data[covs].to_numpy()
y = train_data[target_col].to_numpy()

In [7]:
X_val = val_data[covs].to_numpy()
y_val = val_data[target_col].to_numpy()

In [18]:
X_test = test_data[covs].to_numpy()

### Define Positinal Encoding

In [8]:
def positional_encoding(data,l=4):
    num_features = data.shape[1]
    for each_l in range(l):
        data = np.concatenate((data,
                               np.sin(np.power(2,each_l)*np.pi*data),
                               np.cos(np.power(2,each_l)*np.pi*data)),axis=1)
        
    return data
        

## Run Random Forest Experiments

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [10]:
X_sin_cosin_train = positional_encoding(X)
X_sin_cosin_val = positional_encoding(X_val)
X_sin_cosin_test = positional_encoding(X_test)

In [11]:
rf = RandomForestClassifier(random_state=1989)
rf.fit(X_sin_cosin_train,y)

In [15]:
y_pred = rf.predict(X_sin_cosin_val)
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           1       0.84      0.50      0.63       205
           2       0.82      0.29      0.43       138
           3       0.80      0.97      0.88       824

    accuracy                           0.81      1167
   macro avg       0.82      0.59      0.64      1167
weighted avg       0.81      0.81      0.78      1167



In [21]:
y_pred_test = rf.predict(X_sin_cosin_test)
result = pd.DataFrame({'pred' : y_pred_test,
                        'id' : test_data.index})

In [22]:
result

Unnamed: 0,pred,id
0,3,147396
1,3,147591
2,3,147597
3,3,147603
4,3,147609
...,...,...
1306,3,898573
1307,3,898579
1308,3,898706
1309,3,898713


In [23]:
result.to_csv('random_forest_with_positional_encoding.csv')