In [1]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from pyarrow import fs
import pyarrow.parquet as pq
from sklearn.metrics import f1_score
from pytorch_lightning.trainer import Trainer
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

## DataLoader


In [72]:
# Locations of the train / validation / test matrices (plain literals: there is
# nothing to interpolate). The cell that follows re-binds these names to the
# loaded DataFrames.
train_data = "./data/lcv_pasture_classif.matrix.train_2000..2020_brazil.eumap_summer.school.2022.pq"
val_data = "./data/lcv_pasture_classif.matrix.val_2000..2020_brazil.eumap_summer.school.2022.pq"
test_data = "./data/lcv_pasture_classif.matrix.test_2000..2020_brazil.eumap_summer.school.2022.pq"

In [73]:
def _load_parquet_frame(path):
    """Read the parquet dataset at ``path`` and return it as a pandas DataFrame."""
    return pq.ParquetDataset(path).read().to_pandas()


# Each name holds a file path on entry and the loaded DataFrame afterwards.
train_data = _load_parquet_frame(train_data)
val_data = _load_parquet_frame(val_data)
test_data = _load_parquet_frame(test_data)

In [51]:
# target_col: column holding the class id used as the training target.
# label_col: column after which the feature columns start.
target_col, label_col = 'class', 'class_label'

In [52]:
# Every column to the right of the label column is used as a model feature.
cov_idx = 1 + train_data.columns.tolist().index(label_col)
covs = train_data.columns[cov_idx:]
print(f'There are {len(covs)} features available to the model')

There are 364 features available to the model


In [53]:
# The scaling statistics are estimated from the training features only.
train_feature_matrix = train_data[covs].to_numpy()
standard_scaler = StandardScaler()
standard_scaler.fit(train_feature_matrix)

In [54]:
class Raster_data(Dataset):
    """Map-style dataset serving standardised feature rows of one data split.

    Parameters
    ----------
    data_type : str
        Which split to serve: ``"train"``, ``"val"`` or ``"test"``. The matching
        module-level frame (``train_data`` / ``val_data`` / ``test_data``) is used.
    columns : sequence of column labels
        Feature columns to extract (defaults to ``covs``).
    scaler : fitted transformer
        Object exposing ``transform``; applied to the feature matrix
        (defaults to ``standard_scaler``).

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the three supported split names
        (previously any unknown value silently selected the test split).
    """

    def __init__(self,data_type="train",columns=covs,scaler=standard_scaler):
        super(Raster_data,self).__init__()
        if data_type == "train":
            frame = train_data
        elif data_type == "val":
            frame = val_data
        elif data_type == "test":
            frame = test_data
        else:
            raise ValueError(
                f"data_type must be 'train', 'val' or 'test', got {data_type!r}")

        self.data_type = data_type
        self.data = frame
        self.columns = columns
        self.scaler = scaler

        # Standardise the whole split once instead of slicing the DataFrame and
        # calling the scaler again for every single item.
        self.features = scaler.transform(frame[columns].to_numpy())
        # The target column is only read for the labelled splits.
        self.targets = None if data_type == "test" else frame[target_col].to_numpy()

    def __len__(self) :
        """Return the number of rows in the selected split."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return ``(features, target)`` for row ``idx``.

        ``features`` is the standardised feature vector of the row and
        ``target`` is a length-1 array with the row's class id. For the test
        split a constant ``(10, 10)`` zero tensor is returned in place of the
        target so that items keep their two-element structure.
        """
        features = np.squeeze(self.features[idx])
        if self.targets is None:
            # Deterministic placeholder; the test split is served without labels.
            return features, torch.zeros((10,10))
        return features, self.targets[[idx]]
    

In [55]:
# One dataset per split.
raster_train = Raster_data(data_type="train")
raster_val = Raster_data(data_type="val")
raster_test = Raster_data(data_type="test")

# Full-batch loaders: each loader yields its whole split as a single batch.
raster_train_loader = DataLoader(dataset=raster_train, batch_size=len(raster_train))
raster_val_loader = DataLoader(dataset=raster_val, batch_size=len(raster_val))
raster_test_loader = DataLoader(dataset=raster_test, batch_size=len(raster_test))

In [9]:
class MLP(nn.Module):
    """Two-hidden-layer perceptron (32 and 16 units) with a linear classifier head.

    Parameters
    ----------
    input_layer : int
        Number of input features.
    output_layer : int
        Number of output logits. This argument was previously ignored and the
        head was always built with 3 outputs; it is now honoured (the default
        of 3 keeps existing callers unchanged).
    """

    def __init__(self,input_layer=364,output_layer=3):
        super(MLP,self).__init__()
        # Feature extractor: Linear -> BatchNorm -> ReLU, twice.
        self.backbone_layer = nn.Sequential(
                        nn.Linear(input_layer,32),
                        nn.BatchNorm1d(32),
                        nn.ReLU(),
                        nn.Linear(32,16),
                        nn.BatchNorm1d(16),
                        nn.ReLU())

        self.classifier_layer = nn.Linear(16,output_layer)

    def forward(self,x):
        """Return the class logits for the batch ``x``."""
        x = self.backbone_layer(x)
        x = self.classifier_layer(x)
        return x

    def last_layer_representation(self,x):
        """Return the 16-dimensional backbone output, i.e. the input of the classifier head."""
        return self.backbone_layer(x)
                        

In [18]:
class MLP_lightning(pl.LightningModule):
    """Lightning wrapper that trains an ``MLP`` with class-weighted cross-entropy.

    Parameters
    ----------
    mlp : nn.Module or None
        Network to train. It was previously discarded in favour of a fresh
        ``MLP()``; it is now used, and ``None`` falls back to ``MLP()``.
    l1_weight : float
        Multiplier of the L1 penalty added to the training loss
        (default 0.001, the value that used to be hard-coded).

    Targets arrive 1-based from the data loaders and are shifted to 0-based
    indices before they are compared with the network's logits / argmax.
    """

    def __init__(self,mlp,l1_weight=0.001):
        super(MLP_lightning,self).__init__()
        self.mlp = mlp if mlp is not None else MLP()
        self.l1_weight = l1_weight
        # Per-class weights of the cross-entropy loss.
        self.criterion_loss = nn.CrossEntropyLoss(weight=torch.Tensor([0.48545972, 1.43865169, 3.63234043]))

    def training_step(self,batch,batch_idx):
        """Return the weighted cross-entropy plus the L1 penalty; logs ``train_loss``."""
        X,y = batch
        pred_y = self.mlp(X)
        # reshape(-1) keeps a 1-D target even for a batch of one sample,
        # where torch.squeeze would collapse it to a scalar.
        training_loss = self.criterion_loss(pred_y,y.reshape(-1)-1)
        # Penalise this module's own parameters rather than those of a
        # notebook-level global instance.
        regularize_loss = sum(torch.sum(torch.abs(params)) for params in self.parameters())
        self.log_dict({'train_loss' : training_loss})
        return training_loss + self.l1_weight*regularize_loss

    def validation_step(self,batch,batch_idx):
        """Log the macro-averaged F1 score of the batch as ``average_f1_score``."""
        X,y = batch
        pred_y = torch.argmax(self.mlp(X),dim=1)
        # Shift the targets to 0-based so they live in the same label space as
        # the argmax predictions (the same shift is used for the training loss).
        true_y = y.reshape(-1)-1
        self.log_dict({'average_f1_score' :
                       f1_score(true_y.cpu().numpy(),
                                pred_y.cpu().numpy(),
                                average='macro')})

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate 1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
           

In [19]:
# Lightning module around a freshly initialised MLP, trained on CPU for 100 epochs.
mlp_module = MLP_lightning(mlp=MLP())
trainer = Trainer(max_epochs=100, gpus=0)

GPU available: True, used: False
TPU available: None, using: 0 TPU cores


In [20]:
# Fit the module on the training loader; the validation loader is the one fed to validation_step.
trainer.fit(mlp_module,raster_train_loader,raster_val_loader)


  | Name           | Type             | Params
----------------------------------------------------
0 | mlp            | MLP              | 12.4 K
1 | criterion_loss | CrossEntropyLoss | 0     
----------------------------------------------------
12.4 K    Trainable params
0         Non-trainable params
12.4 K    Total params


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [21]:
# Put the module (including its BatchNorm layers) into evaluation mode before inference.
mlp_module.eval()

MLP_lightning(
  (mlp): MLP(
    (backbone_layer): Sequential(
      (0): Linear(in_features=364, out_features=32, bias=True)
      (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=32, out_features=16, bias=True)
      (4): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU()
    )
    (classifier_layer): Linear(in_features=16, out_features=3, bias=True)
  )
  (criterion_loss): CrossEntropyLoss()
)

In [22]:
# Predict the validation split. Results of all batches are concatenated, so
# `y` / `y_pred` cover the whole loader even if it yields more than one batch
# (the previous loop kept only the last batch). Inference runs without
# building an autograd graph.
batch_targets, batch_preds = [], []
with torch.no_grad():
    for x, batch_y in raster_val_loader:
        batch_targets.append(batch_y)
        batch_preds.append(torch.argmax(mlp_module.mlp(x), dim=1))
y = torch.cat(batch_targets)
y_pred = torch.cat(batch_preds)

In [23]:
# Targets are shifted by one so they match the 0-based argmax predictions.
mlp_val_report = classification_report(y - 1, y_pred)
print(mlp_val_report)

              precision    recall  f1-score   support

           0       0.92      0.34      0.50       205
           1       0.62      0.11      0.19       138
           2       0.76      0.99      0.86       824

    accuracy                           0.77      1167
   macro avg       0.77      0.48      0.51      1167
weighted avg       0.77      0.77      0.72      1167



## Random Forest over MLP

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
def _gather_batches(loader):
    """Concatenate every batch yielded by ``loader`` into one (inputs, targets) pair."""
    inputs, targets = zip(*loader)
    return torch.cat(inputs), torch.cat(targets)


# Materialise each split as tensors. The previous loops used `train_data`,
# `val_data` and `test_data` as loop variables, which overwrote the DataFrames
# of the same name (breaking the later `test_data.index`), and they kept only
# the last batch of each loader.
X_train, y_train = _gather_batches(raster_train_loader)
X_val, y_val = _gather_batches(raster_val_loader)
X_test, _ = _gather_batches(raster_test_loader)

In [57]:
# Backbone output of the trained MLP for the training inputs, as a NumPy array.
with torch.no_grad():
    x_train_repr = mlp_module.mlp.last_layer_representation(X_train).numpy()

In [58]:
# Random forest trained on the MLP's backbone representation.
# The targets come out of the loader as an (n, 1) column; flatten them to the
# 1-D shape scikit-learn expects so no DataConversionWarning is raised.
rf = RandomForestClassifier(random_state=1989)
rf.fit(x_train_repr, np.asarray(y_train).ravel())

  rf.fit(x_train_repr,y_train)


In [59]:
# Backbone output of the trained MLP for the validation inputs, as a NumPy array.
with torch.no_grad():
    x_val_repr = mlp_module.mlp.last_layer_representation(X_val).numpy()

In [60]:
# Random-forest class predictions for the validation representations.
y_pred = rf.predict(X=x_val_repr)

In [61]:
# Validation report of the random forest; both arguments use the original class ids.
rf_val_report = classification_report(y_val, y_pred)
print(rf_val_report)

              precision    recall  f1-score   support

           1       0.75      0.68      0.71       205
           2       0.67      0.45      0.54       138
           3       0.86      0.93      0.89       824

    accuracy                           0.83      1167
   macro avg       0.76      0.69      0.71      1167
weighted avg       0.82      0.83      0.82      1167



In [64]:
# Backbone output of the trained MLP for the test inputs, as a NumPy array.
with torch.no_grad():
    X_test_repr = mlp_module.mlp.last_layer_representation(X_test).numpy()

In [65]:
# Random-forest class predictions for the test representations.
y_pred = rf.predict(X=X_test_repr)

In [75]:
# Pair every test prediction with the index of its source row. The index is
# taken from the frame held by the test dataset, because the name `test_data`
# is re-bound to a batch of tensors by the batch-extraction cell above and is
# then no longer a DataFrame on a top-to-bottom run.
result = pd.DataFrame({'pred' : y_pred,
                       'id' : raster_test.data.index})

In [76]:
# Display the prediction table (columns: pred, id).
result

Unnamed: 0,pred,id
0,3,147396
1,3,147591
2,3,147597
3,3,147603
4,3,147609
...,...,...
1306,3,898573
1307,3,898579
1308,3,898706
1309,3,898713


In [77]:
# Write the predictions to a CSV file in the working directory.
result.to_csv("random_forest_over_mlp.csv")