In [1]:
from pyarrow import fs
import pyarrow.parquet as pq
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
scheme = 'https'
endpoint = "s3.eu-central-1.wasabisys.com"
bucket = 'global-pastures'
httpfs = fs.S3FileSystem(scheme=scheme, endpoint_override=endpoint)

train_path = 'samples/lcv_pasture_classif.matrix.train_2000..2020_brazil.eumap_summer.school.2022.pq'
val_path = 'samples/lcv_pasture_classif.matrix.val_2000..2020_brazil.eumap_summer.school.2022.pq'
test_path = 'samples/lcv_pasture_classif.matrix.test_2000..2020_brazil.eumap_summer.school.2022.pq'

In [3]:
print("Reading data")

train_data = pq.ParquetDataset(f'{bucket}/{train_path}', filesystem=httpfs).read().to_pandas()
print(f' Train: {train_data.shape}')

val_data = pq.ParquetDataset(f'{bucket}/{val_path}', filesystem=httpfs).read().to_pandas()
print(f' Val: {val_data.shape}')

test_data = pq.ParquetDataset(f'{bucket}/{test_path}', filesystem=httpfs).read().to_pandas()
print(f' Test: {test_data.shape}')

Reading data
 Train: (1552, 373)
 Val: (1167, 373)
 Test: (1311, 370)


In [4]:
target_col = 'class'
label_col = 'class_label'
train_data[label_col] = train_data[target_col].astype(str).str.cat(train_data[label_col], sep = "-")
print("Training samples per class:\n")
print(train_data[label_col].value_counts())

Training samples per class:

3-Others                           1055
1-Seeded grass                      356
2-Natural or semi-natural grass     141
Name: class_label, dtype: int64


In [5]:
cov_idx = (list(train_data.columns).index(label_col) + 1)
covs = train_data.columns[cov_idx:]
print(f'There are {len(covs)} features available to the model')

There are 364 features available to the model


- Eliminate correlated features
- Backward feature selection
- Forward feature selection

# Eliminate correlated features

In [6]:
corr_matrix = train_data[covs].corr().abs() # Working on the X training data, no target
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.6)] #Orig 0.95
post_train_no_corr = train_data[covs].drop(to_drop, axis=1)
post_val_no_corr = val_data[covs].drop(to_drop, axis=1)
post_test_no_corr = test_data[covs].drop(to_drop, axis=1)
print("Number of remaining features: ", len(post_train_no_corr.columns), len(post_val_no_corr.columns))

Number of remaining features:  27 27


In [7]:
X = post_train_no_corr#.to_numpy()
y = train_data[target_col]#.to_numpy()

rf = RandomForestClassifier(random_state=1989, class_weight='balanced_subsample')
rf.fit(X, y)

In [8]:
val_expe = val_data[target_col]
val_pred = rf.predict(post_val_no_corr)

print(classification_report(val_expe, val_pred, target_names=sorted(list(train_data['class_label'].unique()))))

                                 precision    recall  f1-score   support

                 1-Seeded grass       0.82      0.58      0.68       205
2-Natural or semi-natural grass       0.75      0.46      0.57       138
                       3-Others       0.83      0.95      0.89       824

                       accuracy                           0.83      1167
                      macro avg       0.80      0.66      0.71      1167
                   weighted avg       0.82      0.83      0.81      1167



In [9]:
report = classification_report(val_expe, val_pred, 
                      target_names=sorted(list(train_data['class_label'].unique())), 
                      output_dict=True)
report["macro avg"]["f1-score"]

0.7136623521253127

# Feature importance

In [10]:
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
forest_importances = pd.Series(importances, index=post_train_no_corr.columns)
#fig, ax = plt.subplots()
#forest_importances.plot.bar(yerr=std, ax=ax)
#ax.set_title("Feature importances using MDI")
#ax.set_ylabel("Mean decrease in impurity")
#fig.tight_layout()

# Backward Feature Selection

In [11]:
x_t = post_train_no_corr.copy()
x_v = post_val_no_corr.copy()
benchmark_score = report["macro avg"]["f1-score"] # Including all features
imp = forest_importances.sort_values()
score = []
dropped_columns = []
for droppy in imp.index: #13 is the highest score for 0.95 but this needs to be automated
    x_t = x_t.drop(droppy, axis=1)
    x_v = x_v.drop(droppy, axis=1)
    if len(x_t.columns) < 1:
        break
    rf = RandomForestClassifier(random_state=1989, class_weight='balanced_subsample')
    rf.fit(x_t, y)
    val_pred = rf.predict(x_v)
    cr = classification_report(val_expe, val_pred, 
                               target_names=sorted(list(train_data['class_label'].unique())), 
                               output_dict=True)
    score.append(cr["macro avg"]["f1-score"])
    dropped_columns.append(droppy)
    #print(cr["macro avg"]["f1-score"])
  

Take best model found by backward feature selection

In [12]:
idx_drop = np.argmax(score)
#print(imp.index[0:idx_drop])
x_t = post_train_no_corr.copy()
x_train = x_t.drop(imp.index[0:idx_drop], axis=1)
x_v = post_val_no_corr.copy()
x_val = x_v.drop(imp.index[0:idx_drop], axis=1)
print("Final number of columns:", len(x_train.columns))

Final number of columns: 21


In [13]:
X=x_train
rf = RandomForestClassifier(random_state=1989, class_weight='balanced_subsample')
rf.fit(X, y)

In [14]:
val_pred = rf.predict(x_val)

print(classification_report(val_expe, val_pred, target_names=sorted(list(train_data['class_label'].unique()))))

                                 precision    recall  f1-score   support

                 1-Seeded grass       0.79      0.60      0.68       205
2-Natural or semi-natural grass       0.73      0.54      0.62       138
                       3-Others       0.85      0.94      0.89       824

                       accuracy                           0.83      1167
                      macro avg       0.79      0.69      0.73      1167
                   weighted avg       0.82      0.83      0.82      1167



In [15]:
x_test = post_test_no_corr.drop(imp.index[0:idx_drop], axis=1)
test_pred = rf.predict(x_test)
result = pd.DataFrame({ 'pred':test_pred, 'id': test_data.index })
result

Unnamed: 0,pred,id
0,3,147396
1,3,147591
2,3,147597
3,3,147603
4,3,147609
...,...,...
1306,3,898573
1307,3,898579
1308,3,898706
1309,3,898713


Finally to store the results execute:

In [16]:
result.to_csv('scarlet_ankit.csv')