In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the simulation table; the first CSV column becomes the row index.
csv_file = '../data/RtmSimulation_kickstart.csv'
dataframe = pd.read_csv(
    csv_file,
    header=0,
    sep=',',
    index_col=0,
)
print(dataframe.columns)

# Missing-value count per column; the grand total is reported first.
nan_counts = dataframe.isnull().sum()
print(nan_counts.sum())

# Keep only the columns that actually contain missing values.
nan_result = dict(nan_counts[nan_counts > 0])
print(f'Nan values distribution statistics: {nan_result}')

Index(['lai', 'wetness', 'treeSpecies', 'Sentinel_2A_492.4',
       'Sentinel_2A_559.8', 'Sentinel_2A_664.6', 'Sentinel_2A_704.1',
       'Sentinel_2A_740.5', 'Sentinel_2A_782.8', 'Sentinel_2A_832.8',
       ...
       'w2491', 'w2492', 'w2493', 'w2494', 'w2495', 'w2496', 'w2497', 'w2498',
       'w2499', 'w2500'],
      dtype='object', length=2114)
66
Nan values distribution statistics: {'Sentinel_2A_704.1': 10, 'Sentinel_2A_740.5': 10, 'Sentinel_2A_782.8': 10, 'w469': 10, 'w470': 5, 'w471': 5, 'w473': 8, 'w474': 8}


In [3]:
# Show eight of the highest-numbered wavelength columns (w2491 .. w2498).
columns = [f'w{band}' for band in range(2491, 2499)]
dataframe.loc[:, columns]

Unnamed: 0_level_0,w2491,w2492,w2493,w2494,w2495,w2496,w2497,w2498
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.037392,0.026100,0.033480,0.027219,0.033905,0.017797,0.038259,0.020111
2,0.028002,0.030901,0.039650,0.029524,0.032461,0.030735,0.023527,0.029523
3,0.015169,0.020155,0.025696,0.026504,0.028994,0.031195,0.025666,0.032225
4,0.029487,0.031408,0.032888,0.029878,0.027617,0.034964,0.031996,0.032882
5,0.024510,0.023754,0.026276,0.031367,0.039625,0.036997,0.022577,0.039619
...,...,...,...,...,...,...,...,...
996,0.022563,0.031964,0.031600,0.040257,0.037817,0.019517,0.027787,0.024043
997,0.027160,0.020422,0.017963,0.020281,0.017511,0.016557,0.017318,0.026807
998,0.027540,0.029444,0.033399,0.023800,0.029792,0.035015,0.027271,0.022583
999,0.021879,0.025854,0.019384,0.027878,0.017346,0.026862,0.024057,0.027240


In [3]:
# Drop columns that are not used downstream. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
dataframe = dataframe.drop(columns=['treeSpecies'], errors='ignore')

# Impute missing values with the per-column mean. The columns to fill are taken
# from the frame as it is *now* instead of from `nan_result` (computed in an
# earlier cell, before the drop), so this cell does not depend on stale state
# and can never look up a column that was just removed.
values = {k: dataframe[k].mean() for k in dataframe.columns[dataframe.isnull().any()]}

# Rebind rather than mutate in place, so re-running the cell is harmless.
dataframe = dataframe.fillna(values)
# sentinel columns number: 10, wavelength columns number: 2101

In [4]:
from sklearn.manifold import TSNE

# Positional column layout used below (assumes 'treeSpecies' has already been
# dropped from `dataframe` -- verify if the cell order changes):
#   0 = lai (used as the regression target), 1 = wetness,
#   2..11 = Sentinel bands, 12.. = wavelength columns.
use_another_tsne = False
if not use_another_tsne:
    print('Using default tsne method')
    tsne_model = TSNE(n_components=3, learning_rate='auto', init='random', perplexity=30, random_state=0)

    # Embed the wavelength block first, with perplexity 30.
    wavelength_feature_data = dataframe.iloc[:, 12:]
    wavelength_embeddings = tsne_model.fit_transform(wavelength_feature_data)
    print(wavelength_embeddings.shape)

    # Re-use the same estimator for the Sentinel block with a smaller perplexity.
    sentinel_feature_data = dataframe.iloc[:, 2:12]
    tsne_model.set_params(n_components=3, perplexity=5)
    sentinel_embeddings = tsne_model.fit_transform(sentinel_feature_data)
    print(sentinel_embeddings.shape)

    # Sentinel embedding columns come first, wavelength embedding columns second.
    embeddings = np.hstack((sentinel_embeddings, wavelength_embeddings))
    # (Disabled experiment: a further t-SNE pass with n_components=2, perplexity=10
    # over the concatenated embeddings.)
else:
    # Single t-SNE over every column from position 2 onwards.
    print('Using another tsne method')
    tsne_model = TSNE(n_components=3, n_jobs=4, random_state=0)
    embeddings = tsne_model.fit_transform(dataframe.iloc[:, 2:])
    print(embeddings.shape)

# Append the raw wetness column as one extra feature (kept as an (n, 1) column).
wetness_features = dataframe.iloc[:, 1].values[:, np.newaxis]
embeddings = np.hstack((embeddings, wetness_features))
print(embeddings.shape)

# Target values, stored as an (n, 1) column vector.
labels = dataframe.iloc[:, 0].values[:, np.newaxis]


Using default tsne method
(1000, 3)
(1000, 3)
(1000, 7)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# 80/20 split with a fixed seed so the reported scores are reproducible.
train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

# The targets arrive as an (n, 1) column vector. For single-output regression
# scikit-learn expects a 1-D target and otherwise warns about the column-vector
# shape, so fit and score against flattened copies. The split variables above
# are left with their original shape.
y_train = np.ravel(train_labels)
y_test = np.ravel(test_labels)

# Display name -> unfitted estimator; evaluated in insertion order.
regressors = {
    'linear regression': LinearRegression(),
    'random forest': RandomForestRegressor(n_estimators=100, max_depth=100, random_state=42),
    'MLP': MLPRegressor(hidden_layer_sizes=(1000, 100, 10), max_iter=1000, random_state=42, early_stopping=True),
}

# NOTE: for regressors `score` returns the coefficient of determination (R^2),
# not a classification accuracy. The printed wording is kept unchanged so new
# runs stay comparable with previously recorded output.
# `m` is kept as the loop variable so that, as before, it refers to the fitted
# MLP once the cell has finished.
for regressor_name, m in regressors.items():
    m.fit(train_embeddings, y_train)
    print(f'Training accuracy using {regressor_name}:', m.score(train_embeddings, y_train))
    print(f'Testing accuracy using {regressor_name}:', m.score(test_embeddings, y_test))


# Configurations:
# Perplexity: 30, 10, 15, 40
# use_another_tsne = False, sentinel_feature_embedding n_components=3, wavelength_feature_embedding n_components=3 -> MLP regressor: 0.903
# use_another_tsne = False, sentinel_feature_embedding n_components=2, wavelength_feature_embedding n_components=3 -> RandomForest regressor: 0.893
# use_another_tsne = False, sentinel_feature_embedding n_components=1, wavelength_feature_embedding n_components=3 -> RandomForest regressor: 0.895, MLP: 0.881

Training accuracy using linear regression: 0.8275324923664742
Testing accuracy using linear regression: 0.7895184061988203


  return fit_method(estimator, *args, **kwargs)


Training accuracy using random forest: 0.9894795157885877
Testing accuracy using random forest: 0.9026077986480521


  y = column_or_1d(y, warn=True)


0.8997756721772292
Training accuracy using MLP: 0.9413370581643905
Testing accuracy using MLP: 0.8878431381927037
