In [3]:
#basics
import pandas as pd
import numpy as np

#sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Import Data

In [4]:
# Show the kernel's current working directory via a shell escape.
!pwd


/home/alish/code/amelietatin/Predicting_land_cover/notebooks


In [5]:
# Path to the input table, relative to the notebook's working directory
# (.../Predicting_land_cover/notebooks), so the cell is not tied to one
# user's home directory.
DATA_PATH = "../data/final_data_2015_2035.csv"

# Latest quarter kept for this analysis; rows after it are discarded
# (the file name suggests the table extends to 2035 -- TODO confirm).
LAST_QUARTER = "2024-01-01"

df = (
    pd.read_csv(DATA_PATH)
    # "Unnamed: 0" is presumably a row index written by an earlier to_csv.
    .drop(columns="Unnamed: 0")
    # Parse the quarter column so it can be compared and sorted as dates.
    .assign(quarter_start=lambda frame: pd.to_datetime(frame["quarter_start"]))
    # Keep only quarters starting on or before LAST_QUARTER.
    .loc[lambda frame: frame["quarter_start"] <= LAST_QUARTER]
    # Renumber rows 0..n-1 after filtering.
    .reset_index(drop=True)
)

df

# Baseline Model: Persistence Model

The persistence model assumes that each site's land cover in the next quarter will be the same as in its most recent observed quarter. It is deliberately simple and serves as a minimal benchmark that any trained model should outperform.

## Preprocess the Data

In [10]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
data = df.copy(deep=True)

In [11]:
# Order rows by site, then chronologically within each site.
data = data.sort_values(by=['SITECODE', 'quarter_start'])

# Columns used as model inputs.
feature_cols = [
    'temperature_quarterly_mean',
    'precipitation_quarterly_mean',
    'water-vapor-pressure_quarterly_mean',
    'cloud-cover_quarterly_mean',
]

# Columns to be predicted (one per land-cover class).
target_cols = [
    'Bare Ground',
    'Built Area',
    'Crops',
    'Flooded Vegetation',
    'Grass',
    'Shrub and Scrub',
    'Snow and Ice',
    'Trees',
    'Water',
]

# Chronological split: quarters up to the cutoff train, later quarters test.
# Both masks are computed explicitly (rather than negating one) so rows with a
# missing quarter_start fall into neither set.
cutoff = '2023-12-31'
in_train = data['quarter_start'] <= cutoff
in_test = data['quarter_start'] > cutoff
train_df = data.loc[in_train]
test_df = data.loc[in_test]

# Sanity-check the split sizes.
print("Training set size:", train_df.shape)
print("Testing set size:", test_df.shape)


Training set size: (21141, 15)
Testing set size: (620, 15)


## Create the Baseline Model

In [12]:
# Display the held-out test rows for a visual check before building predictions.
test_df

Unnamed: 0,SITECODE,quarter_start,Bare Ground,Built Area,Crops,Flooded Vegetation,Grass,Shrub and Scrub,Snow and Ice,Trees,Water,temperature_quarterly_mean,precipitation_quarterly_mean,water-vapor-pressure_quarterly_mean,cloud-cover_quarterly_mean
21141,AT1301000,2024-01-01,3.609320,0.636298,7.003258,1.828857,6.159897,2.985530,0.000000,69.628906,5.610794,4.79,6.275796e-09,623.55690,0.667620
21142,AT1302000,2024-01-01,0.980301,0.904842,0.369008,2.407601,2.544262,5.885582,0.000000,85.920848,0.164198,4.50,6.374565e-09,620.18150,0.660555
21143,AT1304000,2024-01-01,0.071675,2.481003,17.280700,7.671624,47.136749,1.290211,0.000000,20.411768,0.000000,4.50,6.374565e-09,620.18150,0.660555
21144,AT2112000,2024-01-01,2.699038,0.254751,0.040753,0.790104,0.202969,11.287054,21.626740,58.950608,0.821455,-0.34,8.411814e-09,550.83984,0.575970
21145,AT2116000,2024-01-01,5.414122,1.642407,7.002508,3.981015,4.022754,14.768104,47.411005,5.321971,4.323929,-0.34,8.411814e-09,550.83984,0.575970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21756,SE0820042,2024-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,96.878517,3.121345,0.000000,-6.88,1.913455e-08,469.72640,0.780085
21757,SE0820295,2024-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,99.995652,0.000000,0.000000,-10.24,2.840418e-08,411.43936,0.868867
21758,SE0820614,2024-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,99.961524,0.024216,0.000000,-7.09,1.898806e-08,455.15042,0.755212
21759,SE0820615,2024-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,99.986081,0.000929,0.000000,-7.23,1.825365e-08,453.09683,0.751676


In [14]:
# Persistence forecast: each test row is predicted with the most recent
# training observation of the same SITECODE.

# Most recent training row per site. The explicit stable sort means this cell
# does not depend on train_df having been sorted by an earlier cell, and a
# single drop_duplicates pass replaces one full scan of train_df per site.
last_rows = (
    train_df
    .sort_values('quarter_start', kind='mergesort')
    .drop_duplicates(subset='SITECODE', keep='last')
)

# Per-site lookup (SITECODE -> full last row as a Series), kept for inspection.
last_known_values = {row['SITECODE']: row for _, row in last_rows.iterrows()}

# One row per site, target columns only. Selecting the columns from the frame
# keeps each column's own dtype; building predictions from mixed-type row
# Series (as before) produced object-dtype columns.
last_targets = last_rows.set_index('SITECODE')[target_cols]

# Look up every test row's site. Sites absent from the training data yield
# all-NaN rows. The result is re-labelled with the test index so predictions
# and actuals can be aligned by index.
predictions_df = last_targets.reindex(test_df['SITECODE'].to_numpy())
predictions_df.index = test_df.index

# Drop rows that cannot be scored: sites with no training history, or whose
# last known observation has a missing target value. Report how many were
# removed instead of dropping them silently.
n_before = len(predictions_df)
predictions_df = predictions_df.dropna()
n_dropped = n_before - len(predictions_df)
if n_dropped:
    print(f'Dropped {n_dropped} test rows without a complete last-known observation')

# Keep only the test rows that have a prediction, in the same order.
aligned_test_df = test_df.loc[predictions_df.index]


## Evaluate the Baseline Model

In [16]:

# Per-target error metrics, collected in target_cols order.
mae_values, rmse_values = [], []

for target in target_cols:
    # Actual vs. persisted values for this land-cover class.
    y_true = aligned_test_df[target]
    y_pred = predictions_df[target]

    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    mae_values.append(mae)
    rmse_values.append(rmse)
    print(f'{target}: MAE = {mae}, RMSE = {rmse}')

# Unweighted means of the per-target metrics (the "Average RMSE" is the mean
# of the per-target RMSEs, not an RMSE over all targets pooled together).
print(f'Average MAE: {np.mean(mae_values)}, Average RMSE: {np.mean(rmse_values)}')


Bare Ground: MAE = 1.9731863729605104, RMSE = 5.051114008947766
Built Area: MAE = 0.9032599556925837, RMSE = 3.2859559475391853
Crops: MAE = 3.0460982847683686, RMSE = 7.542150481818097
Flooded Vegetation: MAE = 1.1517716914092, RMSE = 2.9262907166714354
Grass: MAE = 3.9806182141082322, RMSE = 9.173627717585047
Shrub and Scrub: MAE = 6.474417362981379, RMSE = 11.152427546066196
Snow and Ice: MAE = 26.311703977884633, RMSE = 44.55848319164119
Trees: MAE = 15.507515522413996, RMSE = 24.564193396938887
Water: MAE = 7.252103936338634, RMSE = 18.235567729786105
Average MAE: 7.4000750353952816, Average RMSE: 14.054423415221546
