In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from matplotlib import pyplot as plt


In [2]:

# Load data
df = pd.read_parquet('snotel_data.parquet')
df = df[df.station_triplet == '642:WA:SNTL']
df.head()

Unnamed: 0,date,snow_depth,B01,B02,B03,B04,B05,B06,B07,B08,B09,B10,is_snow,granule_id,station_triplet
2432,2013-04-14T18:57:52.232Z,119,5757,5671,5544,5451,5809,1693,1491,-9999,91,-966,0,G2246987419-LPCLOUD,642:WA:SNTL
2433,2013-04-30T18:57:49.955Z,105,3946,3779,3500,3290,3616,1335,1154,-9999,188,-1305,0,G2246948039-LPCLOUD,642:WA:SNTL
2434,2013-05-16T18:58:02.239Z,68,1114,1163,1473,1459,1748,180,119,-9999,16,295,0,G2244110989-LPCLOUD,642:WA:SNTL
2435,2013-04-23T18:51:38.326Z,111,6032,5995,5623,5493,5559,1518,1601,-9999,278,-1610,0,G2246987880-LPCLOUD,642:WA:SNTL
2436,2013-04-14T18:57:28.350Z,119,5742,5656,5525,5432,5793,1687,1485,-9999,91,-966,0,G2246987473-LPCLOUD,642:WA:SNTL


In [3]:

# Load input and output columns from data frame
input_columns = [f"B{i:02d}" for i in range(1, 11)] + ["is_snow"]
input_data = df[input_columns].values
output_columns = ['snow_depth']
output_data = df[output_columns].values


In [4]:
# Scale data
X_scaler = MinMaxScaler()
X_scaled = X_scaler.fit_transform(input_data).astype(np.float32)
Y_scaler = MinMaxScaler()
Y_scaled = Y_scaler.fit_transform(output_data).astype(np.float32)

In [5]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42)


# Train models

## Linear model



In [14]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_y_pred = linear_model.predict(X_test)
linear_pearsonr = pearsonr(linear_y_pred, y_test)
linear_pearsonr

PearsonRResult(statistic=array([0.5977261], dtype=float32), pvalue=array([5.71165855e-60]))

## Random Forest model

In [15]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
random_forest_y_pred = rf_model.predict(X_test)
random_forest_pearsonr = pearsonr(random_forest_y_pred, y_test[:,0])
random_forest_pearsonr

  return fit_method(estimator, *args, **kwargs)


PearsonRResult(statistic=np.float64(0.8651354035383223), pvalue=np.float64(3.6110275112210146e-183))