In [44]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from minisom import MiniSom

# X_feat holds the daily-average Z500 value for each individual grid point.
# Column names encode the location, e.g. "z50025.0220.0" is Z500 at lat=25.0, lon=220.0.
X_feat = pd.read_csv('X_feat.csv', index_col=0)

# Keep every row and every Z500 column; the leading "day" column is left out.
X = X_feat.iloc[:, 1:].to_numpy()

# Standardize the Z500 columns with StandardScaler before any SOM training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hyperparameter candidates. To make the search run faster while you are
# still learning the code, reduce the number of options listed per variable.
param_grid = {
    'x': [2, 3, 4],
    'y': [2, 3, 4],
    'learning_rate': [0.01, 0.1, 0.25, 0.5, 1],
    'sigma': [0.1, 0.5, 1.0, 1.5, 2],
}

# Every combination of the candidates above.
grid = ParameterGrid(param_grid)

# Trackers for the best configuration seen so far; the error starts at
# infinity so that the first evaluated configuration always replaces it.
best_params, best_quantization_error = None, float('inf')

# Quantization error: the average distance between each data point and the
# weight vector of its best matching unit (BMU) in the SOM.
def quantization_error(som, data):
    """Return the mean distance between each sample and its BMU's weights.

    Args:
        som: Trained map exposing ``get_weights()`` and ``winner(sample)``;
            ``winner`` must return an index usable on the weights array.
        data: Sized iterable of samples; each sample must be subtractable
            from a weight vector.

    Returns:
        The sum of ``np.linalg.norm(sample - weights[winner(sample)])`` over
        all samples, divided by ``len(data)``.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    # The weights do not change while scoring, so fetch them once rather
    # than once per sample.
    weights = som.get_weights()

    total = 0
    for sample in data:
        # Distance between the sample and the weights of its winning node.
        total += np.linalg.norm(sample - weights[som.winner(sample)])

    # Average over all samples.
    return total / len(data)

# Grid search: train one SOM per hyperparameter combination and keep the
# combination that yields the lowest quantization error.
# NOTE(review): quantization error tends to drop as the map gets larger, so
# this criterion alone may simply favour the biggest grid -- consider checking
# a second metric (e.g. topographic error) before trusting the winner.
for params in grid:

    # random_seed is fixed to the same value used for the final training
    # further down. Without it every candidate gets different random initial
    # weights and sample order on each run, so the selected "best" parameters
    # (and their error) could not be reproduced.
    som = MiniSom(
        x=params['x'],
        y=params['y'],
        input_len=X_scaled.shape[1],
        sigma=params['sigma'],
        learning_rate=params['learning_rate'],
        random_seed=42,
    )

    som.random_weights_init(X_scaled)

    som.train_random(data=X_scaled, num_iteration=10000)

    qe = quantization_error(som, X_scaled)

    # best_quantization_error starts at infinity; whenever a configuration
    # scores strictly lower, it becomes the new best.
    if qe < best_quantization_error:
        best_quantization_error = qe
        best_params = params

print(f"Best params: {best_params}, Best Quantization Error: {best_quantization_error}")

# Final SOM: retrain once with the best hyperparameters found by the grid search.
np.random.seed(42)

n_rows, n_columns = best_params['x'], best_params['y']
sigma, learning_rate = best_params['sigma'], best_params['learning_rate']

som = MiniSom(
    n_rows,
    n_columns,
    X_scaled.shape[1],
    sigma=sigma,
    learning_rate=learning_rate,
    random_seed=42,
)
som.random_weights_init(X_scaled)

# Pass verbose=True for more detail during training.
# train_batch can be used here instead, depending on what you prefer.
som.train_random(X_scaled, num_iteration=10000)

# Best matching unit (BMU) per day: the node the SOM assigns each row to.
bmus = [som.winner(sample) for sample in X_scaled]
X_feat['bmu'] = bmus

# get_weights() returns the Z500 pattern of every node in the SOM
# (the number of nodes is n_rows x n_columns).
# Example: weights[0, 0] is the node in the first row and first column; per
# the original author's note, with the provided data it has one value per
# grid point (2080).
weights = som.get_weights()

Best params: {'learning_rate': 0.1, 'sigma': 0.1, 'x': 4, 'y': 4}, Best Quantization Error: 28.092783524302263
