In [1]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, KMeans, AffinityPropagation
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

from utils import *

In [2]:
# Load the dataset (pandas parses the literal string 'NA' as NaN by default,
# so no explicit 'NA' -> NaN replacement is needed afterwards)
biomass_data = pd.read_csv('site_data.csv')

# Index by the site-identifying columns so that only product columns remain
site_indexed = biomass_data.set_index(['dataset', 'site_name', 'site_id', 'X', 'Y'])

# Transpose the DataFrame to have products as rows and locations as columns
data_transposed = site_indexed.T

# Keep only numeric products. The dtype filter is applied BEFORE transposing:
# select_dtypes works on columns, and a single non-numeric product column
# would turn every transposed column into object dtype, which would make a
# post-transpose filter drop the entire frame.
data_transposed_numeric = site_indexed.select_dtypes(include=[np.number]).T

# Drop locations (columns) for which every product is NaN
data_transposed_numeric = data_transposed_numeric.dropna(axis=1, how='all')

# NOTE(review): this column-mean imputer is not used by the imputation below;
# it is kept only in case a later cell references it.
imputer = SimpleImputer(strategy='mean')

# Impute missing values with the ROW (per-product) mean. fillna aligns a Series
# on column labels, so the frame is flipped to put products on the columns,
# filled, and flipped back.
product_means = data_transposed_numeric.mean(axis=1)
data_imputed = data_transposed_numeric.T.fillna(product_means).T

data_imputed

dataset,NEON,NEON,NEON,NEON,NEON,NEON,NEON,NEON,NEON,NEON,...,Ameriflux,Ameriflux,Ameriflux,Ameriflux,Ameriflux,Ameriflux,Ameriflux,Ameriflux,Ameriflux,Ameriflux
site_name,Harvard Forest,Smithsonian Conservation Biology Institute,Ordway-Swisher Biological Station,UNDERC,Konza Prairie Biological Station,Oak Ridge,Talladega National Forest,Woodworth,Central Plains Experimental Range,LBJ National Grassland,...,NEON Steigerwaldt Land Services (STEI),NEON Talladega National Forest (TALL),NEON Lower Teakettle (TEAK),NEON Treehaven (TREE),NEON The University of Kansas Field Station (UKFS),NEON University of Notre Dame Environmental Research Center (UNDE),NEON Woodworth (WOOD),NEON Wind River Experimental Forest (WREF),NEON Yellowstone Northern Range (Frog Rock) (YELL),Yellow Cab urban
site_id,HARV,SCBI,OSBS,UNDE,KONZ,ORNL,TALL,WOOD,CPER,CLBJ,...,US-xST,US-xTA,US-xTE,US-xTR,US-xUK,US-xUN,US-xWD,US-xWR,US-xYE,US-Ylw
X,-72.172660,-78.139500,-81.993430,-89.537250,-96.563090,-84.282600,-87.393270,-99.241356,-104.745602,-97.570000,...,-89.586400,-87.393300,-119.006000,-89.585700,-95.192100,-89.537300,-99.241400,-121.951900,-110.539100,-95.353600
Y,42.536900,38.892920,29.689270,46.233880,39.100770,35.964120,32.950460,47.128228,40.815534,33.401230,...,45.508900,32.950500,37.005800,45.493700,39.040400,46.233900,47.128200,45.820500,44.953500,29.789400
ESA CCI,21.046901,21.046901,21.046901,21.046901,21.046901,21.046901,21.046901,21.046901,21.046901,21.046901,...,21.046901,21.046901,141.225123,21.046901,21.046901,21.046901,21.046901,244.415145,7.986143,21.046901
Chopping et al.,18.783055,18.783055,18.783055,18.783055,18.783055,18.783055,18.783055,18.783055,0.0,18.783055,...,18.783055,18.783055,226.943207,18.783055,18.783055,18.783055,18.783055,18.783055,18.783055,18.783055
GEDI L4B,164.080719,334.279755,63.379155,155.069643,7.477551,162.482702,228.355491,4.161302,1.268689,17.868778,...,105.857848,228.293116,113.171759,111.140861,88.286984,155.035292,4.150213,357.726868,37.207724,7.105864
Liu et al.,37.921211,115.212837,69.890287,139.331772,10.838812,91.845345,85.784576,9.719982,9.621728,11.76426,...,128.052652,85.784576,40.415094,51.776399,27.528269,139.331772,9.7199,194.783264,59.85466,20.670036
LT-GNN,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246,...,31.403246,31.403246,248.404526,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246,31.403246
Menlove & Healey,114.922806,106.782768,42.734001,91.699257,10.135107,115.70589,125.903267,0.0,0.0,3.689552,...,44.224957,125.903267,176.307236,44.224957,29.088547,91.699257,0.0,320.595123,36.56377,8.727328
Xu et al.,69.325028,69.325028,69.325028,69.325028,69.325028,69.325028,69.325028,69.325028,0.0,69.325028,...,69.325028,69.325028,176.820676,69.325028,69.325028,69.325028,69.325028,819.682068,79.867683,69.325028


In [4]:
# Short alias for the imputed products-by-locations frame used by the cells below
df = data_imputed

In [None]:
# Test 1: build one noisy copy of the 'Liu et al.' row
liu_row = df.loc['Liu et al.']

# Summary statistics of the row; mean and std_dev parameterise the Gaussian noise
mean = np.mean(liu_row)
std_dev = np.std(liu_row)
var = np.var(liu_row)

# Draw one noise value per element of the row.
# NOTE: the noise is centred on the row mean (not on zero), so the noisy copy
# is offset from the original by roughly that mean.
noise = np.random.normal(loc=mean, scale=std_dev, size=liu_row.shape)

# Noisy copy = original row + noise
noisy_data_1 = liu_row + noise

In [None]:
# Display the mean of the 'Liu et al.' row (used as the centre of the Gaussian noise)
mean

In [None]:
# Initialize the noise scale factor
noise_scale = 1.0

# Build 10 noisy copies of the 'Liu et al.' row. The noise standard deviation
# grows with each iteration (scale factors 1.0, 1.5, ..., 5.5).
for i in range(1, 11):
    # Scale the standard deviation based on the loop iteration
    scaled_std_dev = std_dev * noise_scale
    
    # Generate random noise with the scaled standard deviation
    # (centred on the row mean, like the single-copy test)
    noise = np.random.normal(mean, scaled_std_dev, df.loc['Liu et al.'].shape)
    
    # Add the noise to the original data
    noisy_data_Liu = df.loc['Liu et al.'] + noise

    # Publish the copy as the global noisy_data_<i>; downstream code looks
    # these names up through globals(). The value stored is the copy built
    # just above (the previous code referenced an undefined name here and
    # raised NameError on a fresh kernel).
    var_name = f'noisy_data_{i}'
    globals()[var_name] = noisy_data_Liu
    
    # Increase the noise scale factor for the next iteration
    noise_scale += 0.5

In [None]:
n = 10  # Number of noisy_data_<i> vectors available as globals

# Look up noisy_data_1 ... noisy_data_n by name and gather them in order
noisy_data_list = [globals()[f'noisy_data_{i}'] for i in range(1, n + 1)]

# One row per noisy vector
noisy_data_dataset = np.vstack(noisy_data_list)
noisy_data_dataset

In [None]:
# Standardise every column of the stacked noisy vectors before clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(noisy_data_dataset)

# Ward-linkage agglomerative clustering into two groups
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
clusters = cluster.fit_predict(data_scaled)

# No check for a -1 "noise" label is made here: agglomerative clustering
# assigns every sample to one of the n_clusters groups, so that label (a
# DBSCAN convention) can never appear in `clusters`.

In [None]:
# Display the cluster label assigned to each row of the scaled noisy dataset
clusters

In [5]:
def generate_noisy_data(original_data, n_vectors, rng=None):
    """
    Generate n_vectors noisy copies of the given original_data.

    Each copy is ``original_data + noise``, where the noise is drawn
    independently per element from a Gaussian whose mean and standard
    deviation are ``np.mean(original_data)`` and ``np.std(original_data)``.
    Note that the noise is therefore NOT zero-centred: every copy is offset
    from the original by roughly the mean of the data.

    Args:
        original_data (array-like): The original data, e.g. a 1-D
            numpy.ndarray or pandas.Series. Must expose ``.shape``.
        n_vectors (int): The number of noisy data vectors to generate
            (must be at least 1).
        rng (numpy.random.Generator | numpy.random.RandomState | None):
            Source of randomness. When None (the default) the global
            ``np.random`` state is used, so ``np.random.seed`` still controls
            the output. Pass a seeded generator for reproducible results
            that do not depend on global state.

    Returns:
        numpy.ndarray: A 2D array containing the n_vectors of noisy data,
                        with each row representing one noisy data vector.

    Raises:
        ValueError: If n_vectors is less than 1.
    """
    if n_vectors < 1:
        # np.vstack would fail on an empty list with a less helpful message
        raise ValueError("n_vectors must be a positive integer")

    # Fall back to the module-level functions (global state) when no
    # generator is supplied; both expose the same normal(loc, scale, size).
    sampler = np.random if rng is None else rng

    mean = np.mean(original_data)
    std_dev = np.std(original_data)

    noisy_data_list = [
        original_data + sampler.normal(mean, std_dev, original_data.shape)
        for _ in range(n_vectors)
    ]

    return np.vstack(noisy_data_list)

# Seed the global NumPy RNG so the noisy vectors (and the clustering that
# follows) are reproducible under Restart & Run All.
NOISE_SEED = 42
np.random.seed(NOISE_SEED)

# Generate 10 vectors of noisy data from the first dataset
noisy_data_1 = generate_noisy_data(df.loc['Liu et al.'], 10)

# Generate 10 vectors of noisy data from the second dataset
noisy_data_2 = generate_noisy_data(df.loc['GEDI L4B'], 10)

# Combine the noisy data vectors from both datasets:
# rows 0-9 come from 'Liu et al.', rows 10-19 from 'GEDI L4B'
Liu_Gedi = np.vstack((noisy_data_1, noisy_data_2))
Liu_Gedi

array([[139.30267657, 232.96131899, 119.04963722, ..., 206.62545021,
        213.68517957,  73.15109124],
       [ 82.93466021, 153.40668196, 124.18889895, ..., 262.65649041,
          3.36904635,  86.69594518],
       [ 33.87134755,  84.2327844 , 121.46424622, ..., 180.147882  ,
        100.94698919,  44.34349035],
       ...,
       [349.52722685, 487.93017236,  88.39186357, ..., 402.1479478 ,
         66.92332817, -60.98227256],
       [177.48737548, 352.67430687, 167.61578434, ..., 360.48319859,
        -24.87188334,  23.12093094],
       [145.69148822, 307.07878323,  64.21689124, ..., 441.15198891,
         23.4367843 , 209.57005299]])

In [6]:
# Standardise every column of the combined Liu/GEDI noisy vectors before clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(Liu_Gedi)

# Ward-linkage agglomerative clustering into two groups
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
clusters = cluster.fit_predict(data_scaled)

# No check for a -1 "noise" label is made here: agglomerative clustering
# assigns every sample to one of the n_clusters groups, so that label (a
# DBSCAN convention) can never appear in `clusters`.

In [7]:
# Display the cluster label per row of Liu_Gedi
# (rows 0-9 derive from 'Liu et al.', rows 10-19 from 'GEDI L4B')
clusters

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)