# Assignment "FL Main Flavors"

## 1. Preparation

### 1.1 Libraries

In [None]:
import numpy as np 
import pandas as pd 
from datetime import datetime
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_squared_error

# We will use networkx objects to store empirical graphs, local datasets and models
import networkx as nx 
from sklearn.neighbors import kneighbors_graph
from numpy import linalg as LA

### 1.2 Helper functions

In [None]:
# The function generates a scatter plot of nodes (=FMI stations) using 
# latitude and longitude as coordinates. 
def plotFMI(G_FMI):
    """Draw the station graph as a scatter plot and show it.

    Every node is drawn at (longitude, latitude), labelled with its node
    index and coloured according to its 'cluster' attribute. Edges are drawn
    as gray line segments between the end points.

    Parameters
    ----------
    G_FMI : graph object
        Nodes are looked up as G_FMI.nodes[i] for i = 0, ..., n-1, so they
        must be labelled with consecutive integers starting at 0. Each node
        needs the attributes 'coord' (a (latitude, longitude) pair) and
        'cluster' (an integer cluster index).

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    num_stations = len(G_FMI.nodes)
    colors = np.array(['black', 'green', 'red', 'brown', 'deeppink',
                       'blue', 'olive', 'gray', 'orange', 'purple'])
    # Row i holds (latitude, longitude) of node i; the reshape rejects
    # 'coord' entries that are not pairs.
    coords = np.asarray(
        [G_FMI.nodes[i]['coord'] for i in range(num_stations)]
    ).reshape(num_stations, 2)

    # Create a plot
    fig, ax = plt.subplots()

    # Draw nodes
    for node in G_FMI.nodes:
        lat, lon = coords[node, 0], coords[node, 1]
        # Wrap the cluster index so that more clusters than palette entries
        # reuse colours instead of raising an IndexError.
        color = colors[G_FMI.nodes[node]['cluster'] % len(colors)]
        # zorder ensures nodes are on top of edges
        ax.scatter(lon, lat, color=color, s=4, zorder=5)
        ax.text(lon+0.1, lat+0.2, str(node), fontsize=8, ha='center', va='center', color=color, fontweight='bold')

    # Draw edges
    for start, end in G_FMI.edges:
        ax.plot([coords[start, 1], coords[end, 1]],
                [coords[start, 0], coords[end, 0]],
                linestyle='-', color='gray', alpha=0.5)

    ax.set_xlabel('longitude')
    ax.set_ylabel('latitude')
    ax.set_title('FMI stations')
    plt.show()
    
    
# The function connects each FMI station with 
# the nearest neighbours. 
def add_edges(graph, numneighbors=4):
    """Connect every node of `graph` with its nearest neighbours.

    The neighbours are determined by kneighbors_graph applied to the
    (latitude, longitude) pairs stored in the node attribute 'coord'.

    Parameters
    ----------
    graph : graph object
        Nodes are looked up as graph.nodes[i] for i = 0, ..., n-1, so they
        must be labelled with consecutive integers starting at 0, and each
        node needs a 'coord' attribute holding a (latitude, longitude) pair.
    numneighbors : int, default 4
        Number of nearest neighbours passed to kneighbors_graph.

    Returns
    -------
    graph
        The same object that was passed in, with the edges added in place.
    """
    # Use the size of the graph that was passed in, not a notebook global.
    nrnodes = len(graph.nodes)
    # Row i holds (latitude, longitude) of node i.
    coords = np.asarray(
        [graph.nodes[i]['coord'] for i in range(nrnodes)]
    ).reshape(nrnodes, 2)
    A = kneighbors_graph(coords, numneighbors, mode='connectivity', include_self=False)

    # Visit only the non-zero entries of the sparse connectivity matrix
    # instead of probing all n*n index pairs. Sorting keeps the row-major
    # order in which the edges are inserted.
    rows, cols = A.nonzero()
    for node, neighbor in sorted(zip(rows.tolist(), cols.tolist())):
        if node != neighbor:
            graph.add_edge(node, neighbor)
    return graph

# The function below extracts a feature vector and a label from each row 
# of the dataframe `data`. Each row is expected to hold an FMI weather 
# measurement with columns "Latitude", "Longitude", "temp", "Timestamp". 
# Returns the numpy arrays X (features) and y (labels).
def ExtractFeaureMatrixLabvelVector(data):
    """Build the feature matrix and label vector from weather measurements.

    Parameters
    ----------
    data : pandas.DataFrame
        One measurement per row, with the columns 'Latitude', 'Longitude',
        'temp' and 'Timestamp' (a string formatted as '%Y-%m-%d %H:%M:%S').

    Returns
    -------
    X : numpy.ndarray of shape (len(data), 7)
        Row-wise features [lat/100, lon/100, year/2025, month/13, day/32,
        hour/25, minute/61].
    y : numpy.ndarray of shape (len(data), 1)
        The 'temp' value of each row.
    """
    n_rows = len(data)
    X = np.zeros((n_rows, 7))
    y = np.zeros((n_rows, 1))

    # Nothing to fill in; the columns are not touched for an empty input.
    if n_rows == 0:
        return X, y

    records = zip(data['Latitude'], data['Longitude'],
                  data['temp'], data['Timestamp'])
    for row, (latitude, longitude, temperature, stamp) in enumerate(records):
        # Parse the date and time of the temperature measurement.
        when = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
        # Coordinates and calendar fields are scaled by fixed constants.
        X[row, :] = [
            float(latitude) / 100,
            float(longitude) / 100,
            float(when.year) / 2025,
            float(when.month) / 13,
            float(when.day) / 32,
            float(when.hour) / 25,
            float(when.minute) / 61,
        ]
        y[row, :] = temperature

    return X, y

## 2. Data

### 2.1 Dataset

In [None]:
# Load the weather measurements. Each row of the dataframe (one temperature
# measurement) is treated as a separate data point.
data = pd.read_csv('Assignment_MLBasicsData.csv')

# Number of data points (rows) and number of distinct station names.
num_datapoints = data.shape[0]
num_stations = len(data.name.unique())

### 2.2 Empirical graph

In [None]:
####################TODO####################
# TODO:
# 1. Construct the empirical graph G_FMI as a networkx.Graph() object.
# 2. Add a single node for each station. The helper functions plotFMI and
#    add_edges look nodes up as G_FMI.nodes[i] with i = 0, ..., num_stations-1,
#    so use these integers as node labels.
# 3. Each node i must have the following attributes: 
#   'samplesize' - the number of measurements of the i-th weather station,
#   'name' - the name of the i-th weather station,
#   'coord' - the coordinates (latitude, longitude) of the i-th weather station,
#   'X' - the feature matrix,
#   'y' - the label vector,
#   'cluster' - the integer index of the cluster to which node i belongs
#               (plotFMI uses it to pick the node colour).

# Placeholder: replace this statement with your implementation of G_FMI.
raise NotImplementedError
# G_FMI = 


# Add edges between each station and its nearest neighbors.
# NOTE: the node degree might be different for different nodes.
numneighbors = 4
G_FMI = add_edges(G_FMI, numneighbors=numneighbors)
print("The empirical graph is connected:", nx.is_connected(G_FMI))

# Visualize the empirical graph.
plotFMI(G_FMI)

## 3. Model

### 3.1 Main hyperparameters

In [None]:
# Hyperparameters: k is the number of clusters, seed is the random seed.
k, seed = 10, 4740

### 3.2 Student task #1 - K-Means with coordinates as a representation vector.

In [None]:
####################TODO####################
# TODO: 1. Create a 2-dimensional representation vector for each FMI
#          station, with entries being the latitude and longitude 
#          of the station.
#       2. Cluster the nodes of G_FMI using the Python class sklearn.cluster.KMeans.
#       3. Store the cluster index in the nodes' attribute 'cluster'. 
        
# Placeholder: replace this statement with your implementation.
raise NotImplementedError

    
# Plot the clustered graph.
plotFMI(G_FMI)

In [None]:
####################TODO####################
# TODO: 1. Compute the average temperature for each cluster.
#       2. Calculate the average (over all nodes) squared 
#          error loss (see the Lecture Notes 6.7) and store 
#          it in the variable avg_error, which is printed below.

# Placeholder: replace this statement with your implementation.
raise NotImplementedError
# avg_error = 

# Print the average error.
print(f"The average squared loss over all datapoints is {avg_error}")

### 3.3 Student task #2 - K-Means with GMM parameters as a representation vector.

In [None]:
# Define the number of components for the GMM. 
n_components = 2

####################TODO####################
# TODO: 1. Fit the GaussianMixture() model 
#          to each node in the G_FMI. Use
#          the pre-defined n_components and
#          random_state (seed) values.
#       2. Extract the parameters of the fitted
#          model.
#       3. Create a 2-dimensional representation matrix 
#          of the shape (207, 114) whose i-th row holds the
#          GMM parameters of node i.
#       4. Cluster the nodes of G_FMI using the Python class sklearn.cluster.KMeans.
#       5. Store the cluster labels in the nodes' attribute 'cluster'.
# HINT: GMM parameters can be extracted with 
#          .means_ - returns the matrix with
#                    entries being the mean vectors
#                    of each mixture component,
#          .covariances_ - returns the list of covariance matrices
#                          of each mixture component,
#          .weights_ - returns the weights of each mixture component.
#       Use .ravel() to flatten all parameters and .concatenate()
#       to stack them together. 
#       Therefore, the stacked parameters of each node have the shape (114, )
#       (presumably 2*7 mean entries + 2*7*7 covariance entries + 2 weights
#       for 7 features per data point - verify against your fitted model).
#       The raveled parameters are in the following order: means, covariances, weights. 

# Placeholder: replace this statement with your implementation.
raise NotImplementedError


    
# Plot the clustered graph.
plotFMI(G_FMI)

In [None]:
####################TODO####################
# TODO: 1. Compute the average temperature for each cluster.
#       2. Calculate the average (over all nodes) squared 
#          error loss (see the Lecture Notes 6.7) and store 
#          it in the variable avg_error, which is printed below.
# NOTE: You can copy your implementation from the cell above.

# Placeholder: replace this statement with your implementation.
raise NotImplementedError
# avg_error = 


# Print the average error.
print(f"The average squared loss over all datapoints is {avg_error}")

### 3.4 Student task #3 - K-Means with eigenvectors of the Laplacian matrix as a representation vector.

In [None]:
####################TODO####################
# TODO: 1. Construct the Laplacian matrix of G_FMI.
#       2. Compute the eigenvalues and eigenvectors 
#          of the Laplacian matrix.
#       3. Sort the eigenvalues in ascending order and 
#          reorder the eigenvectors accordingly.
#       4. Use the first k eigenvectors (after sorting) as
#          a representation vector.
#       5. Cluster the nodes of G_FMI using the Python class sklearn.cluster.KMeans.
#       6. Store the cluster labels in the nodes' attribute 'cluster'.

# Placeholder: replace this statement with your implementation.
raise NotImplementedError


# Plot the clustered graph.
plotFMI(G_FMI)

In [None]:
####################TODO####################
# TODO: 1. Compute the average temperature for each cluster.
#       2. Calculate the average (over all nodes) squared 
#          error loss (see the Lecture Notes 6.7) and store 
#          it in the variable avg_error, which is printed below.
# NOTE: You can copy your implementation from the cell above.

# Placeholder: replace this statement with your implementation.
raise NotImplementedError
# avg_error = 


# Print the average error.
print(f"The average squared loss over all datapoints is {avg_error}")