In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Step 1
First, we read the CSV and take a quick look at the basic shape of the data.

In [None]:
# The raw CSV ships without a header row, so pandas is told not to treat the
# first line as column names; proper names are assigned in the next cell.
df = pd.read_csv("data/ClimateDataBasel.csv", header=None)

The CSV has no header row, so below we define the missing column names for the dataset.

In [None]:
# Label the 18 unnamed columns by position.
# NOTE(review): the position-to-meaning mapping cannot be checked against the
# CSV itself (it carries no header) — confirm against the dataset description.
df.columns = [
    # min / max / avg triples for the three quantities measured that way
    *(
        f"{quantity}_{stat}"
        for quantity in ("temp", "humidity", "pressure")
        for stat in ("min", "max", "avg")
    ),
    # remaining single-valued measurements
    "rain",
    "snow",
    "solar",
    "wind_speed",
    "wind_dir",
    "visibility",
    "air_quality",
    "ozone",
    "uv_index",
]

Now we are going to get an idea of the general shape of the data and see whether there are any obvious abnormalities.
We will also check whether any features have incorrect data types.

In [None]:
# A notebook cell only renders its final expression automatically. The shape,
# column list and head() preview are therefore passed to display() explicitly;
# as bare expressions in the middle of the cell they were computed and then
# silently discarded.
display(df.shape, df.columns.tolist())
display(df.head())

# info() writes its report (dtypes, non-null counts) straight to stdout.
df.info()

# Summary statistics, transposed so that each feature occupies one row.
df.describe().T

Next, let's double-check whether there are any missing values.

In [None]:
# Number of missing entries in each column.
print(df.isnull().sum())

# DataFrame.info() prints its own report and returns None, so it is called
# directly: wrapping it in print() only appended a stray "None" to the output.
df.info()

Okay, it doesn't look like there are any missing values.

Now, let's graph this data to get an idea of the distributions.

In [None]:
# One histogram per column. DataFrame.hist creates its own figure (sized by
# `figsize`), so no separate plt.figure() call is made beforehand — doing so
# only produced an additional, empty figure in the cell output.
df.hist(bins=30, figsize=(15, 10), color='blue', edgecolor='black')
plt.tight_layout()
plt.show()

Now let's standardise and normalise our data.
To do this, we will use scikit-learn's `StandardScaler` (standardisation) and `MinMaxScaler` (normalisation).


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise: rescale every column to zero mean and unit variance, then wrap
# the resulting array back into a DataFrame with the original column names.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

# Normalise: rescale every column linearly onto the [0, 1] interval.
normaliser = MinMaxScaler(feature_range=(0, 1))
normalised_data = normaliser.fit_transform(df)
normalised_df = pd.DataFrame(normalised_data, columns=df.columns)

# Only the last expression of a cell is rendered automatically, so the
# standardised preview is shown explicitly; previously it was never displayed.
display(scaled_df.head())
normalised_df.head()

## Outlier Detection

We are going to use Chebyshev's inequality to detect outliers. For any distribution, at most $1/k^2$ of the values can lie more than $k$ standard deviations from the mean.

First, we use k=4, as a strict approach to detect extreme outliers.
Then, if needed, we can relax k to 3.
My focus is on removing the **smallest** number of data points possible and only removing extreme outliers.

In [None]:
# Chebyshev-style threshold: a value is treated as extreme when it lies more
# than k standard deviations away from its column mean.
# NOTE(review): the accompanying narrative describes k=4 (relaxing to 3), yet
# the value set here is 8 — confirm which one the reported counts refer to.
k = 8

# Per-column mean and standard deviation.
mean = df.mean()
stdev = df.std()

# Absolute z-score of every cell: its distance from the column mean, measured
# in standard deviations.
z_scores = np.abs((df - mean) / stdev)

# A row is flagged as soon as any one of its features exceeds the threshold.
outliers = (z_scores > k).any(axis=1)

# Collect the flagged rows so they can be inspected before deciding on removal.
outlier_data = df[outliers]
n_flagged = outlier_data.shape[0]
print(f"Number of outliers detected: {n_flagged}")
print(f"Outlier ratio: {n_flagged / df.shape[0]:.2%}")


def _range_violations(frame, prefix):
    """Return rows of `frame` whose `<prefix>_min/_max/_avg` columns are inconsistent.

    Returns a pair of DataFrames:
      * rows where the max column is below the min column;
      * rows where the avg column falls outside the [min, max] interval.
    """
    low = frame[f"{prefix}_min"]
    high = frame[f"{prefix}_max"]
    avg = frame[f"{prefix}_avg"]
    inverted = frame[high < low]
    avg_outside = frame[(avg < low) | (avg > high)]
    return inverted, avg_outside


# Internal-consistency checks: max must not be below min, and avg must lie
# between min and max, for each of the three min/max/avg feature groups.
temp_anomalies, temp_avg_anomalies = _range_violations(df, "temp")
print(f"Number of temperature anomalies detected: {temp_anomalies.shape[0]}")
print(f"Number of temperature average anomalies detected: {temp_avg_anomalies.shape[0]}")

humidity_anomalies, humidity_avg_anomalies = _range_violations(df, "humidity")
print(f"Number of humidity anomalies detected: {humidity_anomalies.shape[0]}")
print(f"Number of humidity average anomalies detected: {humidity_avg_anomalies.shape[0]}")

pressure_anomalies, pressure_avg_anomalies = _range_violations(df, "pressure")
print(f"Number of pressure anomalies detected: {pressure_anomalies.shape[0]}")
print(f"Number of pressure average anomalies detected: {pressure_avg_anomalies.shape[0]}")

# Preview of the first flagged rows.
outlier_data.head(10)



72 outliers were detected, which is around 4.08% of the total data set.

In [None]:
# Scatter every feature against the row index on one large canvas. Rows flagged
# in `outliers` are drawn in red, all other rows in blue, using small markers.
point_colours = outliers.map({True: 'red', False: 'blue'})

fig = plt.figure(figsize=(20, 15))
for slot, column in enumerate(df.columns, start=1):
    # Fixed 4x5 grid: room for up to 20 features, one panel per column.
    ax = fig.add_subplot(4, 5, slot)
    ax.scatter(df.index, df[column], c=point_colours, s=10)
    ax.set_title(column)
    ax.set_xlabel('Index')
    ax.set_ylabel(column)
fig.tight_layout()
plt.show()


After looking through these outliers manually, all of them seem normal and do not look like erroneous values that need to be removed. In a weather system especially, outliers such as extreme weather events are of great importance and are often the points of greatest interest, so unless there is a faulty reading there is no need to remove data.

Next, in order to begin clustering this data, we must remove any fields that may confuse the clustering process.

In [None]:
# Correlation grid: pairwise correlations between every column of the raw
# frame, drawn as a colour-coded heatmap with the value written in each cell.
correlation_matrix = df.corr()
feature_names = correlation_matrix.columns
tick_positions = list(range(len(feature_names)))

fig, ax = plt.subplots(figsize=(12, 10))

# Colour scale pinned to [-1, 1] so that zero correlation sits mid-scale.
im = ax.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

# One tick per feature on both axes; x labels rotated to avoid overlap.
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(feature_names)

# Write each coefficient, to two decimals, in the centre of its cell.
for i in range(len(correlation_matrix)):
    for j in range(len(feature_names)):
        ax.text(j, i, f"{correlation_matrix.iloc[i, j]:.2f}",
                ha='center', va='center', color='black', fontsize=8)

ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

Now, we can clearly see that the min, max and avg variables are redundant for this type of analysis and introduce a lot of collinearity. So we shall now do this again with the min and max variables removed.
We may change this if other interesting correlations are found.

In [None]:
# Second correlation grid, restricted to the avg-only feature set so that the
# min/max columns no longer dominate the picture.
# NOTE(review): wind_dir is also left out of this list although the narrative
# only mentions dropping min and max — confirm that this is intentional.
revised_features = [
    "temp_avg",
    "humidity_avg",
    "pressure_avg",
    "rain",
    "snow",
    "solar",
    "wind_speed",
    "visibility",
    "air_quality",
    "ozone",
    "uv_index",
]

correlation_matrix = df[revised_features].corr()
axis_labels = correlation_matrix.columns
tick_positions = list(range(len(axis_labels)))

fig, ax = plt.subplots(figsize=(12, 10))

# Heatmap with the colour scale pinned to the full [-1, 1] range.
im = ax.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

# Feature names along both axes.
ax.set_xticks(tick_positions)
ax.set_xticklabels(axis_labels, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(axis_labels)

# Annotate every cell with its coefficient (x is the column, y is the row).
for (row, col), value in np.ndenumerate(correlation_matrix.to_numpy()):
    ax.text(col, row, f"{value:.2f}",
            ha='center', va='center', color='black', fontsize=8)

ax.set_title('Correlation Matrix (Revised Features)')
fig.tight_layout()
plt.show()

Now we are going to run PCA in order to reduce dimensionality and remove redundancy.


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise first, so that columns measured on large numeric scales do not
# dominate the variance that PCA decomposes.
# NOTE(review): this fits on every column of `df`, not on `revised_features` —
# confirm that keeping the min/max columns here is intended.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Project onto the first 10 principal components.
pca = PCA(n_components=10)
pca_result = pca.fit_transform(scaled_data)

print("Explained variance ratio for each component:", pca.explained_variance_ratio_)
print("Number of components kept:", pca.n_components_)

# Cumulative explained variance, plotted against the actual component count
# (1..n). Plotting the array on its own used the 0-based position as x, so the
# "Number of Components" axis read one lower than the true count.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance, marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Number of Principal Components')
plt.grid(True)
plt.show()

In [None]:
# Wrap the PCA projection in a DataFrame whose columns are named PC1..PCn,
# one per component kept by the fitted `pca` object.
pc_columns = ["PC" + str(number) for number in range(1, pca.n_components_ + 1)]
data_pca_df = pd.DataFrame(data=pca_result, columns=pc_columns)
data_pca_df.head()

Now it is time to run K-means clustering.

In [None]:
from sklearn.cluster import KMeans

# Cluster counts to compare side by side.
k_values = [2, 3, 4, 5, 6, 7]

# The first two principal components are used purely as plotting coordinates;
# the clustering itself is fitted on every retained component.
pc1 = data_pca_df['PC1'].values
pc2 = data_pca_df['PC2'].values

fig, axes = plt.subplots(1, len(k_values), figsize=(20, 4), sharex=True, sharey=True)
plt.suptitle('K-Means Clustering Across k Values', fontsize=16, y=1.02)

# The loop variable is deliberately not called `k`: that name already holds the
# Chebyshev outlier threshold defined earlier, and reusing it here silently
# overwrote that value for any cell run afterwards.
for ax, n_clusters in zip(axes, k_values):
    # n_init is pinned so that results do not depend on the installed
    # scikit-learn version's default; random_state fixes the initialisation.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans.fit_predict(data_pca_df[pc_columns])  # all retained PCs
    ax.scatter(pc1, pc2, c=labels, cmap='viridis', s=2, alpha=1)
    ax.set_title(f'k={n_clusters}')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.label_outer()  # keep axis labels only on the outer edge of the grid

plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()

Now running Hierarchical clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Parameter grid: one row of panels per linkage rule, one column per cluster count.
n_clusters_list = [2, 3, 4, 5]
linkage_list = ['ward', 'complete', 'average', 'single']

fig, axes = plt.subplots(
    nrows=len(linkage_list),
    ncols=len(n_clusters_list),
    figsize=(16, 12),
    sharex=True,
    sharey=True,
)
plt.suptitle('Agglomerative Clustering Across Parameter Grid', fontsize=16, y=1.02)

# pc1 / pc2 (set in the K-means cell) serve only as plotting coordinates; each
# model is fitted on every principal component listed in pc_columns.
for row_axes, linkage in zip(axes, linkage_list):
    for ax, n_clusters in zip(row_axes, n_clusters_list):
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
        labels = model.fit_predict(data_pca_df[pc_columns])
        ax.scatter(pc1, pc2, c=labels, cmap='viridis', s=2, alpha=1)
        ax.set_title(f'{linkage}, n_clusters={n_clusters}')
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.label_outer()  # keep axis labels only on the outer edge of the grid

plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.show()

Now running DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Parameter grid: one row of panels per min_samples value, one column per eps.
eps_values = [0.3, 0.5, 0.7, 0.9, 1.1]
min_samples_values = [3, 5, 8]

fig, axes = plt.subplots(
    nrows=len(min_samples_values),
    ncols=len(eps_values),
    figsize=(18, 10),
    sharex=True,
    sharey=True,
)
plt.suptitle('DBSCAN Clustering Across Parameter Grid', fontsize=16, y=1.05)

# pc1 / pc2 (set in the K-means cell) serve only as plotting coordinates; each
# model is fitted on every principal component listed in pc_columns.
for row_axes, min_samples in zip(axes, min_samples_values):
    for ax, eps in zip(row_axes, eps_values):
        db = DBSCAN(eps=eps, min_samples=min_samples)
        labels = db.fit_predict(data_pca_df[pc_columns])

        # DBSCAN assigns the label -1 to noise points, so that label is not
        # counted as a cluster.
        n_noise = list(labels).count(-1)
        n_found = len(set(labels)) - (1 if n_noise else 0)

        ax.scatter(pc1, pc2, c=labels, cmap='viridis', s=2, alpha=1)
        ax.set_title(f'eps={eps}, min_samples={min_samples}\nclusters={n_found}\nnoise={n_noise}')
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.label_outer()  # keep axis labels only on the outer edge of the grid

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()