In [None]:
import os

# Project root: the parent of the directory the notebook is running from
parent_of_cwd = os.path.join(os.getcwd(), "..")
base_dir = os.path.abspath(parent_of_cwd)

# Folder that holds the project's datasets
data_dir = os.path.join(base_dir, "Data")

# Sub-folder that holds the climate dataset
climate_data_dir = os.path.join(data_dir, "Climate Data")

# Data Loading and Initial Overview

In [None]:
# Load and inspect data
import pandas as pd

# header=None: no row of the file is treated as column names, so every row is
# read as data and the columns are named explicitly below.
climate_data = pd.read_csv(os.path.join(climate_data_dir, "ClimateDataBasel.csv"), header=None)

# Add column names (assignment raises if the file does not have exactly 18 columns)
climate_data.columns = [
    'Temp_Min', 'Temp_Max', 'Temp_Mean',
    'Humidity_Min', 'Humidity_Max', 'Humidity_Mean',
    'Pressure_Min', 'Pressure_Max', 'Pressure_Mean',
    'Precipitation', 'Snowfall', 'Sunshine',
    'WindGust_Min', 'WindGust_Max', 'WindGust_Mean',
    'WindSpeed_Min', 'WindSpeed_Max', 'WindSpeed_Mean'
]

# First rows, then dtypes / non-null counts, then summary statistics
print(climate_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
climate_data.info()
print(climate_data.describe())

# Preprocessing

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of the dataset
correlation_matrix = climate_data.corr()

# Annotated heatmap: each cell shows its coefficient with two decimals,
# drawn with a size-6 annotation font
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 6},
)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Inspect the Snowfall column: summary statistics first, then how often
# each distinct value occurs
snowfall = climate_data["Snowfall"]
for summarise in (snowfall.describe, snowfall.value_counts):
    print(summarise())

In [None]:
# Keep only the five features used from here on; every other column is dropped
selected_features = [
    'Temp_Mean',
    'Humidity_Mean',
    'Pressure_Mean',
    'Precipitation',
    'WindSpeed_Mean',
]
climate_data = climate_data[selected_features]

In [None]:
import numpy as np

# Z-score of every value: distance from its column mean, measured in units
# of that column's standard deviation
mean = climate_data.mean()
std = climate_data.std()
z_scores = climate_data.sub(mean).div(std)

# A value is flagged when its absolute z-score exceeds 3
outliers = z_scores.abs().gt(3)

# Keep only the rows in which no column is flagged
cleaned_data = climate_data.loc[(~outliers).all(axis=1)]

print(f"Original rows: {len(climate_data)}, Cleaned rows: {len(cleaned_data)}")

In [None]:
# standardisation
def standard(data):
    """Standardise each column of a 2-D dataset to mean = 0 and std = 1.

    Parameters
    ----------
    data : array-like of shape (rows, cols)
        Numeric input (NumPy array, nested list, DataFrame, ...). The input
        is never modified.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape holding ``(x - mean) / std`` computed
        per column, where ``std`` is the population standard deviation
        (``ddof=0``, NumPy's default). Columns with zero variance come back
        as all zeros instead of NaN/inf.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    # Always work in float: copying an integer array and writing z-scores
    # into it would silently truncate them to whole numbers.
    values = np.asarray(data, dtype=float)
    if values.ndim != 2:
        raise ValueError(
            f"standard() expects a 2-D array, got {values.ndim} dimension(s)"
        )
    if values.shape[0] == 0:
        # No rows: nothing to scale, and mean/std would only emit warnings.
        return values.copy()

    mu = values.mean(axis=0)     # Per-column mean
    sigma = values.std(axis=0)   # Per-column standard deviation
    # A constant column has sigma == 0; dividing by 1 instead leaves it at 0.
    sigma = np.where(sigma == 0, 1.0, sigma)
    return (values - mu) / sigma

# Standardise the outlier-free rows, passing the underlying NumPy values
cleaned_values = cleaned_data.values
standardised_data = standard(cleaned_values)

# Preview the first five standardised rows
print(standardised_data[:5])

In [None]:
import matplotlib.pyplot as plt

# For every feature, plot its distribution before and after standardisation.
# standardised_data was built from cleaned_data.values, so column position i
# of the array corresponds to cleaned_data.columns[i]; deriving both the label
# and the index from the same enumeration keeps them from drifting apart if
# the selected features ever change.
for column_index, feature in enumerate(cleaned_data.columns):
    versions = (
        ("Original", cleaned_data[feature]),
        ("Standardised", standardised_data[:, column_index]),
    )
    for version_label, values in versions:
        plt.hist(values, bins=25)
        plt.title(f"{version_label} {feature} Distribution")
        plt.xlabel(feature)
        plt.ylabel("Frequency")
        plt.show()


# Dimensionality Reduction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import decomposition 

# Feature extraction - PCA: fit a two-component projection on the
# standardised data, then project that same data onto it
pca = decomposition.PCA(n_components=2).fit(standardised_data)
projected = pca.transform(standardised_data)
pca_data = pd.DataFrame(projected, columns=["PCA1", "PCA2"])

In [None]:
# Scatter plot of the samples in the two-component PCA space
axes = plt.gca()
axes.scatter(pca_data["PCA1"], pca_data["PCA2"], alpha=0.5)
axes.set_title("PCA-transformed Data")
axes.set_xlabel("First Principal Component")
axes.set_ylabel("Second Principal Component")
plt.show()

# Clustering

## K-means clustering

In [None]:
# Import necessary libraries
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method for determining the optimal number of clusters
inertia_values = []  # Inertia (sum of squared distances to the closest centroid) for each k
cluster_range = range(2, 10)  # Testing for k values from 2 to 9

for num_clusters in cluster_range:
    # n_init is set explicitly: scikit-learn changed its default from 10 to
    # "auto" in version 1.4, so leaving it out makes the curve depend on the
    # installed version (and triggers a FutureWarning on 1.2/1.3).
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(pca_data)  # Fit the model to PCA-transformed data
    inertia_values.append(kmeans.inertia_)

# Plot the elbow method results
plt.plot(cluster_range, inertia_values, marker='o')
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()

In [None]:
# Optimal k
optimal_k = 3

# Apply k-means with optimal k.
# n_init is set explicitly because scikit-learn's default changed from 10 to
# "auto" in version 1.4; pinning it keeps the cluster assignment the same
# regardless of the installed version.
optimal_kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
optimal_clusters = optimal_kmeans.fit_predict(pca_data)

# Visualise k-means clusters, coloured by assigned cluster label
plt.scatter(pca_data["PCA1"], pca_data["PCA2"], c=optimal_clusters, alpha=0.5)
plt.title('K-Means Clustering')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.colorbar()
plt.show()

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np

# Apply DBSCAN
# eps: maximum distance between two samples for them to count as neighbours
# min_samples: minimum number of points required to form a dense region
dbscan = DBSCAN(eps=0.5, min_samples=10).fit(pca_data)
dbscan_clusters = dbscan.labels_

# Visualise DBSCAN clusters, coloured by assigned label
axes = plt.gca()
points = axes.scatter(pca_data["PCA1"], pca_data["PCA2"], c=dbscan_clusters, alpha=0.5)
axes.set_title('DBSCAN Clustering')
axes.set_xlabel('PCA Component 1')
axes.set_ylabel('PCA Component 2')
plt.colorbar(points)
plt.show()

## Evaluate clusterings 

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score for K-means
silhouette_kmeans = silhouette_score(pca_data, optimal_clusters)
print(silhouette_kmeans)

# Silhouette score for DBSCAN
# DBSCAN marks noise samples with the label -1. Passing those to
# silhouette_score would treat the noise as one more cluster and distort the
# result, so only samples that were actually assigned to a cluster are scored.
clustered_mask = dbscan_clusters != -1
n_dbscan_clusters = len(set(dbscan_clusters[clustered_mask]))

# silhouette_score needs between 2 and n_samples - 1 distinct labels and
# raises ValueError otherwise; report NaN instead of crashing the notebook.
if 2 <= n_dbscan_clusters < clustered_mask.sum():
    silhouette_dbscan = silhouette_score(
        pca_data[clustered_mask], dbscan_clusters[clustered_mask]
    )
else:
    silhouette_dbscan = float("nan")
    print(
        f"DBSCAN produced {n_dbscan_clusters} cluster(s) excluding noise; "
        "silhouette score is undefined"
    )
print(silhouette_dbscan)
