<a href="https://colab.research.google.com/github/allakoala/data_science/blob/main/colab_notebooks/Clustering_Homework_(part_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#HW - https://drive.google.com/file/d/1JNwW2rGFTRIryYmBeOe8IENrjY-LDFcJ/view
The dataset:
https://drive.google.com/file/d/1S5stL1xz51y5QkGRHHfWqteacHDYOYXM/view?usp=sharing

It contains information about earthquakes: coordinates, depth and strength.

Your task is to cluster the data

Q-s: Linkage;
Dendrogram plot;
Silhouette coefficient;

In [None]:
!pip install bds_courseware

In [None]:
from bds_courseware import read_drive_dataset
from bds_courseware import print_dataset_description, print_module_datasets
from bds_courseware import HOMEWORK_DATASETS

# Show which datasets the courseware exposes (keys materialised as a list for readable output).
dataset_names = [*HOMEWORK_DATASETS.keys()]
print("Dataset names: ", dataset_names)

# Load the dataset used in this notebook; its registry entry is unpacked
# as the positional arguments of the reader.
name = "quake"
dataset_args = HOMEWORK_DATASETS[name]
df = read_drive_dataset(*dataset_args)

# First ten rows, then the (rows, columns) shape.
preview = df.head(10)
print(preview)
print(df.shape)

#Exploratory analysis (there are some specific characteristics of features, which may change the result of the analysis)

1. We see that our dataset's latitudes follow a multimodal distribution (trimodal, to be precise). I assume we can use clustering to separate the 3 groups of latitudes and then try to understand why the points in each cluster have similar latitudes and what unites them.

2. The longitude follows a trimodal distribution similar to the latitude. We will perform clustering on these features at a later stage of this notebook, hopefully giving some insight into the different groups of earthquake sites.

3. When we look at the distribution of earthquake depths, we see that most values follow a bimodal distribution centred around a depth of 60. Still, there are some records of earthquakes at depths of 600-700; these are rare and can be treated as "black swan" events. We will ignore those values in the next steps to see the main distribution without an extremely long tail.

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Mark every row that repeats an earlier one. The defaults already compare
# all columns and keep the first occurrence unmarked.
duplicate_rows = df.duplicated()
# Number of repeated rows (shown as the cell output).
duplicate_rows.sum()

In [None]:
# Remove duplicate rows; `data` is the frame every later cell works on.
data = df.drop_duplicates()

# `DataFrame.info()` writes its report to stdout itself and returns None,
# so it is called directly instead of being wrapped in print() (which
# appended a stray "None" line to the output).
data.info()
print(data.describe())

In [None]:
# Per-column summary: number of distinct values followed by the values themselves.
for col, column_values in data.items():
    n_unique_values = column_values.nunique()
    unique_values = column_values.unique()
    print(f"{col}: {n_unique_values}: {unique_values}")

In [None]:
# Missing data for each variable and way to handle it.
# Missing data can imply a reduction of the sample size.
#
# `isna` is an alias of `isnull`, so the two pairs of columns report the
# same numbers; both are kept so the table has the same layout as before.
# Each mask is now computed once and reused for the count and the share.
null_mask = data.isnull()
na_mask = data.isna()

total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
total_2 = na_mask.sum().sort_values(ascending=False)
percent_2 = (na_mask.sum() / na_mask.count()).sort_values(ascending=False)

# Column label fixed: 'Tota_null' was a typo for 'Total_null'.
missing_data = pd.concat(
    [total, percent, total_2, percent_2],
    axis=1,
    keys=['Total_null', 'Percent_null', 'Total_na', 'Percent_na'],
)
missing_data

In [None]:
# One histogram (with a KDE overlay) per column, each on its own figure.
for col in data.columns:
    hist_fig, hist_ax = plt.subplots()
    sns.histplot(data=data, x=col, kde=True, ax=hist_ax)
    plt.show()

In [None]:
# Pairwise scatter plots of all columns (histograms on the diagonal).
sns.set()
# `height` is the current name of the per-facet size argument; the old
# `size` keyword is deprecated in seaborn.
sns.pairplot(data, height=2.5)
plt.show()

In [None]:
# Outlier detection with the interquartile-range rule: a value is an outlier
# when it lies more than 1.5 * IQR below the first or above the third quartile.
for col in data.columns:
    column_values = data[col]
    Q1 = column_values.quantile(0.25)
    Q3 = column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Rows whose value falls outside the [lower, upper] fence.
    is_outlier = column_values.lt(lower) | column_values.gt(upper)
    outliers = data[is_outlier]
    print(f"{col} has {len(outliers)} outliers")

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
sns.set(style='darkgrid')
corrmat = data.corr()

f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; the colour scale is capped at 0.8.
sns.heatmap(
    corrmat,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmax=.8,
    square=True,
    ax=ax,
)
plt.show()

# Collect strongly correlated column pairs from the upper triangle of `corrmat`.
#
# The sign of the coefficient decides the list a pair goes into. The previous
# checks used abs(): the "positive" test therefore also caught strongly
# negative pairs, and the "negative" test (abs(r) <= -0.5) could never be true.
CORR_THRESHOLD = 0.5

corr_columns = list(corrmat.columns)
pos_corr_pairs = []
neg_corr_pairs = []
for i in range(len(corr_columns)):
    for j in range(i + 1, len(corr_columns)):  # j > i: each pair once, no diagonal
        coefficient = corrmat.iloc[i, j]
        if coefficient >= CORR_THRESHOLD:
            pos_corr_pairs.append((corr_columns[i], corr_columns[j]))
        elif coefficient <= -CORR_THRESHOLD:
            neg_corr_pairs.append((corr_columns[i], corr_columns[j]))

# print highly positively correlated pairs
print("Highly positively correlated pairs:")
for pair in pos_corr_pairs:
    print(pair)

# print highly negatively correlated pairs
print("Highly negatively correlated pairs:")
for pair in neg_corr_pairs:
    print(pair)

#Build K-Means with 15 clusters


In [None]:
from sklearn.cluster import KMeans

# apply K-Means clustering with 15 clusters
# A fixed seed makes the centroid initialisation, and so the cluster sizes
# printed below, reproducible across runs (the other seeded cells use 42 too).
kmeans = KMeans(n_clusters=15, random_state=42)
kmeans.fit(data)

# number of points in each cluster: {cluster label: count}
unique, counts = np.unique(kmeans.labels_, return_counts=True)
print('The number of points per cluster dictionary:')
print(dict(zip(unique, counts)))

#Determine optimal number of clusters for K-Means


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from gap_statistic import OptimalK #!pip install gap-stat
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore FutureWarning and ConvergenceWarning from here on so the sweeps
# below do not flood the output.
for _ignored_category in (FutureWarning, ConvergenceWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

# method 1: Elbow method
# Fit K-Means for every candidate k and record the inertia
# (within-cluster sum of squared distances).
elbow_k_values = range(2, 11)
distortions = []
for k in elbow_k_values:
    # Fixed seed so the curve and the suggested k are reproducible.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data)
    distortions.append(kmeans.inertia_)

plt.plot(elbow_k_values, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.title('Elbow Method')
plt.show()

# find the optimal number of clusters for the Elbow method
# The elbow is the point where the curve bends most sharply, i.e. where the
# second difference of the inertia is largest. (The steepest single drop,
# which argmin of the first difference finds, is almost always the very first
# step and so always suggested the smallest k.)
# Entry i of the second difference is centred on elbow_k_values[i + 1].
curvature = np.diff(distortions, n=2)
elbow_optimal_k = elbow_k_values[int(np.argmax(curvature)) + 1]
print('Elbow Method suggests optimal number of clusters: ', elbow_optimal_k)

# method 2: Silhouette method
# Mean silhouette coefficient of the K-Means labelling for every candidate k;
# higher means tighter, better separated clusters.
silhouette_k_values = range(2, 11)
silhouette_scores = []
for k in silhouette_k_values:
    # Seeded like the Calinski-Harabasz sweep so the suggested k does not
    # change from run to run.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data)
    score = silhouette_score(data, kmeans.labels_)
    silhouette_scores.append(score)

plt.plot(silhouette_k_values, silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.title('Silhouette Method')
plt.show()

# find the optimal number of clusters for the Silhouette method
# (position of the best score, shifted by the first candidate k)
silhouette_optimal_k = silhouette_scores.index(max(silhouette_scores)) + silhouette_k_values.start
print('Silhouette Method suggests optimal number of clusters: ', silhouette_optimal_k)

# method 3: Calinski-Harabasz index
# Score the seeded K-Means labelling for each k from 2 to 20.
ch_k_values = range(2, 21)
scores = []
for k in ch_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(data)
    score = calinski_harabasz_score(data, labels)
    scores.append(score)

# plot the scores against the number of clusters
plt.plot(ch_k_values, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Calinski-Harabasz score')
plt.show()

# find the optimal number of clusters: position of the highest score,
# shifted by the first candidate k
optimal_k = np.argmax(scores) + ch_k_values.start
print("Optimal number of clusters: ", optimal_k)

#Create visualization for the obtained clusters


In [None]:
# KMeans clustering with a fixed number of clusters.
# NOTE(review): n_clusters is hard-coded to 3 here rather than taken from the
# values computed by the selection methods - confirm this is intended.
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(data)

# plot the clusters: one point per earthquake, coloured by its cluster label
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 5))
cluster_ax.scatter(data['Longitude'], data['Latitude'], c=labels, cmap='viridis')
cluster_ax.set_xlabel('Longitude')
cluster_ax.set_ylabel('Latitude')
cluster_ax.set_title('Clusters')
plt.show()

#1. Other clustering algorithms; tune their hyperparameters to achieve better results.


#2. Use labelled and unlabeled metrics to estimate quality of clusters you built. As ground truth use cluster labels from K-Means, choose most similar algorithm

1. Since K-Means was used as the ground truth for labelling, we can use the Adjusted Rand Index (ARI) as the labelled metric to measure the similarity between the K-Means labels and DBSCAN labels. For the unlabeled metric, we can use the silhouette score. The Adjusted Rand Index (ARI) measures the similarity between the true labels and the predicted labels of the clustering algorithm. An ARI of 1 means that the predicted labels perfectly match the true labels, while an ARI of 0 means that the predicted labels are no better than random.

2. The data has been clustered using both KMeans and Agglomerative clustering methods, and the best parameters for each method have been determined to be n_clusters=4.

3. The quality of the clusters has been evaluated using both labeled (Adjusted Rand Score) and unlabeled (Silhouette Score) metrics.
The KMeans method has a higher Silhouette Score (0.64) than the Agglomerative method (0.63), indicating that the KMeans method has generated more distinct clusters.


#3. Explain the final choice of best clusterizations: give interpretation of clusters

It's possible that the resulting clusters may represent regions of the world that are more prone to earthquakes, or areas with certain geological characteristics that make them more susceptible to seismic activity.
The descriptive statistics show that the clusters differ in terms of focal depth, latitude, longitude, and Richter magnitude:
Cluster 0 appears to be characterized by earthquakes with shallow focal depths and occurring in a variety of locations, with a mean Richter magnitude of 5.98.

Cluster 1 is characterized by earthquakes with deep focal depths and occurring primarily in a region with negative latitudes and longitudes, with a mean Richter magnitude of 5.93.

Cluster 2 is characterized by earthquakes with shallow focal depths occurring primarily in a region with positive longitudes, with a mean Richter magnitude of 5.98.

Cluster 3 is characterized by earthquakes with intermediate focal depths occurring primarily in a region with positive latitudes and longitudes, with a mean Richter magnitude of 5.98.

It is not possible to draw definitive conclusions about the physical or geological characteristics of the earthquakes in each cluster based on the available data, but the differences in focal depth, location, and magnitude suggest that the clusters may represent different types of earthquakes or seismic activity.


In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, adjusted_rand_score

# KMeans clustering algorithm
# Cluster on the measurements only: label columns that any cell may have
# attached to `data` must never be fed back in as features.
kmeans_features = data.drop(columns=['Cluster', 'Cluster_DBSCAN', 'B_Cluster'], errors='ignore')

# Silhouette score for every candidate number of clusters (2..9).
kmeans_scores = []
for n_clusters in range(2, 10):
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_labels = kmeans_model.fit_predict(kmeans_features)
    kmeans_silhouette = silhouette_score(kmeans_features, kmeans_labels)
    kmeans_scores.append(kmeans_silhouette)

# Refit with the best-scoring k (list position 0 corresponds to k=2).
best_kmeans_idx = np.argmax(kmeans_scores)
best_kmeans_model = KMeans(n_clusters=best_kmeans_idx+2, random_state=42)
best_kmeans_labels = best_kmeans_model.fit_predict(kmeans_features)
best_kmeans_score = kmeans_scores[best_kmeans_idx]

# Calculate adjusted_rand_score for KMeans
# K-Means is the reference labelling, so comparing it with itself is 1.0 by
# construction; it is reported only so every algorithm has the same metrics.
kmeans_ari = adjusted_rand_score(best_kmeans_labels, best_kmeans_labels)

# Calculate descriptive statistics for KMeans clusters
# `assign` returns a new frame. `pd.DataFrame(data)` is not an independent
# copy, so adding the 'Cluster' column to it could also add it to `data`
# (depending on the pandas version) and leak the labels into later fits.
kmeans_clusters = kmeans_features.assign(Cluster=best_kmeans_labels)
kmeans_stats = kmeans_clusters.groupby('Cluster').describe()

print(f"Best KMeans parameters: n_clusters={best_kmeans_idx+2}")
print(f"KMeans Silhouette Score: {best_kmeans_score:.2f}")
print(f"KMeans Adjusted Rand Score: {kmeans_ari:.2f}")
print(f"KMeans Cluster Descriptive Statistics:\n{kmeans_stats}")

# Agglomerative Clustering algorithm
# Cluster on the measurements only: label columns that any cell may have
# attached to `data` (e.g. the K-Means 'Cluster' column) must not be used
# as features.
agg_features = data.drop(columns=['Cluster', 'Cluster_DBSCAN', 'B_Cluster'], errors='ignore')

# Silhouette score for every candidate number of clusters (2..9).
agg_scores = []
for n_clusters in range(2, 10):
    agg_model = AgglomerativeClustering(n_clusters=n_clusters)
    agg_labels = agg_model.fit_predict(agg_features)
    agg_silhouette = silhouette_score(agg_features, agg_labels)
    agg_scores.append(agg_silhouette)

# Refit with the best-scoring number of clusters (list position 0 is n_clusters=2).
# Note: after the loop `agg_labels` holds the labels of the LAST candidate,
# not of the best one - use `best_agg_labels` downstream.
best_agg_idx = np.argmax(agg_scores)
best_agg_model = AgglomerativeClustering(n_clusters=best_agg_idx+2)
best_agg_labels = best_agg_model.fit_predict(agg_features)
best_agg_score = agg_scores[best_agg_idx]

# Calculate adjusted_rand_score for Agglomerative Clustering
# (agreement with the K-Means labelling, which serves as ground truth)
agg_ari = adjusted_rand_score(best_kmeans_labels, best_agg_labels)

# Calculate descriptive statistics for Agglomerative Clustering clusters
# `assign` returns a new frame. `pd.DataFrame(data)` is not an independent
# copy, so adding the 'Cluster' column to it could also add it to `data`
# (depending on the pandas version) and leak the labels into later fits.
agg_clusters = agg_features.assign(Cluster=best_agg_labels)
agg_stats = agg_clusters.groupby('Cluster').describe()

print(f"Best Agglomerative Clustering parameters: n_clusters={best_agg_idx+2}")
print(f"Agglomerative Clustering Silhouette Score: {best_agg_score:.2f}")
print(f"Agglomerative Clustering Adjusted Rand Score: {agg_ari:.2f}")
print(f"Agglomerative Clustering Cluster Descriptive Statistics:\n{agg_stats}")

# DBSCAN clustering algorithm
# Cluster on the measurements only: label columns that any cell may have
# attached to `data` must not be used as features.
dbscan_features = data.drop(columns=['Cluster', 'Cluster_DBSCAN', 'B_Cluster'], errors='ignore')

dbscan_eps_grid = [0.1, 0.5, 1, 2, 5, 10]
dbscan_min_samples_grid = range(2, 10)

# Grid search. A score is recorded only when the run yields more than one
# distinct label, so the parameters are stored next to each score. Deriving
# them from the list position (idx // 8, idx % 8) picks the wrong combination
# as soon as one run is skipped.
# NOTE(review): DBSCAN marks noise points with the label -1 and
# silhouette_score treats every distinct label as a cluster, so noise is
# scored as if it were a cluster of its own.
dbscan_scores = []
dbscan_params = []  # (eps, min_samples) of the score at the same position
for eps in dbscan_eps_grid:
    for min_samples in dbscan_min_samples_grid:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(dbscan_features)
        if len(np.unique(dbscan_labels)) > 1:
            dbscan_silhouette = silhouette_score(dbscan_features, dbscan_labels)
            dbscan_scores.append(dbscan_silhouette)
            dbscan_params.append((eps, min_samples))

if not dbscan_scores:
    # np.argmax would raise the same exception type with an opaque message.
    raise ValueError(
        "DBSCAN produced fewer than two distinct labels for every "
        "eps/min_samples combination; widen the parameter grid"
    )

# Refit with the best-scoring parameter combination.
# Note: after the loops `dbscan` is the LAST model of the grid, not the best
# one - use `best_dbscan_model` / `best_dbscan_labels` downstream.
best_dbscan_idx = np.argmax(dbscan_scores)
best_dbscan_eps, best_dbscan_min_samples = dbscan_params[best_dbscan_idx]
best_dbscan_eps_idx = dbscan_eps_grid.index(best_dbscan_eps)
best_dbscan_model = DBSCAN(eps=best_dbscan_eps, min_samples=best_dbscan_min_samples)
best_dbscan_labels = best_dbscan_model.fit_predict(dbscan_features)
best_dbscan_score = dbscan_scores[best_dbscan_idx]

#Calculate adjusted_rand_score for DBSCAN
# (agreement with the K-Means labelling, which serves as ground truth)
dbscan_ari = adjusted_rand_score(best_kmeans_labels, best_dbscan_labels)

#Calculate descriptive statistics for DBSCAN clusters
# `assign` returns a new frame. `pd.DataFrame(data)` is not an independent
# copy, so adding the 'Cluster' column to it could also add it to `data`
# (depending on the pandas version).
dbscan_clusters = dbscan_features.assign(Cluster=best_dbscan_labels)
dbscan_stats = dbscan_clusters.groupby('Cluster').describe()

print(f"Best DBSCAN parameters: eps={best_dbscan_eps}, min_samples={best_dbscan_min_samples}")
print(f"DBSCAN Silhouette Score: {best_dbscan_score:.2f}")
print(f"DBSCAN Adjusted Rand Score: {dbscan_ari:.2f}")
print(f"DBSCAN Cluster Descriptive Statistics:\n{dbscan_stats}")

In [None]:
# plot the results: one longitude/latitude scatter per algorithm, side by
# side, coloured by the labels of that algorithm's selected model
results_fig, results_axes = plt.subplots(1, 3, figsize=(15, 5))
result_panels = [
    (best_kmeans_labels, f'KMeans Clustering (Score: {best_kmeans_score:.2f})'),
    (best_agg_labels, f'Agglomerative Clustering (Score: {best_agg_score:.2f})'),
    (best_dbscan_labels, 'DBSCAN Clustering'),
]
for panel_ax, (panel_labels, panel_title) in zip(results_axes, result_panels):
    panel_ax.scatter(data['Longitude'], data['Latitude'], c=panel_labels, cmap='viridis')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Longitude')
    panel_ax.set_ylabel('Latitude')
plt.show()

In [None]:
import plotly.express as ex

# Colour the points by the labels of the SELECTED DBSCAN model. `dbscan` is
# only the last model fitted during the grid search, so `dbscan.labels_`
# showed that final parameter combination instead of the best one.
# `assign` builds a separate plotting frame, so `data` does not gain a label
# column that a re-run of the clustering cells would pick up as a feature.
dbscan_plot_data = data.assign(Cluster_DBSCAN=best_dbscan_labels)

# Interactive 3D scatter: longitude / focal depth / latitude.
fig = ex.scatter_3d(dbscan_plot_data,x='Longitude',y='Focal depth',z='Latitude',color='Cluster_DBSCAN',height=900)
fig.show()

#Visualize the best clusterizations in your opinion on the world map
Geographically, the areas with the highest frequency and intensity of earthquakes are typically located along the Ring of Fire, a horseshoe-shaped region that encircles the Pacific Ocean. This region includes the west coast of North and South America, Japan, Indonesia, and the Philippines, among other countries.

Other regions with high earthquake activity include the Himalayan region in Asia, the Alpide Belt in Europe and Asia, and the Mid-Atlantic Ridge. These regions are all characterized by active tectonic plate boundaries or volcanic activity.

In [None]:
import folium

# Attach the labels of the SELECTED agglomerative model. `agg_labels` is only
# the labelling of the last candidate tried in the search loop, not the best one.
# `assign` builds a separate frame, so `data` does not gain a label column that
# a re-run of the clustering cells would pick up as a feature.
map_data = data.assign(B_Cluster=best_agg_labels)

# create a dictionary of DataFrames for each cluster
# (sort=False keeps clusters in order of first appearance)
marker_columns = ['Latitude', 'Longitude', 'Richter', 'Focal depth']
cluster_frames = {
    cluster: cluster_rows[marker_columns]
    for cluster, cluster_rows in map_data.groupby('B_Cluster', sort=False)
}

# create a map centered at (0, 0)
world_map = folium.Map(location=[0, 0], zoom_start=2)

# create a feature group for each cluster so it can be toggled in the layer control
for cluster, cluster_frame in cluster_frames.items():
    feature_group = folium.FeatureGroup(name=f'B_Cluster {cluster}')

    # add markers to the feature group for each earthquake in the cluster
    for index, row in cluster_frame.iterrows():
        folium.Marker([row['Latitude'], row['Longitude']],
                      popup=f"Magnitude: {row['Richter']}, Depth: {row['Focal depth']} km").add_to(feature_group)

    # add the feature group to the map
    feature_group.add_to(world_map)

# add a layer control to the map
folium.LayerControl().add_to(world_map)

# display the map
world_map