<a href="https://colab.research.google.com/github/bekircan4721/Bekircan_arac-/blob/main/kmeans_graphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Show complete frames when they are displayed (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Shark-attack records; every row is counted as one attack further down.
data_shark = pd.read_csv(
    "https://raw.githubusercontent.com/bekircan4721/Bekircan_arac-/main/attacks.csv",
    encoding="unicode_escape",
)
# Whitespace-delimited NOAA table. `sep=r"\s+"` is the supported replacement
# for the deprecated `delim_whitespace=True` flag and parses identically.
data_temperature = pd.read_table(
    "https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/pentad/pent_h22-w0-2000m.dat",
    sep=r"\s+",
)


# Keep only rows that carry a year. The explicit copy detaches the filtered
# frame from the original so the column assignment below writes to a real
# frame instead of triggering SettingWithCopyWarning.
data_shark = data_shark[data_shark["Year"].notnull()].copy()
data_shark["Year"] = data_shark["Year"].astype(int)


# Reduce the ocean table to a (Year, Ocean_Temperature) lookup.
# NOTE(review): the URL path contains "3M_HEAT_CONTENT", so the "WO" column is
# presumably a heat-content series rather than a temperature - confirm.
temperature_by_year = data_temperature[["YEAR", "WO"]].copy()
temperature_by_year.columns = ["Year", "Ocean_Temperature"]
# NOTE(review): Series.round() rounds exact halves to the nearest even number.
# If YEAR holds mid-period values such as 1957.5 / 1958.5, neighbouring rows
# would collapse onto the same even year and duplicate rows in the merge
# below - verify against the raw file.
temperature_by_year["Year"] = temperature_by_year["Year"].round().astype(int)


# Number of attack rows per year.
shark_by_year = data_shark.groupby("Year").size().reset_index(name="Attack_Count")


# Attach the ocean value to each year and drop years that have no match.
merged = pd.merge(shark_by_year, temperature_by_year, on="Year", how="left")
merged = merged.dropna(subset=["Ocean_Temperature"])

# Regression target for the yearly table: log(1 + count) compresses the range
# of the raw attack counts before they are fed to the models.
merged["Attack_Count_log"] = np.log1p(merged["Attack_Count"])

# Add country as a feature: count attacks per (year, country) pair, attach the
# ocean value of that year, and drop pairs whose year has no ocean value.
shark_by_year_country = (
    data_shark
    .groupby(["Year", "Country"])
    .size()
    .reset_index(name="Attack_Count")
)
df = (
    shark_by_year_country
    .merge(temperature_by_year, how="left", on="Year")
    .dropna(subset=["Ocean_Temperature"])
)
df["Attack_Count_log"] = np.log1p(df["Attack_Count"])

# Country is categorical, so expand it into one-hot indicator columns
# (the first category is dropped and serves as the baseline).
df_encoded = pd.get_dummies(df, columns=["Country"], drop_first=True)

# Every column except the raw and log-transformed targets is a model input.
target_columns = ["Attack_Count", "Attack_Count_log"]
features = df_encoded.columns.difference(target_columns)

X_feat, y_feat = df_encoded[features], df_encoded["Attack_Count_log"]


# Project the feature matrix onto its first two principal components.
# The 2-D coordinates are only used as scatter-plot axes.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(df_encoded[features])

# K-means (hard clustering): each row receives exactly one of 5 labels.
# `features` was fixed during feature preparation, before the 'cluster' column
# is added below, so the labels never feed back into the clustering input.
# NOTE(review): the inputs are used unscaled, and the clustering is fit on all
# rows before the train/test split, so held-out rows influence the cluster
# features - confirm both are intended.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(df_encoded[features])

# Store the label on the encoded frame so it can be used by the ML models.
df_encoded['cluster'] = cluster_labels

# The label is categorical: one-hot encode it before handing it to a model.
cluster_encoded = pd.get_dummies(df_encoded['cluster'], prefix='cluster')

# Base features plus the cluster indicators; the question is whether the extra
# columns improve the regression.
X_with_cluster = pd.concat([df_encoded[features], cluster_encoded], axis=1)
y = df_encoded['Attack_Count_log']

# 80/20 split with a fixed seed so the run is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_with_cluster, y, test_size=0.2, random_state=42)

# Random forest trained on the cluster-augmented features.
random_forest_after_k_means = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_after_k_means.fit(X_train, y_train)


k_means_predictions = random_forest_after_k_means.predict(X_test)
# Label fixed from "R^^s2" so it matches the other clustering reports.
print(f"Hard Clustering + Random Forest R^^2: {r2_score(y_test, k_means_predictions):.4f}")
print(f"MSE: {mean_squared_error(y_test, k_means_predictions):.4f}")

# Reuse the labels computed above for the plot: refitting K-means with the
# same seed on the same columns would only reproduce them.
kmeans_labels = cluster_labels

plt.figure(figsize=(8, 6))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=kmeans_labels, s=50)
plt.title("K-Means Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()

print()

# Soft clustering
# A Gaussian mixture model gives every row a membership probability for each
# of its components instead of a single hard label.
# NOTE(review): the mixture is fit on all rows before the train/test split
# below, so held-out rows influence the cluster features - confirm.
gassuian_model = GaussianMixture(n_components=5, random_state=42)
gassuian_model.fit(df_encoded[features])
cluster_probs = gassuian_model.predict_proba(df_encoded[features])

# Wrap the probabilities in a frame that shares df_encoded's index so the rows
# line up when concatenated. Column names are derived from the model's
# component count instead of repeating the literal 5.
cluster_probs_df = pd.DataFrame(
    cluster_probs,
    columns=[f'cluster_{i}' for i in range(gassuian_model.n_components)],
    index=df_encoded.index,
)

# Base features plus one probability column per mixture component.
X_with_soft_cluster = pd.concat([df_encoded[features], cluster_probs_df], axis=1)
y = df_encoded['Attack_Count_log']

# Same 80/20 split and seed as the other clustering experiments.
X_train, X_test, y_train, y_test = train_test_split(X_with_soft_cluster, y, test_size=0.2, random_state=42)

# Random forest trained on the probability-augmented features.
random_forest_after_soft = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_after_soft.fit(X_train, y_train)


predictions_soft = random_forest_after_soft.predict(X_test)
print(f"Soft Clustering + Random Forest R^^2: {r2_score(y_test, predictions_soft):.4f}")
print(f"MSE: {mean_squared_error(y_test, predictions_soft):.4f}")

# For the plot, colour each row by its most likely component. The mixture
# fitted above is reused: a second model with the same seed and input would
# only repeat the same fit. `visual_gaussian` is kept as an alias so anything
# that refers to that name keeps working.
visual_gaussian = gassuian_model
gaussian_labels = visual_gaussian.predict(df_encoded[features])

plt.figure(figsize=(8, 6))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=gaussian_labels, cmap='plasma', s=50)
plt.title("Gaussian Mixture (Soft Clustering)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()


print()

# Flat clustering
# Agglomerative clustering with a fixed target of 5 clusters: each row ends up
# with exactly one label.
# NOTE(review): the clustering is fit on all rows before the train/test split
# below, so held-out rows influence the cluster features - confirm.
agglomerative_method = AgglomerativeClustering(n_clusters=5)
cluster_labels = agglomerative_method.fit_predict(df_encoded[features])

# Overwrite the existing 'cluster' column instead of adding a new one.
# `features` does not contain 'cluster', so labels written by earlier
# experiments never reach this model.
df_encoded['cluster'] = cluster_labels

# The label is categorical: one-hot encode it before handing it to a model.
cluster_encoded = pd.get_dummies(df_encoded['cluster'], prefix='cluster')

# Base features plus the agglomerative cluster indicators.
X_with_cluster = pd.concat([df_encoded[features], cluster_encoded], axis=1)
y = df_encoded['Attack_Count_log']

# Same 80/20 split and seed as the other clustering experiments.
X_train, X_test, y_train, y_test = train_test_split(X_with_cluster, y, test_size=0.2, random_state=42)

# Re-evaluate the random forest with the agglomerative cluster features.
flat_random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
flat_random_forest.fit(X_train, y_train)


flat_predictions = flat_random_forest.predict(X_test)
print(f"Flat Clustering + Random Forest R^^2: {r2_score(y_test, flat_predictions):.4f}")
print(f"MSE: {mean_squared_error(y_test, flat_predictions):.4f}")

# Reuse the labels from the fit above for the plot: running the same
# clustering again on the same columns would only reproduce them.
# `visual_agglo` is kept as an alias so anything that refers to that name
# keeps working.
visual_agglo = agglomerative_method
agglo_labels = cluster_labels

plt.figure(figsize=(8, 6))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=agglo_labels, cmap='coolwarm', s=50)
plt.title("Agglomerative Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()


print()
