<a href="https://colab.research.google.com/github/bekircan4721/Bekircan_arac-/blob/main/Visualizing_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Show complete DataFrames when they are displayed: no row/column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Shark-attack records (one row per recorded attack), decoded with the
# "unicode_escape" codec.
data_shark = pd.read_csv("https://raw.githubusercontent.com/bekircan4721/Bekircan_arac-/main/attacks.csv", encoding="unicode_escape")
# Whitespace-separated NOAA table. `sep=r"\s+"` is the documented equivalent
# of the deprecated `delim_whitespace=True` (FutureWarning since pandas 2.2).
# NOTE(review): the URL path says "3M_HEAT_CONTENT", so this file presumably
# holds ocean heat content rather than temperature -- confirm before
# interpreting the values downstream.
data_temperature = pd.read_table("https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/pentad/pent_h22-w0-2000m.dat", sep=r"\s+")


# Keep only attack records that have a year. `.copy()` makes the filtered
# frame independent of the raw one, so the column assignment below does not
# write into a slice (avoids pandas' SettingWithCopyWarning).
data_shark = data_shark[data_shark["Year"].notnull()].copy()
data_shark["Year"] = data_shark["Year"].astype(int)


# Reduce the ocean table to the two columns used later and give them the
# names the rest of the script expects.
temperature_by_year = data_temperature[["YEAR", "WO"]].copy()
temperature_by_year.columns = ["Year", "Ocean_Temperature"]
# Convert YEAR to an integer calendar year by flooring instead of rounding.
# `Series.round()` rounds halves to the nearest even integer, so fractional
# mid-period values (e.g. 1957.5 and 1958.5) would both become 1958, creating
# duplicate years in later merges and leaving odd years unmatched.
# NOTE(review): assumes YEAR holds such mid-period values -- confirm against
# the file; for integer-valued YEAR, floor and round give the same result.
temperature_by_year["Year"] = np.floor(temperature_by_year["Year"]).astype(int)


# Count attacks per year: one row per Year with its Attack_Count.
attacks_per_year = data_shark.groupby("Year").size()
shark_by_year = attacks_per_year.rename("Attack_Count").reset_index()


# Left-join the ocean measurement onto the yearly counts, then discard the
# years that ended up without an Ocean_Temperature value.
merged = shark_by_year.merge(temperature_by_year, on="Year", how="left")
merged = merged.dropna(subset=["Ocean_Temperature"])

# log(1 + count) transform of the target, intended to bring its distribution
# closer to normal before fitting machine learning models.
merged["Attack_Count_log"] = np.log1p(merged["Attack_Count"])

# Add Country as a feature: count attacks per (Year, Country) pair, attach
# the ocean measurement for that year, and keep only rows that have one.
target_columns = ["Attack_Count", "Attack_Count_log"]

attacks_per_year_country = data_shark.groupby(["Year", "Country"]).size()
shark_by_year_country = attacks_per_year_country.rename("Attack_Count").reset_index()
df = shark_by_year_country.merge(temperature_by_year, on="Year", how="left")
df = df.dropna(subset=["Ocean_Temperature"])
df["Attack_Count_log"] = np.log1p(df["Attack_Count"])

# Country is categorical, so one-hot encode it; drop_first=True omits the
# first category's indicator column.
df_encoded = pd.get_dummies(df, columns=["Country"], drop_first=True)

# Every column except the raw and log-transformed targets is a model input.
features = df_encoded.columns.difference(target_columns)

X_feat = df_encoded[features]
y_feat = df_encoded["Attack_Count_log"]

# Pairwise correlations between all columns of the encoded frame.
correlation_matrix = df_encoded.corr()

# Draw the matrix as a heatmap on a large canvas; cell annotations are off.
heatmap_figure, heatmap_axes = plt.subplots(figsize=(20, 15))
sns.heatmap(
    correlation_matrix,
    ax=heatmap_axes,
    cmap="coolwarm",
    annot=False,
    fmt=".2f",
    linewidths=0.5,
)
heatmap_axes.set_title("Correlation Matrix")
plt.show()


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(X_feat, y_feat, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit ordinary least squares on the training part (fit() returns the
# estimator itself), then predict the held-out targets.
linear_model = LinearRegression().fit(X_train, y_train)
predictions_of_linear_model = linear_model.predict(X_test)

# Add a little Gaussian jitter to both coordinates so overlapping points are
# easier to tell apart. A seeded generator keeps the figure reproducible from
# run to run, matching the seeded train/test split.
jitter_strength = 0.03
jitter_rng = np.random.default_rng(42)
y_test_jittered = y_test + jitter_rng.normal(0, jitter_strength, size=len(y_test))
y_pred_jittered = predictions_of_linear_model + jitter_rng.normal(0, jitter_strength, size=len(predictions_of_linear_model))

# Scatter of real vs predicted values, with the y = x diagonal as the
# reference for a perfect prediction.
plt.figure(figsize=(8, 8))
plt.scatter(y_test_jittered, y_pred_jittered, alpha=0.4, s=60)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=3, label='y = x')

# Both axes show the log-transformed target (Attack_Count_log), so both
# labels name it; the x axis is not the raw attack count.
plt.xlabel("Real Attack_Count_log")
plt.ylabel("Predicted Attack_Count_log")
plt.title("Linear Regression: Real vs Predicted (Attack_Count_log)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()