<a href="https://colab.research.google.com/github/bekircan4721/Bekircan_arac-/blob/main/Machine_learning1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import shapiro, anderson, kstest, normaltest, zscore, probplot, pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import resample
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Show complete frames when a cell displays a DataFrame (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Both inputs are fetched over HTTP: the shark-attack incident table and a
# whitespace-delimited NOAA basin time series.
data_shark = pd.read_csv("https://raw.githubusercontent.com/bekircan4721/Bekircan_arac-/main/attacks.csv", encoding="unicode_escape")
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
data_temperature = pd.read_table("https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/pentad/pent_h22-w0-2000m.dat", sep=r"\s+")

# Keep only incidents with a known year. The explicit .copy() makes the result
# an independent frame, so the column assignment below does not write into a
# slice of the original (SettingWithCopyWarning).
data_shark = data_shark[data_shark["Year"].notnull()].copy()
data_shark["Year"] = data_shark["Year"].astype(int)

# Yearly ocean series: keep the time label and the "WO" column only.
# NOTE(review): the URL points at NOAA's heat-content product, so "WO" is
# presumably world-ocean heat content rather than a temperature -- confirm the
# units before interpreting; the column name is kept because later code and
# plot titles use it.
temperature_by_year = data_temperature[["YEAR", "WO"]].copy()
temperature_by_year.columns = ["Year", "Ocean_Temperature"]
# Truncate the time label to its calendar year. round() must not be used here:
# it rounds halves to the nearest even integer, so mid-year labels such as
# 1957.5 and 1958.5 would both become 1958 (duplicating even years and losing
# odd ones). NOTE(review): assumes labels are of the form <year>.5 or <year>.0
# -- confirm against the file.
temperature_by_year["Year"] = np.floor(temperature_by_year["Year"]).astype(int)

# Number of recorded attacks per year.
shark_by_year = data_shark.groupby("Year").size().reset_index(name="Attack_Count")

# Attach the ocean value to each year. validate= makes pandas raise if a year
# appears more than once on the right side instead of silently duplicating
# attack rows. Years without an ocean value are dropped.
merged = pd.merge(shark_by_year, temperature_by_year, on="Year", how="left", validate="many_to_one")
merged = merged.dropna(subset=["Ocean_Temperature"])

# log(1 + count) compresses the right tail of the counts so that the
# distribution is closer to normal.
merged["Attack_Count_log"] = np.log1p(merged["Attack_Count"])

# Distribution plots

# One histogram with a KDE overlay per column. The first group shows the raw
# yearly attack counts, the second the log-transformed counts; the ocean
# series is drawn next to each so the two can be compared side by side.
for column_group in (
    ["Attack_Count", "Ocean_Temperature"],
    ["Attack_Count_log", "Ocean_Temperature"],
):
    for col in column_group:
        plt.figure(figsize=(8, 4))
        sns.histplot(merged[col], kde=True)
        plt.title(f"{col} - Histogram & KDE")
        plt.show()


# Normality checks: each column is z-scored and compared with the standard
# normal using a Kolmogorov-Smirnov test. The first pass uses the raw attack
# counts; the second pass repeats the test with the log-transformed counts to
# see whether the logarithm brought them closer to normal.
# NOTE(review): the mean and standard deviation are estimated from the same
# sample that is tested, so the reported p-values are likely optimistic about
# normality -- confirm whether a Lilliefors/Shapiro-type test is wanted instead.
for label, columns in (
    ("without normalization", ["Attack_Count", "Ocean_Temperature"]),
    ("Logarithm for attack_count", ["Attack_Count_log", "Ocean_Temperature"]),
):
    for col in columns:
        standardized = zscore(merged[col])
        stat, p = kstest(standardized, 'norm')
        print(f"K-S Test({label}) - {col} (z-score): stat={stat:.4f}, p={p:.4f}")

# Baseline model: ordinary least squares of the log attack count on the ocean
# series, one row per year. The scores below are in-sample (the model is
# evaluated on the same rows it was fitted on).

X = merged[["Ocean_Temperature"]]
y = merged["Attack_Count_log"]

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

linear_r2 = r2_score(y, y_pred)
linear_mse = mean_squared_error(y, y_pred)
print(f"Linear Regression R^^2: {linear_r2:.4f}")
print(f"Linear Regression MSE: {linear_mse:.4f}")

# Bootstrap of the single-feature linear regression: refit the model on
# n_iterations resamples (drawn with replacement, same size as the data) and
# collect the R^2 / MSE of each fit, scored on the resample it was fitted on.
r2_scores = []
mse_scores = []

X_vals = X.values
y_vals = y.values
n_iterations = 1000
n_size = len(merged)

# One seeded generator shared by all draws: every iteration still gets a
# different resample, but the whole sequence -- and therefore the reported
# intervals -- is identical from run to run, like the random_state=42 models
# elsewhere in this script.
rng = np.random.RandomState(42)

for i in range(n_iterations):
    X_resampled, y_resampled = resample(X_vals, y_vals, n_samples=n_size, replace=True, random_state=rng)
    model = LinearRegression()
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_resampled)
    r2_scores.append(r2_score(y_resampled, y_pred))
    mse_scores.append(mean_squared_error(y_resampled, y_pred))

# Percentile bootstrap: the 2.5th and 97.5th percentiles bound the 95% interval.
print(f"R^^2 mean: {np.mean(r2_scores):.4f}, 95% CI: ({np.percentile(r2_scores, 2.5):.4f}, {np.percentile(r2_scores, 97.5):.4f})")
print(f"MSE mean: {np.mean(mse_scores):.4f}, 95% CI: ({np.percentile(mse_scores, 2.5):.4f}, {np.percentile(mse_scores, 97.5):.4f})")

# Random forest on the same single predictor. Unlike the linear baseline, this
# model is scored on a held-out 20% of the rows; both the split and the forest
# are seeded so the numbers are repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_r2 = r2_score(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest R^^2: {rf_r2:.4f}")
print(f"Random Forest MSE: {rf_mse:.4f}")

# Richer feature set: one row per (year, country) instead of one row per year.

shark_by_year_country = data_shark.groupby(["Year", "Country"]).size().reset_index(name="Attack_Count")
df = pd.merge(shark_by_year_country, temperature_by_year, on="Year", how="left")
# Rows whose year has no ocean value cannot be used by the models.
df = df.dropna(subset=["Ocean_Temperature"])
df["Attack_Count_log"] = np.log1p(df["Attack_Count"])

# Country is categorical, so it is one-hot encoded (first level dropped to
# avoid a redundant column). dtype=float keeps the indicator columns numeric:
# with the default bool dtype of recent pandas, mixing them with the int/float
# columns turns `.values` into an object array that every later fit has to
# convert again.
df_encoded = pd.get_dummies(df, columns=["Country"], drop_first=True, dtype=float)
# Every column except the two target columns is a predictor; this includes
# "Year" and "Ocean_Temperature" in addition to the country indicators.
features = df_encoded.columns.difference(["Attack_Count", "Attack_Count_log"])

X_feat = df_encoded[features]
y_feat = df_encoded["Attack_Count_log"]

# Seeded 80/20 split shared by the feature-based models.
X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(X_feat, y_feat, test_size=0.2, random_state=42)


# Bootstrap of the feature-based linear regression: refit on n_iterations
# resamples (with replacement, same size as the data) and keep the R^2 / MSE
# of each fit, scored on the resample it was fitted on.
r2_scores = []
mse_scores = []

# Convert to a float matrix once, up front. The feature frame can mix
# int/float columns with boolean indicator columns, in which case `.values`
# would be an object array that each of the fits below has to convert again.
X_vals = X_feat.to_numpy(dtype=float)
y_vals = y_feat.values
n_size = len(df_encoded)
n_iterations = 1000

# One seeded generator shared by all draws: each iteration gets a different
# resample, but the sequence (and so the reported intervals) is the same on
# every run, matching the random_state=42 used for the models and splits.
rng = np.random.RandomState(42)

for i in range(n_iterations):
    X_resampled, y_resampled = resample(X_vals, y_vals, n_samples=n_size, replace=True, random_state=rng)
    model = LinearRegression()
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_resampled)
    r2_scores.append(r2_score(y_resampled, y_pred))
    mse_scores.append(mean_squared_error(y_resampled, y_pred))

# Percentile bootstrap: the 2.5th and 97.5th percentiles bound the 95% interval.
# The R^2 line now uses the same "label: value" layout as the MSE line; the
# original printed the value fused directly onto the label.
print(f"After feature adding: R^^2(mean): {np.mean(r2_scores):.4f}, 95% CI: ({np.percentile(r2_scores, 2.5):.4f}, {np.percentile(r2_scores, 97.5):.4f})")
print(f"After feature adding: MSE(mean): {np.mean(mse_scores):.4f}, 95% CI: ({np.percentile(mse_scores, 2.5):.4f}, {np.percentile(mse_scores, 97.5):.4f})")


# Linear regression on the extended feature set, fitted on the training split
# and scored on the held-out split.
model_feat = LinearRegression().fit(X_train_feat, y_train_feat)
y_pred_feat = model_feat.predict(X_test_feat)

linear_feat_r2 = r2_score(y_test_feat, y_pred_feat)
linear_feat_mse = mean_squared_error(y_test_feat, y_pred_feat)
print(f"Linear Regression R^^2 (with features): {linear_feat_r2:.4f}")
print(f"Linear Regression MSE (with features): {linear_feat_mse:.4f}")

# Rank the predictors by the magnitude of their fitted coefficient (sign is
# kept in the printed values) and show the ten largest.
coefs = pd.Series(model_feat.coef_, index=features)
coefs = coefs.sort_values(key=abs, ascending=False)
print("\nBest 10 feature for model (using Linear Regression)")
print(coefs.head(10))

# Random forest model (with the extended feature set), fitted on the training
# split and scored on the held-out split; seeded for repeatable results.
random_forest_features = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_features.fit(X_train_feat, y_train_feat)
predicted_random_forest_values = random_forest_features.predict(X_test_feat)

forest_feat_r2 = r2_score(y_test_feat, predicted_random_forest_values)
forest_feat_mse = mean_squared_error(y_test_feat, predicted_random_forest_values)
print(f"\nRandom Forest R^^2 (with features): {forest_feat_r2:.4f}")
print(f"Random Forest MSE (with features): {forest_feat_mse:.4f}")

# Rank the predictors by the forest's importance score and show the top ten.
feat_importances = pd.Series(
    random_forest_features.feature_importances_,
    index=features,
)
feat_importances = feat_importances.sort_values(ascending=False)
print("\nBest 10 feature for model (Using Random Forest):")
print(feat_importances.head(10))


