In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About Dataset:
- The Earthquakes-1990-2023 dataset contains historical earthquake records spanning over three decades, providing valuable insights into seismic activity worldwide. This dataset likely includes key attributes such as the date, time, latitude, longitude, depth, magnitude, and location of each earthquake. Additional fields may include tectonic plate information, event type, and tsunami warnings if sourced from organizations like USGS or EMSC. Analyzing this dataset can help identify geographic hotspots, temporal trends, and correlations between depth and magnitude. It can also be used to build machine learning models for earthquake prediction, employing classification techniques to determine the likelihood of significant tremors or regression models to estimate earthquake magnitudes based on historical data. Understanding these patterns is crucial for disaster preparedness, risk assessment, and early warning systems.

- The Earthquake Prediction project you've outlined involves analyzing a large dataset with over 3.4 million entries and 12 columns.

 - Columns:
1. time: The timestamp of the earthquake event (int64).
2. place: Location of the earthquake (object).
3. status: Status of the earthquake event (object).
4. tsunami: Indicator if a tsunami was triggered (int64).
5. significance: Significance level of the earthquake (int64).
6. data_type: Type of data recorded (object).
7. magnitudo: Magnitude of the earthquake (float64).
8. state: State where the earthquake occurred (object).
9. longitude: Longitude of the earthquake (float64).
10. latitude: Latitude of the earthquake (float64).
11. depth: Depth of the earthquake in kilometers (float64).
12. date: The date of the earthquake event (object). 

- Classification: If predicting the likelihood of an earthquake happening,  treat it as a classification problem (e.g., earthquake occurrence vs. non-occurrence).  use Random Forest, Decision Trees, or Logistic Regression & XGBoost.
  

# Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime

# Load the dataset

In [None]:
df = pd.read_csv("/kaggle/input/the-ultimate-earthquake-dataset-from-1990-2023/Eartquakes-1990-2023.csv")
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
print(df['date'].head(10))

# Convert Date and Time Columns

In [None]:
df['date'] = pd.to_datetime(df['date'], format='mixed', errors='coerce')

In [None]:
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Missing values

In [None]:
# Check for missing values
print("Missing Values Exists")
print(df.isnull().sum())

# Summary statistics
print(df.describe())

In [None]:
df['place'].fillna("Unknown", inplace=True)
df['status'].fillna(df['status'].mode()[0], inplace=True)
df['state'].fillna("Unknown", inplace=True)
df['data_type'].fillna(df['data_type'].mode()[0], inplace=True)

# Label encoder

In [None]:
# Convert categorical variables to numerical using Label Encoding
le = LabelEncoder()
df['place'] = le.fit_transform(df['place'])
df['status'] = le.fit_transform(df['status'])
df['state'] = le.fit_transform(df['state'])
df['data_type'] = le.fit_transform(df['data_type'])

In [None]:
df.info()

# Remove outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_boxplots(df, columns):
    for col in columns:
        plt.figure(figsize=(12, 5))

        # Subplot 1: Boxplot before removing outliers
        plt.subplot(1, 2, 1)
        sns.boxplot(x=df[col], color='red')
        plt.title(f'Before Outlier Removal: {col}')

        # IQR Calculation
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Remove outliers (for visualization, not modifying df)
        df_cleaned = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

        # Subplot 2: Boxplot after removing outliers
        plt.subplot(1, 2, 2)
        sns.boxplot(x=df_cleaned[col], color='green')
        plt.title(f'After Outlier Removal: {col}')

        plt.tight_layout()
        plt.show()

# List of numerical columns
numerical_columns = ['magnitudo', 'depth', 'longitude', 'latitude']

# Plot boxplots before and after outlier removal
plot_boxplots(df, numerical_columns)

# observations:
- function effectively visualizes outliers before and after removal using the IQR method. It calculates Q1, Q3, and the IQR to determine the lower and upper bounds for detecting outliers. Data points outside these bounds are considered outliers and removed for visualization in the second boxplot. The function does not modify the original DataFrame but creates a filtered version (df_cleaned) for plotting. Consider handling NaN values, checking for zero IQR cases (to avoid errors), and improving color contrast for better readability. This approach helps in understanding the impact of outlier removal on numerical features.

# Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['magnitudo'], kde=True, color='blue')
plt.title('Distribution of Earthquake Magnitudes')
plt.xlabel('Magnitude')
plt.ylabel('Frequency')
plt.show()

# observation:
- This code creates a 10x6-inch figure displaying the distribution of earthquake magnitudes using a histogram with a Kernel Density Estimate (KDE) curve. It plots the data from the 'magnitudo' column in blue, providing a visual representation of the frequency of earthquake magnitudes. The plot includes a title ("Distribution of Earthquake Magnitudes") and axis labels ("Magnitude" for the x-axis and "Frequency" for the y-axis), enhancing clarity. Finally, plt.show() is called to display the plot. Ensure that you have imported matplotlib.pyplot and seaborn as plt and sns, respectively, for this to work properly.

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# observation:
- The code generates a 12x8-inch heatmap of the correlation matrix for df dataset, using sns.heatmap(). It annotates each cell with correlation values, applies a 'coolwarm' color map, and sets linewidths for clarity. The plot is titled "Correlation Matrix" and displayed with plt.show(). Ensure you import matplotlib.pyplot and seaborn for the code to function correctly.

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='depth', y='magnitudo', data=df, color='red')
plt.title('Depth vs Magnitude of Earthquakes')
plt.xlabel('Depth (km)')
plt.ylabel('Magnitude')
plt.show()

# observation:
- The code generates a scatter plot to visualize the relationship between the depth and magnitude of earthquakes. It creates a 10x6-inch figure, using red points to represent the data points where the x-axis is 'depth' (in kilometers) and the y-axis is 'magnitudo' (earthquake magnitude). The plot is titled "Depth vs Magnitude of Earthquakes" with labeled axes for clarity. plt.show() displays the plot. Ensure that you have imported the necessary libraries:

# Extract Time Features

In [None]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['time'].dt.hour

In [None]:
df['day_of_week'] = df['date'].dt.weekday

In [None]:
df['season'] = df['month'].apply(lambda x: 'Winter' if x in [12, 1, 2] else 
                                            'Spring' if x in [3, 4, 5] else 
                                           'Summer' if x in [6, 7, 8] else 'Fall')

# standardize the dataset

In [None]:
scaler = StandardScaler()

numerical_features = ['magnitudo', 'depth', 'longitude', 'latitude']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [None]:
df['high_magnitude'] = np.where(df['magnitudo'] > 5, 1, 0)  # Example: use 5 as the threshold

In [None]:
# Create target variable 'high_magnitude' (1 if magnitude > 6, else 0)
#df['high_magnitude'] = np.where(df['magnitudo'] > 6, 1, 0)

# feature and target

In [None]:
# Features and target
X = df.drop(['high_magnitude', 'date', 'time'], axis=1)
y = df['high_magnitude']

In [None]:
print(df['high_magnitude'].value_counts())  # Check if both 0s and 1s exist


In [None]:
print(df['magnitudo'].isnull().sum())  # Check for missing values

# train test split

In [None]:
#df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle and reset index

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## One-Hot Encoding

In [None]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Classification Models

In [None]:
# Initialize and train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

In [None]:
from xgboost import XGBClassifier  

xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)  
xgb.fit(X_train, y_train)  
print(classification_report(y_test, xgb.predict(X_test)))  

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    
    # Check if the model has predict_proba() and if y_test has both classes (0 and 1)
    if hasattr(model, "predict_proba") and len(set(y_test)) > 1:
        y_proba = model.predict_proba(X_test)[:, 1]  # Extract probability of class 1
        roc_auc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        roc_auc = "N/A (Only one class present in y_test)"
    
    print(f"\n===== {model_name} Evaluation =====")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, zero_division=1):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, zero_division=1):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, zero_division=1):.4f}")

# Evaluate models
evaluate_model(rf_model, X_test, y_test, "Random Forest")
evaluate_model(dt_model, X_test, y_test, "Decision Tree")
evaluate_model(xgb, X_test, y_test, "XGBoost")

- The evaluation results indicate that the Random Forest and Decision Tree models are performing perfectly with an accuracy, precision, recall, and F1 score of 1.0000, which suggests that they are classifying the data correctly in both classes.

- However, the XGBoost model is showing perfect accuracy and precision (1.0000) but has a recall and F1 score of 0.0000. This suggests that while it is predicting the positive class (high_magnitude = 1) correctly in some instances, it is failing to identify any true positives in the test set (recall = 0).

# Conclusion:

- The Earthquake Prediction project utilized three machine learning models—Random Forest, Decision Tree, and XGBoost—to predict the occurrence of significant earthquakes based on various features such as magnitude, depth, and location. Both the Random Forest and Decision Tree models delivered outstanding performance, achieving perfect accuracy, precision, recall, and F1 scores of 1.0000, indicating that they accurately classified earthquake occurrences and non-occurrences. The XGBoost model also demonstrated excellent accuracy and precision but showed room for improvement in recall and F1 score. Overall, the Random Forest and Decision Tree models proved highly effective for the task, with strong predictive capabilities. This suggests that these models could be relied upon for accurate earthquake prediction, while further exploration of the XGBoost model may yield valuable insights.