<a href="https://colab.research.google.com/github/ccorrad1/Machine-Learning-Project/blob/main/502_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

# for random forest
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
pip install xgboost

In [None]:
# Mount Google Drive so the CSV stored there can be read from this runtime.
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Path of the dataset inside the mounted Drive.
file_path = '/content/drive/MyDrive/Data502.csv'
df = pd.read_csv(file_path)

# Show the first rows of the loaded frame.
df.head()

In [None]:
# Count the rows per target class to check whether the labels are skewed.
df["tsunami"].value_counts()

In [None]:
# Logistic Regression
# Reference: https://www.geeksforgeeks.org/machine-learning/ml-logistic-regression-using-python/

# The target is the "tsunami" column; every other column is used as a feature.
y = df["tsunami"]
X = df.drop(columns="tsunami")

# Hold out 20% of the rows for testing (80% for training); stratify=y keeps the
# class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the logistic regression model on the standardized training data.
log_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Evaluate the model on the held-out test rows.
y_pred = log_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Decision Tree: fit a baseline tree, then tune ("prune") it with a
# cross-validated grid search and compare test accuracy before and after.
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# random_state makes the tree reproducible; the 'random' splitter and feature
# subsampling explored in the grid below are otherwise nondeterministic.
clf = DecisionTreeClassifier(criterion="gini", max_leaf_nodes=12, random_state=42)
clf.fit(X_train, y_train)

# Plot the baseline tree and record its accuracy on the test split.
tree.plot_tree(clf, filled=True)
accuracy_before_pruning = clf.score(X_test, y_test)
print("Accuracy before pruning:", accuracy_before_pruning)

# Pruning
# https://www.geeksforgeeks.org/machine-learning/pruning-decision-trees/
parameter = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7],
    'splitter': ['best', 'random'],
    # 'auto' is no longer accepted by DecisionTreeClassifier (removed in
    # scikit-learn 1.3) and made every candidate using it fail to fit.
    # None means "consider all features" at each split.
    'max_features': [None, 'sqrt', 'log2'],
}

# 5-fold cross-validated search over the grid; the best candidate is refit on
# the full training split (GridSearchCV's default refit=True).
cv = GridSearchCV(clf, param_grid=parameter, cv=5)
cv.fit(X_train, y_train)

# Report the selected hyper-parameters and how the tuned tree does on the test split.
print("Best parameters:", cv.best_params_)
accuracy_after_pruning = cv.score(X_test, y_test)
print("Accuracy after pruning:", accuracy_after_pruning)

# TODO: render cv.best_estimator_ (e.g. tree.plot_tree or graphviz) with real
# feature names; X_train is a plain array after scaling, so names must be
# taken from the unscaled feature frame.

In [None]:
# The example above uses a single decision tree.
# The next cell combines many trees (a random forest) for better prediction.

In [None]:
# Random forest: 300 trees with a fixed seed, fitted on the standardized training split.
rf_model = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Predict the held-out test rows and measure accuracy.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report accuracy as a percentage, followed by the confusion matrix and per-class metrics.
print("Random Forest Accuracy: {:.2f}%".format(accuracy * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
def predict_tsunami(
    magnitude, cdi, mmi, sig, nst, dmin, gap, depth,
    latitude, longitude, Year, Month,
    model=rf_model, scaler=scaler
):
    """Predict the tsunami label and class-1 probability for a single event.

    The twelve feature arguments are packed into a one-row DataFrame whose
    column names are the argument names, scaled with ``scaler`` and passed
    to ``model``.

    Parameters
    ----------
    magnitude, cdi, mmi, sig, nst, dmin, gap, depth, latitude, longitude, Year, Month
        Feature values describing the event.
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
        Defaults to the ``rf_model`` object bound when this function was defined.
    scaler
        Fitted transformer exposing ``transform``. Defaults to the ``scaler``
        object bound when this function was defined.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the predicted label for the event and
        the predicted probability of class ``1``.
    """
    new_event = pd.DataFrame([{
        "magnitude": magnitude,
        "cdi": cdi,
        "mmi": mmi,
        "sig": sig,
        "nst": nst,
        "dmin": dmin,
        "gap": gap,
        "depth": depth,
        "latitude": latitude,
        "longitude": longitude,
        "Year": Year,
        "Month": Month
    }])

    # A scaler fitted on a DataFrame remembers its column order and rejects
    # input whose columns are ordered differently. When the same set of names
    # is present, align to the fitted order; otherwise leave the frame as-is
    # so the scaler reports any genuine mismatch itself.
    fitted_columns = getattr(scaler, "feature_names_in_", None)
    if fitted_columns is not None and set(fitted_columns) == set(new_event.columns):
        new_event = new_event[list(fitted_columns)]

    # Scale with the same scaler that was used for training.
    new_event_scaled = scaler.transform(new_event)

    # Predict class (0 or 1).
    prediction = model.predict(new_event_scaled)[0]

    # Probability of tsunami (class = 1). Locate the column through
    # model.classes_ rather than assuming its position; fall back to index 1
    # when the model does not expose a class labelled 1.
    class_probabilities = model.predict_proba(new_event_scaled)[0]
    class_labels = list(getattr(model, "classes_", ()))
    positive_index = class_labels.index(1) if 1 in class_labels else 1
    probability = class_probabilities[positive_index]

    return prediction, probability

In [None]:
# Feature values for one example event, keyed by predict_tsunami's argument names.
example_event = {
    "magnitude": 7.1,
    "cdi": 5.0,
    "mmi": 4.2,
    "sig": 600,
    "nst": 20,
    "dmin": 0.03,
    "gap": 60,
    "depth": 12.4,
    "latitude": 35.22,
    "longitude": -118.54,
    "Year": 2025,
    "Month": 11,
}

# Run the example through the default model and scaler.
example_pred, example_prob = predict_tsunami(**example_event)

print("\n--- Example Prediction ---")
print("Predicted tsunami (0=no, 1=yes):", example_pred)
print("Probability of tsunami:", example_prob)

In [None]:
# The XGBoost model starts here.

In [None]:
# 1. Create the model
xgb_model = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.05,
    max_depth=4,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# 2. Fit the model
xgb_model.fit(X_train, y_train)

# 3. Predict on test data
y_pred = xgb_model.predict(X_test)

# 4. Accuracy, printed as a percentage with two decimals so it can be compared
#    directly with the logistic regression and random forest results
accuracy = accuracy_score(y_test, y_pred)
print("XGBoost Accuracy: {:.2f}%".format(accuracy * 100))

# 5. Confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 6. Classification report (precision, recall, F1)
print("\nClassification Report:\n", classification_report(y_test, y_pred))