# Evaluation and Analysis
- Apply 4 or more performance metrics to analyze the results
- Include 3 or more visualization techniques for presenting the results
- Compare and analyze all the ML models that you have built for the project and present the performance analysis

## Setup

In [17]:
# Libraries
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy
import matplotlib.pyplot as plot
import seaborn
import pandas
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from xgboost import XGBRegressor

# Read the preprocessed dataset produced by the earlier pipeline stage
data = pandas.read_csv('preprocessed_data.csv')
print("Dataset read.")

# Columns used as model inputs; the regression target is 'mag'
feature_columns = [
	'latitude', 'longitude', # Location
	'year', 'month', 'day', 'hour', # Time; Removing 'minute' increased scores slightly
	'nst', 'gap', 'dmin',
	'rms', 'horizontalError', 'depthError', 'magNst' # Quality
]
x = data[ feature_columns ]
y = data[ 'mag' ]
print("Features saved.")

# Split into training and testing datasets (80/20, fixed seed for reproducibility)
xtrain, xtest, ytrain, ytest = train_test_split(
	x,
	y,
	test_size = 0.2,
	random_state = 42
)
print("Data split.")

Dataset read.
Features saved.
Data split.


## ML Models

In [18]:
# Linear
# Fit a linear regression baseline on the training split.
linear = LinearRegression()
linear.fit( xtrain, ytrain )

# Persist the fitted model to disk.
with open('linear.model', 'wb') as f:
	pickle.dump( linear, f )

# Reload the saved artifact inside a context manager so the file handle is
# closed deterministically, then score the reloaded copy on the held-out
# split. For regressors, .score() reports the R² coefficient of determination.
with open('linear.model', 'rb') as f:
	loaded_model = pickle.load( f )
score_linear = loaded_model.score( xtest, ytest )
print("Linear model score:", score_linear )

Linear model score: 0.06552262168577538


In [19]:
# Decision Tree
# Fit a single regression tree (seeded so the result is reproducible).
decision_tree = DecisionTreeRegressor( random_state = 42 )
decision_tree.fit( xtrain, ytrain )

# Persist the fitted model to disk.
with open('decision_tree.model', 'wb') as f:
	pickle.dump( decision_tree, f )

# Reload the saved artifact inside a context manager so the file handle is
# closed deterministically, then score the reloaded copy on the held-out
# split. For regressors, .score() reports the R² coefficient of determination.
with open('decision_tree.model', 'rb') as f:
	loaded_model = pickle.load( f )
score_dt = loaded_model.score( xtest, ytest )
print("Decision tree model score:", score_dt )

Decision tree model score: -0.05212163903138589


In [20]:
# Random Forest
# Fit a 100-tree forest using all available cores (seeded for reproducibility).
random_forest = RandomForestRegressor( n_estimators = 100, n_jobs = -1, random_state = 42 )
random_forest.fit( xtrain, ytrain )

# Persist the fitted model to disk.
with open('random_forest.model', 'wb') as f:
	pickle.dump( random_forest, f )

# Reload the saved artifact inside a context manager so the file handle is
# closed deterministically, then score the reloaded copy on the held-out
# split. For regressors, .score() reports the R² coefficient of determination.
with open('random_forest.model', 'rb') as f:
	loaded_model = pickle.load( f )
score_rf = loaded_model.score( xtest, ytest )
print("Random forest model score:", score_rf )

Random forest model score: 0.45987526234652387


In [21]:
# Gradient Boosting
# Fit a histogram-based gradient boosting regressor with up to 200 boosting
# iterations (seeded for reproducibility).
gradient_regression = HistGradientBoostingRegressor( max_iter = 200, random_state=42 )
gradient_regression.fit( xtrain, ytrain)

# Persist the fitted model to disk.
with open('gradient_regression.model', 'wb') as f:
	pickle.dump( gradient_regression, f )

# Reload the saved artifact inside a context manager so the file handle is
# closed deterministically, then score the reloaded copy on the held-out
# split. For regressors, .score() reports the R² coefficient of determination.
with open('gradient_regression.model', 'rb') as f:
	loaded_model = pickle.load( f )
score_gb = loaded_model.score( xtest, ytest )
print("Gradient regression model score:", score_gb )

Gradient regression model score: 0.43206115878759843


In [None]:
# Bagging
# Fit a bagging ensemble of 100 base estimators using all available cores
# (seeded for reproducibility).
bagging_regression = BaggingRegressor( n_estimators=100, n_jobs = -1, random_state=42 )
bagging_regression.fit( xtrain, ytrain)

# Persist the fitted model to disk.
with open('bagging_regression.model', 'wb') as f:
    pickle.dump ( bagging_regression, f)

# Reload the saved artifact inside a context manager so the file handle is
# closed deterministically, then score the reloaded copy on the held-out
# split. For regressors, .score() reports the R² coefficient of determination.
with open('bagging_regression.model', 'rb') as f:
    loaded_model = pickle.load( f )
score_bag = loaded_model.score( xtest, ytest )
print("Bagging Regression model score:", score_bag )

In [None]:
# XGBoost
# Fit a gradient-boosted tree ensemble; row and column subsampling (0.8 each)
# are enabled and the seed is fixed for reproducibility.
xgb = XGBRegressor(
    n_estimators = 200,
    learning_rate = 0.05,
    max_depth = 6,
    subsample = 0.8,
    colsample_bytree = 0.8,
    n_jobs = -1,
    random_state = 42
)
xgb.fit( xtrain, ytrain )

# Persist the fitted model to disk.
with open('xgboost.model', 'wb') as f:
    pickle.dump( xgb, f )

# Reload the saved artifact inside a context manager so the file handle is
# closed deterministically, then score the reloaded copy on the held-out split.
with open( 'xgboost.model', 'rb' ) as f:
    loaded_model = pickle.load( f )
score_xgb = loaded_model.score( xtest, ytest )
print("XGBoost model score:", score_xgb )

XGBoost model score: 0.412589720125651


## Evaluation

In [None]:
# Fitted estimators to compare, keyed by display name.
# The values must be the model objects themselves (not their R² scores),
# because the loop below calls .predict() on each one.
models = {
    "Linear": linear,
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "Gradient Boosting": gradient_regression,
    "Bagging": bagging_regression,
    "XGBoost": xgb
}

# One list per metric; entries are appended in the same order as `models`.
metrics = {
    "R²": [],
    "MAE": [],
    "MSE": [],
    "RMSE": [],
    "MAPE": []
}

for model in models.values():
    y_pred = model.predict( xtest )
    mse = mean_squared_error( ytest, y_pred )  # computed once, reused for RMSE
    metrics["R²"].append( r2_score( ytest, y_pred ))
    metrics["MAE"].append( mean_absolute_error( ytest, y_pred ))
    metrics["MSE"].append( mse )
    metrics["RMSE"].append( numpy.sqrt( mse ))
    metrics["MAPE"].append( mean_absolute_percentage_error( ytest, y_pred ))

# Rows are models, columns are metrics; rounded for readable display.
results_df = pandas.DataFrame( metrics, index=list( models ) )
results_df = results_df.round( 4 )
display( results_df )

# Visualizations
# Heatmap of metrics

# Bar plot of R2 scores

# MAE and RMSE comparison
