In [2]:
# Libraries
import pandas
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Read the preprocessed dataset from the working directory
data = pandas.read_csv("augmented_data.csv")
print("Dataset read.")

# Columns fed to the model as inputs; the target is the 'mag' column
feature_columns = [
	'latitude', 'longitude',  # Location
	'year', 'month', 'day', 'hour',  # Time; Removing 'minute' increased scores slightly
	'nst', 'gap', 'dmin',
	'rms', 'horizontalError', 'depthError', 'magNst',  # Quality
]
x = data[feature_columns]
y = data['mag']
print("Features saved.")

# Split into training and testing datasets (20% held out, fixed seed for repeatability)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
print("Data split.")

# Held-out rows with a strictly positive target; later cells treat these as
# the "real quake" subset of the test data.
# NOTE(review): presumably non-positive 'mag' marks augmented rows - confirm
# against the preprocessing step that produced augmented_data.csv.
positive_test_mask = ytest > 0
real_xtest = xtest.loc[positive_test_mask]
real_ytest = ytest.loc[positive_test_mask]

Dataset read.
Features saved.
Data split.


In [3]:
# Tune on the training rows whose target is positive ("real quake" data) only
positive_train_mask = ytrain > 0
real_xtune = xtrain.loc[positive_train_mask]
real_ytune = ytrain.loc[positive_train_mask]

# Hyperparameter candidates: 2 * 3 * 2 * 2 * 2 = 48 combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search, 3-fold cross-validation, ranked by R²
# NOTE(review): both the forest and the search request all cores (n_jobs=-1);
# confirm this nested parallelism is intended on the target machine.
base_forest = RandomForestRegressor(random_state=42, n_jobs=-1)
grid = GridSearchCV(
    base_forest,
    param_grid,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Fit every candidate on the positive-target training subset
grid.fit(real_xtune, real_ytune)

# Estimator refit with the winning parameter combination
best_rf = grid.best_estimator_
print("✅ Best Parameters:", grid.best_params_)

# Score the tuned model on the positive-target rows of the held-out test set
positive_test_mask = ytest > 0
real_xtest = xtest.loc[positive_test_mask]
real_ytest = ytest.loc[positive_test_mask]
real_score = best_rf.score(real_xtest, real_ytest)
print("🎯 Real quake R² score (tuned):", real_score)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
✅ Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
🎯 Real quake R² score (tuned): 0.466028595335911


In [4]:
# Random Forest
# Final model: fit on the full training split, persist it to disk, then
# reload the saved artifact and score it on the test data.
#
# The hyperparameters below are hard-coded copies of the best combination
# printed by the grid-search cell, so this cell can be run without repeating
# the search. Note that the search was fitted on the positive-target training
# rows only, whereas this model is fitted on all of xtrain / ytrain.
model_path = 'random_forest.model'

random_forest = RandomForestRegressor(
	n_estimators = 200,
	max_depth = 20,
	max_features = 'sqrt',
	min_samples_leaf = 1,
	min_samples_split = 2,
	n_jobs = -1,
	random_state = 42,
)
random_forest.fit( xtrain, ytrain )

# Serialize the fitted model
with open(model_path, 'wb') as f:
	pickle.dump( random_forest, f )

# Reload from disk so the scores below are produced by the saved artifact.
# The file is opened in a context manager so the handle is always closed
# (previously it was opened inline and never closed).
# pickle.load can execute arbitrary code: this is only acceptable here because
# the file was written by this same cell. Do not load untrusted model files.
with open(model_path, 'rb') as f:
	random_forest_model = pickle.load( f )

# R² on the whole test split and on its positive-target ("real") subset
score_rf = random_forest_model.score( xtest, ytest )
real_score_rf = random_forest_model.score( real_xtest, real_ytest )
print("Real test score:", real_score_rf )
print("Random forest model score:", score_rf )

Real test score: 0.465908868228333
Random forest model score: 0.9462657070967342
