In [None]:
# Import libs
import pandas
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression

# Read the preprocessed dataset from the working directory.
data = pandas.read_csv('preprocessed_data.csv')

# Predictor columns, in the order they are passed to the models below.
feature_columns = [
	'latitude', 'longitude',
	'year', 'month', 'day', 'hour', 'minute',
	'nst', 'gap', 'dmin', 'rms',
	'horizontalError', 'depthError', 'magError', 'magNst',
]

# Predictors (x) and regression target (y, the 'mag' column).
x = data[feature_columns]
y = data['mag']

# Split into training and testing datasets: 20% of the rows are held out,
# and the fixed random_state keeps the split identical across re-runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

## Model 1: Train on all features

In [6]:
# Fit a linear regression on every input feature of the training split.
full_model = LinearRegression()
full_model.fit( xtrain, ytrain )

# Save the full model
with open( 'full_model.model', 'wb' ) as f:
	pickle.dump( full_model, f )

# Load the model back from disk and score it on the held-out split.
# A context manager is used so the read handle is closed deterministically
# instead of lingering until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
with open( 'full_model.model', 'rb' ) as f:
	loaded_model = pickle.load( f )

# For a regressor, .score() returns the coefficient of determination (R^2).
result = loaded_model.score( xtest, ytest )
print( "Full model score: ", result )

Full model score:  0.039113784308980826


## Model 2: Train on reduced features

In [7]:
# Keep the 4 features ranked highest by the univariate f_regression score.
# The selector is fitted on the training split only and then applied
# unchanged to the test split, so no test information leaks into selection.
selector = SelectKBest( score_func = f_regression, k = 4 )
xtrain_reduced = selector.fit_transform( xtrain, ytrain )
xtest_reduced = selector.transform( xtest )

# Get and print selected features
# get_support() is a boolean mask over the input columns; x and xtrain share
# the same column order, so the mask can index x.columns directly.
selected_features = x.columns[ selector.get_support() ].tolist()
print( "Selected features:", selected_features )

# Train reduced model
reduced_model = LinearRegression()
reduced_model.fit( xtrain_reduced, ytrain )

# Save the reduced model
# Only the regressor is pickled, not the selector: anything that loads this
# file must feed it the same 4 selected columns, in the same order.
with open( 'reduced_model.model', 'wb' ) as f:
	pickle.dump( reduced_model, f )

# Load the reduced model and score
# A context manager is used so the read handle is closed deterministically
# instead of lingering until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
with open( 'reduced_model.model', 'rb' ) as f:
	loaded_reduced_model = pickle.load( f )

# For a regressor, .score() returns the coefficient of determination (R^2).
result = loaded_reduced_model.score( xtest_reduced, ytest )
print( "Reduced model score: ", result )

Selected features: ['longitude', 'year', 'month', 'hour']
Reduced model score:  0.03750846670345098
