In [11]:
import numpy as np
import pandas as pd
from mgwr.gwr import MGWR, GWR
from mgwr.sel_bw import Sel_BW
from shapely.geometry import Point
import geopandas as gpd
import matplotlib.pyplot as plt


In [12]:

# Training table; the path is resolved relative to the kernel's working directory.
train_csv_path = '../../data/train.csv'
data = pd.read_csv(train_csv_path)


In [13]:

# Derived predictor: difference between the current_max and current_min columns.
data['current_range'] = data['current_max'].sub(data['current_min'])


In [14]:

# Column holding the regression target.
response = 'mean_gs'

# Covariate columns, in the order they appear in the design matrix.
predictors = [
    'current_mean',
    'current_range',
    'gebco',
]


In [15]:

# Re-project the sample locations from WGS84 lon/lat (EPSG:4326) to UTM zone 33N
# (EPSG:32633) and store the projected coordinates back in data['x'] / data['y'].
# NOTE(review): the input CRS is an assumption carried over from the original
# cell, and zone 33N should be confirmed against the study area.
#
# The untouched input coordinates are stashed in 'x_wgs84' / 'y_wgs84' the first
# time this cell runs, and the projection always starts from those columns, so
# re-running the cell does not project already-projected values a second time.
if 'x_wgs84' not in data.columns:
    data['x_wgs84'] = data['x']
    data['y_wgs84'] = data['y']

gdf = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data['x_wgs84'], data['y_wgs84']),
)
gdf = gdf.set_crs(epsg=4326)  # Assuming WGS84
gdf = gdf.to_crs(epsg=32633)  # Convert to UTM zone 33N

# Downstream cells read the projected coordinates from these two columns.
data['x'] = gdf.geometry.x
data['y'] = gdf.geometry.y


In [16]:

# Plain NumPy inputs for the bandwidth search and the regression.
y = data[response].to_numpy().reshape(-1, 1)  # response as a column vector, shape (n, 1)
X = data[predictors].to_numpy()               # one column per entry in `predictors`
coords = data[['x', 'y']].to_numpy()          # one (x, y) pair per observation


In [17]:
# Bandwidth selector for a classic GWR: with multi=False a single bandwidth is
# searched for the whole model rather than one per covariate.
selector = Sel_BW(coords, y, X, multi=False)

# Golden-section search capped at 20 iterations; verbose=True logs every
# candidate bandwidth together with its score.
bws = selector.search(
    verbose=True,
    search_method='golden_section',
    max_iter=20,
)
print(bws)

Bandwidth:  836.0 , score:  9886.94
Bandwidth:  1322.0 , score:  9935.94
Bandwidth:  535.0 , score:  9832.32
Bandwidth:  349.0 , score:  9759.59
Bandwidth:  234.0 , score:  9713.81
Bandwidth:  163.0 , score:  9705.43
Bandwidth:  119.0 , score:  9720.16
Bandwidth:  190.0 , score:  9707.09
Bandwidth:  146.0 , score:  9709.61
Bandwidth:  173.0 , score:  9705.83
Bandwidth:  156.0 , score:  9707.70
Bandwidth:  167.0 , score:  9705.41
Bandwidth:  169.0 , score:  9705.82
Bandwidth:  165.0 , score:  9705.10
165.0


In [18]:

# Fit a geographically weighted regression using the bandwidth stored in `bws`.
# NOTE: despite its name, mgwr_model is a plain GWR (one bandwidth), not an MGWR;
# the name is kept because the prediction cell further down refers to it.
mgwr_model = GWR(coords, y, X, bws)
results = mgwr_model.fit()

# summary() prints the report itself and returns None by default, so it must
# not be wrapped in print() -- that would append a stray "None" to the output.
results.summary()


Model type                                                         Gaussian
Number of observations:                                                2110
Number of covariates:                                                     4

Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares:                                          15236.636
Log-likelihood:                                                   -5079.711
AIC:                                                              10167.422
AICc:                                                             10169.450
BIC:                                                               -883.621
R2:                                                                   0.122
Adj. R2:                                                              0.121

Variable                              Est.         SE  t(Est/SE)    p-value
------------------------------- ---------- ---------- ------

In [19]:

# Hold-out table to predict on; the path is resolved relative to the kernel's working directory.
test_csv_path = '../../data/test.csv'
test_data = pd.read_csv(test_csv_path)


In [20]:

# Re-project the test locations from WGS84 lon/lat (EPSG:4326) to UTM zone 33N
# (EPSG:32633) and store the projected coordinates back in test_data['x'] /
# test_data['y'].
# NOTE(review): the input CRS is assumed to be WGS84; confirm that zone 33N
# covers the study area.
#
# The untouched input coordinates are stashed in 'x_wgs84' / 'y_wgs84' the first
# time this cell runs, and the projection always starts from those columns, so
# re-running the cell does not project already-projected values a second time.
if 'x_wgs84' not in test_data.columns:
    test_data['x_wgs84'] = test_data['x']
    test_data['y_wgs84'] = test_data['y']

test_gdf = gpd.GeoDataFrame(
    test_data,
    geometry=gpd.points_from_xy(test_data['x_wgs84'], test_data['y_wgs84']),
)
test_gdf = test_gdf.set_crs(epsg=4326)   # declare the input CRS (assumed WGS84)
test_gdf = test_gdf.to_crs(epsg=32633)   # convert to UTM zone 33N

# Downstream cells read the projected coordinates from these two columns.
test_data['x'] = test_gdf.geometry.x
test_data['y'] = test_gdf.geometry.y


In [21]:

# Derived predictor for the test table: current_max minus current_min.
test_data['current_range'] = test_data['current_max'].sub(test_data['current_min'])


In [22]:

# NumPy inputs for prediction: the test covariates (columns listed in
# `predictors`) and the (x, y) location of every test row.
test_X = test_data[predictors].to_numpy()
test_coords = test_data[['x', 'y']].to_numpy()


In [26]:
# Out-of-sample prediction with the fitted model. predict() is handed the
# scale and response residuals of the calibration fit along with the new
# locations and covariates.
scale = results.scale
residuals = results.resid_response
test_predictions = mgwr_model.predict(test_coords, test_X, scale, residuals)

# Store the predicted response under the column name used in the submission file.
test_data['mean_gs'] = test_predictions.predictions


In [27]:

# Export only the row identifier and the predicted response as the submission CSV.
submission = test_data.loc[:, ['id', 'mean_gs']]
submission.to_csv('test_submission_mgwr.csv', index=False)
