In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Column labels applied to the raw export, which is read without a header row.
RAW_COLUMNS = ['DATETIME', 'TIME', 'LON', 'LAT', 'SST']

# Skip the first 14 lines of the file and parse the rest as unlabeled data.
df = pd.read_csv(
    'sea surface temperature.csv',
    skiprows=14,
    header=None,
)
df.columns = RAW_COLUMNS

# Only the numeric TIME column is kept as the time axis; DATETIME is discarded.
df = df.drop(columns='DATETIME')


# -1.0E+34 is treated as the missing-value marker: turn it into NaN so it is
# left out of the per-TIME means computed below.
df['SST'] = df['SST'].replace(-1.0E+34, np.nan)

# Mean SST for each TIME value, taken over the non-missing readings only.
daily_means = df.groupby('TIME')['SST'].mean()

# Fill every gap with the mean of its own TIME group in one vectorised pass.
# This replaces a per-group loop that rebuilt two full-length boolean masks for
# every TIME value. Rows whose group has no valid reading stay NaN, as before.
df['SST'] = df['SST'].fillna(df['TIME'].map(daily_means))

print(df)

          TIME      LON      LAT        SST
0      44194.5  83.6039  15.9814  27.004600
1      44194.5  84.0844  15.9814  27.055400
2      44194.5  81.2013  16.3151  26.693600
3      44194.5  81.6818  16.3151  26.680700
4      44194.5  82.1623  16.3151  26.724300
...        ...      ...      ...        ...
20374  44557.5  82.1623  18.4608  26.993319
20375  44557.5  82.6428  18.4608  26.993319
20376  44557.5  83.1233  18.4608  26.993319
20377  44557.5  83.6039  18.4608  26.993319
20378  44557.5  84.0844  18.4608  26.630600

[20379 rows x 4 columns]


In [2]:
# Predictors are the time stamp and grid position; the target is the SST value.
FEATURE_COLUMNS = ['TIME', 'LON', 'LAT']
TARGET_COLUMN = 'SST'

X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# Seeded 80/20 hold-out split.
# NOTE(review): rows are split at random rather than by TIME, so the train and
# test sets contain rows from the same days — confirm this is the intended
# evaluation if the goal is to forecast future days.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Fit a linear regression on the training split (fit() returns the estimator).
lr_model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred_lr = lr_model.predict(X_test)
mse_lr, r2_lr = (
    mean_squared_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)

print(
    "Linear Regression Results:",
    f"Mean Squared Error: {mse_lr:.6f}",
    f"R² Score: {r2_lr:.6f}",
    sep="\n",
)

# One row per input feature, paired with its fitted coefficient.
coefficient_table = {
    'Feature': ['TIME', 'LON', 'LAT'],
    'Coefficient': lr_model.coef_,
}
coefficients = pd.DataFrame(coefficient_table)
intercept = lr_model.intercept_

print("Model Coefficients:")
print(coefficients)
print(f"Intercept: {intercept:.6f}")


Linear Regression Results:
Mean Squared Error: 1.507607
R² Score: 0.057740
Model Coefficients:
  Feature  Coefficient
0    TIME     0.003088
1     LON    -0.004634
2     LAT    -0.032777
Intercept: -107.392613


In [4]:
# Fit a 100-tree random forest on the same training split as the linear model.
# The fixed random_state makes the fitted forest repeatable between runs.
# (RandomForestRegressor, mean_squared_error and r2_score come from the
# notebook's top import cell; they are not re-imported here.)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Results:")
print(f"Mean Squared Error: {mse_rf:.6f}")
print(f"R² Score: {r2_rf:.6f}")

# Importance of each input feature, largest first.
feature_importance = pd.DataFrame({
    'Feature': ['TIME', 'LON', 'LAT'],
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)



Random Forest Results:
Mean Squared Error: 0.003174
R² Score: 0.998016
Feature Importance:
  Feature  Importance
0    TIME    0.967495
2     LAT    0.016295
1     LON    0.016210


In [6]:
# Forecast target: one TIME unit past the latest time stamp in the data.
last_day = df['TIME'].max()
next_day = last_day + 1.0

# Every distinct (LON, LAT) pair present in the data.
unique_locations = df[['LON', 'LAT']].drop_duplicates()

# Feature frame for the forecast: the same TIME for every location, with the
# columns in the order the models were fit on (TIME, LON, LAT).
next_day_df = pd.DataFrame({
    'TIME': [next_day] * len(unique_locations),
    'LON': unique_locations['LON'].values,
    'LAT': unique_locations['LAT'].values,
})

# Run both models before attaching any result column, so predict() receives
# only the three feature columns.
lr_predictions = lr_model.predict(next_day_df)
rf_predictions = rf_model.predict(next_day_df)

# NOTE(review): these labels say "SSH", but both models were fit with the SST
# column as the target — confirm whether the suffix should be "SST".
next_day_df['LR_Predicted_SSH'] = lr_predictions
next_day_df['RF_Predicted_SSH'] = rf_predictions

print(f"\nPredictions for day {next_day} (relative to Jan 1, 1901):")
print(next_day_df.head(10))



Predictions for day 44559.5 (relative to Jan 1, 1901):
      TIME      LON      LAT  LR_Predicted_SSH  RF_Predicted_SSH
0  44559.5  83.6039  15.9814         29.292581         27.358481
1  44559.5  84.0844  15.9814         29.290354         27.576484
2  44559.5  81.2013  16.3151         29.292777         27.048844
3  44559.5  81.6818  16.3151         29.290550         26.908744
4  44559.5  82.1623  16.3151         29.288323         26.857612
5  44559.5  82.6428  16.3151         29.286097         26.900171
6  44559.5  83.1233  16.3151         29.283870         27.118387
7  44559.5  83.6039  16.3151         29.281643         27.338326
8  44559.5  84.0844  16.3151         29.279417         27.437125
9  44559.5  81.2013  16.6555         29.281619         26.992770
