In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# --- Load and clean the salinity data ---------------------------------------
# The first 14 lines of the file are skipped and no header row is read, so the
# six column names are assigned explicitly.
df = pd.read_csv('salinity.csv', skiprows=14, header=None)
df.columns = ['DATETIME', 'TIME', 'LON', 'LAT', 'DEP', 'SALT']

# DATETIME is not used by the modelling below; TIME is the temporal feature.
df = df.drop(columns='DATETIME')

# -1.0E+34 is treated as the missing-value sentinel for SALT: turn it into NaN
# so that pandas skips it when averaging.
df['SALT'] = df['SALT'].replace(-1.0E+34, np.nan)

# Mean SALT per TIME value (NaNs are ignored by mean()).
daily_means = df.groupby('TIME')['SALT'].mean()

# Fill every missing SALT with the mean of its own TIME group. transform('mean')
# broadcasts the group mean back onto each row, so this is a single vectorised
# pass instead of re-scanning the whole frame once per TIME value.
df['SALT'] = df['SALT'].fillna(df.groupby('TIME')['SALT'].transform('mean'))

# NOTE(review): SALT is also the regression target below, so the filled rows
# become synthetic labels that are constant within a TIME group. This probably
# flatters any model that can key on TIME - consider dropping rows with a
# missing target instead; confirm with the analysis owner.
print(df)


# Target column is SALT; the features are TIME, LON and LAT (DEP is left out).
y = df['SALT']
X = df[['TIME', 'LON', 'LAT']]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)





# --- Linear Regression -------------------------------------------------------
# Fit an ordinary least-squares model on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Results:")
print(f"Mean Squared Error: {mse_lr:.6f}")
print(f"R² Score: {r2_lr:.6f}")

# coef_ holds one entry per column of the frame the model was fitted on, in
# column order. Taking the labels from X_train (rather than a hand-typed list)
# keeps them aligned with the coefficients if the feature set ever changes.
coefficients = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Coefficient': lr_model.coef_
})
intercept = lr_model.intercept_

print("Model Coefficients:")
print(coefficients)
print(f"Intercept: {intercept:.6f}")





# --- Random Forest -----------------------------------------------------------
# RandomForestRegressor, mean_squared_error and r2_score come from the import
# block at the top of the cell; they are not re-imported here.

# 100 trees; the fixed random_state makes the fitted forest repeatable.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on the same held-out split used for the linear model.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Results:")
print(f"Mean Squared Error: {mse_rf:.6f}")
print(f"R² Score: {r2_rf:.6f}")

# feature_importances_ has one entry per fitted column, in column order. The
# labels are taken from X_train so they cannot drift out of step with the
# importances; rows are then sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)



          TIME    LON      LAT  DEP       SALT
0      44194.5  83.25  15.9798    5  30.231600
1      44194.5  83.75  15.9798    5  32.022100
2      44194.5  81.25  16.3134    5  27.598800
3      44194.5  81.75  16.3134    5  27.677900
4      44194.5  82.25  16.3134    5  27.813800
...        ...    ...      ...  ...        ...
17463  44557.5  81.75  18.4590    5  31.086408
17464  44557.5  82.25  18.4590    5  31.086408
17465  44557.5  82.75  18.4590    5  31.086408
17466  44557.5  83.25  18.4590    5  31.086408
17467  44557.5  83.75  18.4590    5  31.086408

[17468 rows x 5 columns]
Linear Regression Results:
Mean Squared Error: 2.443780
R² Score: 0.012886
Model Coefficients:
  Feature  Coefficient
0    TIME     0.000390
1     LON     0.124357
2     LAT    -0.141613
Intercept: 6.167745

Random Forest Results:
Mean Squared Error: 0.003913
R² Score: 0.998419
Feature Importance:
  Feature  Importance
0    TIME    0.841837
2     LAT    0.082540
1     LON    0.075623


In [5]:
# --- Predict salinity for the day after the last observation -----------------
last_day = df['TIME'].max()
next_day = last_day + 1.0

# One row per distinct (LON, LAT) pair present in the data.
unique_locations = df[['LON', 'LAT']].drop_duplicates()

# Feature frame for the new day, built in one step. Column names and order
# match the frame both models were fitted on.
feature_columns = ['TIME', 'LON', 'LAT']
next_day_df = pd.DataFrame({
    'TIME': [next_day] * len(unique_locations),
    'LON': unique_locations['LON'].values,
    'LAT': unique_locations['LAT'].values,
})

# Predict on an explicit feature selection so that re-running these lines after
# the prediction columns have been added still feeds the models only features.
# NOTE(review): next_day is later than every TIME value seen during training.
# A tree ensemble cannot extrapolate past its training range, so the random
# forest output presumably mirrors the most recent training days rather than
# a trend - treat it as indicative only.
lr_predictions = lr_model.predict(next_day_df[feature_columns])
rf_predictions = rf_model.predict(next_day_df[feature_columns])

# Both models were fitted with SALT as the target, so the prediction columns
# are labelled as salinity.
next_day_df['LR_Predicted_SALT'] = lr_predictions
next_day_df['RF_Predicted_SALT'] = rf_predictions

print(f"\nPredictions for day {next_day} (relative to Jan 1, 1901):")
print(next_day_df.head(10))



Predictions for day 44558.5 (relative to Jan 1, 1901):
      TIME    LON      LAT  LR_Predicted_SSH  RF_Predicted_SSH
0  44558.5  83.25  15.9798         31.652642         32.171433
1  44558.5  83.75  15.9798         31.714821         33.041675
2  44558.5  81.25  16.3134         31.356685         30.258862
3  44558.5  81.75  16.3134         31.418864         30.770100
4  44558.5  82.25  16.3134         31.481043         30.900226
5  44558.5  82.75  16.3134         31.543221         31.067766
6  44558.5  83.25  16.3134         31.605400         31.966325
7  44558.5  83.75  16.3134         31.667579         32.822510
8  44558.5  81.25  16.6538         31.308480         31.092874
9  44558.5  81.75  16.6538         31.370659         31.086964
