In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Selects the feature columns that remain after pruning highly correlated ones
def correlation(dataset, threshold):
    """Return the columns of ``dataset`` that survive correlation pruning.

    The lower triangle of ``dataset.corr()`` is scanned in column order. A
    column is pruned when its correlation with any earlier column that has
    not itself been pruned is ``>= threshold``. The comparison uses the
    signed coefficient, so strongly negative correlations are not pruned.

    The input frame is NOT modified; use ``dataset[correlation(dataset, t)]``
    to obtain the reduced frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are candidate features.
    threshold : float
        Correlation value at or above which the later column is pruned.

    Returns
    -------
    pandas.Index
        The column labels of ``dataset`` that were kept, in original order.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Read the matrix once as an array instead of per-cell .iloc lookups.
    values = corr_matrix.to_numpy()

    col_corr = set()  # names of the pruned columns
    for i in range(len(names)):
        for j in range(i):
            if values[i, j] >= threshold and names[j] not in col_corr:
                col_corr.add(names[i])
                break  # column i is already pruned; further matches add nothing

    # Columns absent from the correlation matrix are never pruned.
    return dataset.columns[~dataset.columns.isin(list(col_corr))]

In [None]:
# Load data and process
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Columns removed from both files before anything else happens.
unused_cols = ['h_cs2smos', 'id_buoy']
# The two regression targets, in the order they appear in train_y.
target_cols = ["u_buoy", "v_buoy"]

train_df = pd.read_csv("DRIFT_DATA_TRAIN.csv").drop(columns=unused_cols)
test_df = pd.read_csv("DRIFT_DATA_TEST.csv").drop(columns=unused_cols)

# Split the targets off; train_df holds only feature columns afterwards.
train_y = train_df[target_cols].to_numpy()
train_df = train_df.drop(columns=target_cols)

# Keep only the feature columns returned by correlation() at threshold 0.5.
corr_col = correlation(train_df, 0.5)
train_df = train_df[corr_col]

# NOTE(review): the scaler is fit on every row before the split below, so the
# validation rows contribute to the scaling statistics.
train_x = StandardScaler().fit_transform(train_df.to_numpy())

# 70/30 train/validation split with a fixed seed.
train_x, val_x, train_y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Display the feature columns left in train_df, i.e. the inputs that were scaled and fed to the models
train_df.columns

# XGBoost

In [None]:
# Run XGBoost
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# Base learner; MultiOutputRegressor fits one copy of it per target column.
base_model = XGBRegressor(
    booster="gbtree",
    n_estimators=1000,
    learning_rate=0.01,
    verbosity=3,
)
xgb = MultiOutputRegressor(base_model)
xgb.fit(train_x, train_y)

In [None]:
import pickle

# Persist the fitted multi-output model so later cells can reload it.
with open("xgb.pkl", "wb") as model_file:
    pickle.dump(xgb, model_file)

In [None]:
# Validation set prediction and plots
from sklearn.metrics import mean_squared_error as mse

# Reload the model from disk. pickle.load can execute arbitrary code, so this
# path must only ever point at a file written by this notebook.
with open("xgb.pkl", "rb") as r:
    xgb = pickle.load(r)

predictions = xgb.predict(val_x)
# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the value
# is the same as with the arguments swapped.
print(mse(val_y, predictions))

# Only the first 100 validation rows are plotted to keep the curves readable.
axis = range(len(val_y[:100]))

# Column 0 is the U component, column 1 the V component (order of train_y).
pred_u, pred_v = predictions[:100, 0], predictions[:100, 1]
act_u, act_v = val_y[:100, 0], val_y[:100, 1]

fig, axs = plt.subplots(2)
panels = [("U", pred_u, act_u), ("V", pred_v, act_v)]
for ax, (component, predicted, actual) in zip(axs, panels):
    ax.plot(axis, predicted, label="predicted")
    ax.plot(axis, actual, label="actual")
    ax.set_title(f"{component} Velocity, Pred vs. Actual")
    ax.set_ylabel("Velocity")
    ax.legend(loc="best")

# tight_layout only accounts for artists that already exist, so it has to run
# after the titles, labels and legends have been added.
fig.tight_layout()
plt.show()

In [None]:
# Load test data
test_df_final = pd.read_csv("DRIFT_DATA_TEST.csv")
test_df_features = test_df_final[corr_col]

# Standardise the test features with the statistics of the training features.
# train_df still holds the unscaled, column-filtered training frame that
# train_x was derived from, so fitting on it reproduces the scaling the model
# was trained with. Fitting a fresh scaler on the test set itself would place
# the test rows in a different feature space from the one the model learned.
train_feature_scaler = StandardScaler().fit(np.array(train_df))
test_data = train_feature_scaler.transform(np.array(test_df_features))

In [None]:
# Create results
results = xgb.predict(test_data)

# Split the two predicted target columns into plain lists (first column -> u,
# second column -> v); elements stay NumPy scalars.
u = list(results[:, 0])
v = list(results[:, 1])

In [None]:
# Setup the csv
# Re-read the test file and attach the predicted velocity columns to it.
test_df_final = pd.read_csv("DRIFT_DATA_TEST.csv").assign(u_buoy=u, v_buoy=v)
test_df_final.to_csv("results.csv", index=False)

# SVM

In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor, wrapped so each target gets its own model.
svr_base = SVR(kernel="rbf")
clf = MultiOutputRegressor(svr_base)
clf.fit(train_x, train_y)

# Validation error of the SVM; displayed as the cell output.
pred_svm = clf.predict(val_x)
mse(pred_svm, val_y)