In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Seed NumPy's global RNG once, up front, so any later call that draws from it
# (e.g. scikit-learn utilities left at random_state=None) is repeatable on a
# fresh Restart & Run All. Kept as a named constant so it is easy to find/tune.
SEED = 2049
np.random.seed(SEED)

## Load data

In [None]:
# Read the three arrays in one pass, in the same order as before:
# training features, test matrix (ids still attached), training targets.
X, X_test_with_ids, y = (
    np.load(path)
    for path in (
        "data/X_processed.npy",
        "data/X_processed_test.npy",
        "data/y.npy",
    )
)

In [None]:
# Column 0 of the test matrix is kept as the ids; every remaining column is
# used as the feature matrix for prediction.
ids = X_test_with_ids[:, 0]
X_test = X_test_with_ids[:, 1:]

## Split

In [None]:
# Hold out 20% of the rows for validation. The seed is passed explicitly so
# that re-running this cell on its own (or out of order) yields the same split
# instead of depending on how far the global NumPy RNG has advanced. The value
# matches the one given to np.random.seed at the top of the notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=2049
)

## Fit linear regression

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator
# itself, so construction and fitting are chained.
clf = LinearRegression().fit(X_train, y_train)

# Predict for every split: train/val predictions feed the RMSE report,
# test predictions feed the exported CSV.
preds_train, preds_val, preds_test = (
    clf.predict(features) for features in (X_train, X_val, X_test)
)

## Performance

In [None]:
# Report root-mean-squared error (square root of the MSE) for the train and
# validation splits, one line per split.
print("\n".join(
    f"{label} RMSE: {mean_squared_error(actual, predicted)**0.5}"
    for label, actual, predicted in (
        ("Train", y_train, preds_train),
        ("Val", y_val, preds_val),
    )
))

In [None]:
# Pair each test id with its prediction as a two-column array:
# column 0 = id, column 1 = predicted value.
# Assumes preds_test is 1-D like ids — TODO confirm if y is ever multi-output.
predictions = np.stack((ids, preds_test), axis=1)

## Saving results

In [None]:
# Label the two columns and cast the ids to 32-bit integers (they arrive in the
# same array as the predictions), then write the CSV without the index column.
df = pd.DataFrame(predictions, columns=['Id', 'y']).astype({'Id': 'int32'})
df.to_csv('data/test_preds.csv', index=False)