# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Pipeline: fit a linear regression that predicts the next-day return of one
# column of the price file from rolling statistics of every column, evaluate
# it on a held-out chronological tail, plot the result and save it to CSV.

# load data (first CSV column is used as the index and parsed as dates)
data = pd.read_csv('/content/sp500_sector_prices.csv', index_col = 0, parse_dates = True)

# calculate daily returns
returns = data.pct_change().dropna()

# define rolling features
rolling_window = 5
feature_columns = {}

for ticker in returns.columns:
    ticker_returns = returns[ticker]
    # Compute the rolling mean once; it is both a feature in its own right
    # and the baseline for the momentum feature.
    rolling_mean = ticker_returns.rolling(rolling_window).mean()
    feature_columns[f'{ticker}_mean'] = rolling_mean
    feature_columns[f'{ticker}_vol'] = ticker_returns.rolling(rolling_window).std()
    feature_columns[f'{ticker}_momentum'] = ticker_returns - rolling_mean

# Build the frame in one step instead of inserting columns one at a time.
feature_df = pd.DataFrame(feature_columns, index = returns.index)

# drop NaN rows from rolling (the first rolling_window - 1 rows have no full window)
feature_df = feature_df.dropna()

# select target sector
target_ticker = 'XLK' #example (technology sector)
# shift(-1) pairs each date with the NEXT row's return; the last row has no
# successor and is dropped.
target = returns[target_ticker].shift(-1).dropna()

# align features and target
# feature_df lost its leading rows (rolling warm-up) and target lost its last
# row, so keep only the dates present in both. Indexing feature_df with the
# full target index would raise KeyError on the missing leading dates.
X = feature_df.loc[feature_df.index.isin(target.index)]
y = target.loc[X.index]

# train/test split
# Chronological split: first 80% of rows for training, remaining 20% held out.
split = int(0.8 * len(X))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# train model
model = LinearRegression()
model.fit(X_train, y_train)

# make predictions on the held-out rows only
y_pred = model.predict(X_test)

# evaluate
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"R^2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")

# the plots for actual VS predicted
plt.figure(figsize = (12, 6))
plt.plot(y_test.index, y_test.values, label = 'Actual', alpha = 0.7)
plt.plot(y_test.index, y_pred, label = 'Predicted', alpha = 0.7)
plt.title(f"{target_ticker} - Actual VS Predicted Next-Day Returns")
plt.legend()
plt.show()

# save the results
results_df = pd.DataFrame({'Date': y_test.index, 'Actual': y_test.values, 'Predicted': y_pred})
results_df.to_csv('/content/linear_regression_results.csv', index = False)