In [None]:
import kagglehub
import os
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Fetch the "camnugent/sandp500" dataset through kagglehub and build the path
# to the AAL price file inside it. os.path.join keeps the path portable
# instead of relying on hard-coded "/" separators.
dataset_dir = kagglehub.dataset_download("camnugent/sandp500")
path = os.path.join(
    dataset_dir, "individual_stocks_5yr", "individual_stocks_5yr", "AAL_data.csv"
)
df = pd.read_csv(path)

# Structural overview of the raw frame.
print(df.shape)
# DataFrame.info() writes its report directly and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
df.info()
print(df.head())
# isnull() is an alias of isna(); a single missing-value count is sufficient.
print(df.isna().sum())
print(df.describe(include="all"))

In [None]:
target = 'close'

# Encode the date as Unix epoch seconds so it can serve as a numeric regressor.
# The conversion is done once, on a new frame derived from df, for two reasons:
#   * assigning a column into the frames returned by train_test_split can
#     trigger pandas' SettingWithCopyWarning;
#   * subtracting the epoch and floor-dividing by one second yields seconds
#     regardless of the datetime column's internal resolution, whereas
#     astype('int64') // 10**9 is only correct for nanosecond resolution.
epoch_seconds = (
    pd.to_datetime(df['date']) - pd.Timestamp("1970-01-01")
) // pd.Timedelta(seconds=1)
df_encoded = df.assign(date=epoch_seconds)

# shuffle=False keeps row order, so the last 20% of rows form the test set.
# random_state is omitted because it only affects shuffling.
train_data, test_data = train_test_split(df_encoded, test_size=0.2, shuffle=False)

In [None]:
# One predictor column (kept two-dimensional for scikit-learn) and the
# response column named by `target`.
feature_cols = ['date']

X_train, y_train = train_data[feature_cols], train_data[target]
X_test, y_test = test_data[feature_cols], test_data[target]

# Fit a linear regression on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the response for the test rows.
y_pred = model.predict(X_test)

# Test-set error metrics: MAE, and RMSE as the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Square Error (RMSE): {rmse:.4f}")

In [None]:
# Plot the test-set closing prices together with the fitted regression line.
x_values = X_test['date'].values
y_values = y_test.values

# Sort by the predictor so the line is drawn from left to right.
order = np.argsort(x_values)

# The 'date' feature holds Unix epoch seconds; convert it back to datetimes for
# display only, so the x-axis shows calendar dates instead of raw integers.
x_dates = pd.to_datetime(x_values, unit="s").to_numpy()

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_dates, y_values, color="steelblue", alpha=0.6, label="Actual data")
ax.plot(x_dates[order], y_pred[order], color="crimson", linewidth=2, label="Regression line")
ax.set_title("Simple Linear Regression (stock data): date vs close")
ax.set_xlabel("date")
ax.set_ylabel("close")
ax.legend()
ax.grid(True, alpha=0.3)
fig.autofmt_xdate()  # rotate date tick labels so they do not overlap
plt.show()