In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression as SKLinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import numpy as np
X = np.genfromtxt('stock_prediction_data.csv', delimiter=',')
y = np.genfromtxt('stock_price.csv', delimiter=',')
y = y.reshape(-1, 1)

X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.2)
X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5)

# use standard scaler to normalize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

def linear_feature_map(X):
    return np.hstack((np.ones((X.shape[0], 1)), X))


class MyLinearRegression:
    def __init__(self, alpha=0.001, max_iter=10_000):
        self.alpha = alpha
        self.max_iter = max_iter
    
    def fit_gd(self, X, y):
        """
        Fit the model using gradient descent

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data
        y : array-like, shape (n_samples,)
            Target values
        """

        m,n = X.shape
        thetas = np.random.randn(n,1)

        for _ in range(self.max_iter):
            error = X.dot(thetas) - y
            d_loss = X.T.dot(error) / m

            if np.all(np.abs(d_loss) < 1e-5) or np.isnan(d_loss).any(): # check for convergence
                break
            
            if np.isinf(d_loss).any(): # check for gradient explosion
                raise ValueError("Gradient exploded")

            thetas -= self.alpha * d_loss
        
        self._coef = thetas

    def fit_closed_form(self, X, y):
        """
        Fit the model using closed form solution

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data
        y : array-like, shape (n_samples,)
            Target values
        """

        self._coef = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
    
    def predict(self, X):
        """
        Predict using the linear model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Samples

        Returns
        -------
        y : array-like, shape (n_samples,)
            Returns predicted values
        """

        return X.dot(self._coef)

def mse(y, y_hat):
    return np.mean((y - y_hat)**2)


X_poly_train = PolynomialFeatures(degree=2, include_bias=True).fit_transform(X_train)
X_poly_val = PolynomialFeatures(degree=2, include_bias=True).fit_transform(X_val)
poly_gd_model = MyLinearRegression(alpha=0.01, max_iter=100_000)
poly_gd_model.fit_gd(X_poly_train, y_train)
pred_train = poly_gd_model.predict(X_poly_train)
print(f"My GD Train MSE: {mse(y_train, pred_train)}")
pred_val = poly_gd_model.predict(X_poly_val)
print(f"My GD Validation MSE: {mse(y_val, pred_val)}")

sk_poly_lr = SKLinearRegression()
sk_poly_lr.fit(X_poly_train,y_train.flatten()) # y is 2D, but scikit-learn expects 1D
pred_train = sk_poly_lr.predict(X_poly_train).reshape(-1,1)
print(f"SKLearn Train MSE: {mse(y_train, pred_train)}")
pred_val = sk_poly_lr.predict(X_poly_val).reshape(-1,1)
print(f"SKLearn Validation MSE: {mse(y_val, pred_val)}")

# 0.01 lambda value? 
lasso_poly_lr = Lasso(0.01).fit(X_poly_train, y_train)
y_hat = lasso_poly_lr.predict(X_poly_train).reshape(-1,1)
print(f"Lasso Train MSE: {mse(y_train, pred_train)}")
pred_val = lasso_poly_lr.predict(X_poly_val).reshape(-1,1)
print(f"Lasso Validation MSE: {mse(y_val, pred_val)}")

My GD Train MSE: 0.03309418215379092
My GD Validation MSE: 0.06285105215117612
SKLearn Train MSE: 0.03309417984558429
SKLearn Validation MSE: 0.0628384989736995
Lasso Train MSE: 0.03309417984558429
Lasso Validation MSE: 0.04964376491872274


FileNotFoundError: [Errno 2] No such file or directory: 'problem_1_data.csv'