### Random seeds 0-9: standard deviation of the validation RMSE

In [8]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

In [9]:
# Load laptops.csv (path is relative to the current working directory) into a DataFrame
df = pd.read_csv('laptops.csv')

In [10]:
# Normalise column names: lower-case them and replace spaces with underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [11]:
# Keep only the three feature columns and the target. Take an explicit copy so
# that later column assignments act on an independent frame instead of a
# selection derived from the full table (avoids SettingWithCopyWarning).
df = df[['ram','storage','screen','final_price']].copy()
df

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.00
1,8,256,15.6,299.00
2,8,256,15.6,789.00
3,16,1000,15.6,1199.00
4,16,512,15.6,669.01
...,...,...,...,...
2155,16,1000,17.3,2699.99
2156,16,1000,17.3,2899.99
2157,32,1000,17.3,3399.99
2158,16,1000,13.4,1899.99


In [12]:
# Replace missing values in the 'screen' column with zero
df['screen'] = df['screen'].fillna(0)

### Script split into chunks

In [13]:
def train_linear_regression(X, y):
    """Fit a linear regression with an intercept using the normal equations.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample (``X.shape[0]`` samples).
    y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w_0, w)`` where ``w_0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the Gram matrix ``X^T X`` (with the bias column) is singular.
    """
    # Prepend a column of ones so the first weight acts as the intercept
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly instead of forming the explicit
    # inverse: same solution, but numerically more stable and cheaper.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : numpy.ndarray
        Observed target values.
    y_pred : numpy.ndarray
        Predicted values, element-wise comparable with ``y``.

    Returns
    -------
    float
        Square root of the mean of the squared differences ``y - y_pred``.
    """
    squared_residuals = (y - y_pred) ** 2
    return np.sqrt(squared_residuals.mean())


In [17]:
# Validation RMSE collected for each seed
rmse_results = []

# Seeds to evaluate (0 through 9)
seeds = range(10)

# The split sizes depend only on the number of rows, so compute them once:
# 20% validation, 20% test, and the remainder for training.
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

for seed in seeds:
    # Seed NumPy's global RNG so the shuffle for this seed is reproducible
    np.random.seed(seed)

    # Shuffle the row positions and reorder the frame accordingly
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffler = df.iloc[idx]

    # Slice the shuffled frame into train / validation / test partitions
    val_end = n_train + n_val
    df_train = df_shuffler.iloc[:n_train].copy()
    df_val = df_shuffler.iloc[n_train:val_end].copy()
    df_test = df_shuffler.iloc[val_end:].copy()

    # Pull the target out of each partition; pop() also removes the column,
    # leaving only the feature columns behind
    y_train = df_train.pop('final_price').values
    y_val = df_val.pop('final_price').values
    y_test = df_test.pop('final_price').values

    # Feature matrices as plain NumPy arrays
    X_train, X_val, X_test = df_train.values, df_val.values, df_test.values

    # Fit on the training partition and score on the validation partition
    w_0, w = train_linear_regression(X_train, y_train)
    y_pred_val = w_0 + X_val.dot(w)
    val_rmse = rmse(y_val, y_pred_val)

    # Record and report the result for this seed
    rmse_results.append(val_rmse)
    print(f"Seed {seed}, Validation RMSE: {val_rmse}")

# Spread of the validation RMSE across seeds (np.std with its default settings)
print("Std RMSE of all seeds:", round(np.std(rmse_results),3))


Seed 0, Validation RMSE: 565.4520868770974
Seed 1, Validation RMSE: 636.7985423056696
Seed 2, Validation RMSE: 588.9558697908011
Seed 3, Validation RMSE: 597.8148920012579
Seed 4, Validation RMSE: 571.9627915111049
Seed 5, Validation RMSE: 573.2383256618974
Seed 6, Validation RMSE: 647.3438328407465
Seed 7, Validation RMSE: 550.4398184485742
Seed 8, Validation RMSE: 587.3335036169904
Seed 9, Validation RMSE: 576.101792943304
Std RMSE of all seeds: 29.176
