In [1]:
import pandas as pd

# Read the laptops CSV straight from its raw GitHub URL
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv')

# Standardise the headers: lower-case, with spaces swapped for underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# Restrict the frame to the columns used in the rest of the notebook
selected_columns = ['ram', 'storage', 'screen', 'final_price']
df_filtered = df[selected_columns]


In [2]:
# Count the NaN entries in each selected column
df_filtered.isna().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [3]:
# Median of the 'ram' column
ram_values = df_filtered['ram']
median_ram = ram_values.median()
print(median_ram)


16.0


In [4]:
from sklearn.model_selection import train_test_split

# Two-stage split into 60% train / 20% validation / 20% test.
# train_test_split shuffles by default; the fixed random_state keeps the
# partition reproducible across runs.
df_train, df_temp = train_test_split(
    df_filtered,
    test_size=0.4,
    random_state=42,
)
# Split the 40% hold-out 50/50 into validation and test
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
)


In [10]:
# Strategy 1: replace every missing value with 0
df_train_zero = df_train.fillna(0)
df_val_zero = df_val.fillna(0)

# Strategy 2: replace missing 'screen' values with the mean of 'screen'.
# The mean is computed on the training split only, so nothing from the
# validation split leaks into the imputation value.
mean_screen = df_train['screen'].mean()
# Target the 'screen' column explicitly: a bare fillna(mean_screen) would also
# write the screen mean into missing cells of any other column, including the
# 'final_price' target.
df_train_mean = df_train.fillna({'screen': mean_screen})
df_val_mean = df_val.fillna({'screen': mean_screen})


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

def _fit_and_score(train_frame, val_frame):
    """Fit a LinearRegression on ``train_frame`` and score it on ``val_frame``.

    Both frames must contain the 'ram', 'storage' and 'screen' feature
    columns and the 'final_price' target column.

    Returns a tuple ``(fitted_model, validation_predictions, rmse)`` where
    ``rmse`` is the validation RMSE rounded to two decimals.
    """
    feature_cols = ['ram', 'storage', 'screen']
    target_col = 'final_price'

    estimator = LinearRegression()
    estimator.fit(train_frame[feature_cols], train_frame[target_col])
    predictions = estimator.predict(val_frame[feature_cols])
    mse = mean_squared_error(val_frame[target_col], predictions)
    return estimator, predictions, round(np.sqrt(mse), 2)


# Baseline fitted on the zero-imputed frames
model_zero, y_pred_zero, rmse_zero = _fit_and_score(df_train_zero, df_val_zero)

# Same model fitted on the mean-imputed frames
model_mean, y_pred_mean, rmse_mean = _fit_and_score(df_train_mean, df_val_mean)

print(f"RMSE with 0 filling: {rmse_zero}, RMSE with mean filling: {rmse_mean}")


RMSE with 0 filling: 622.51, RMSE with mean filling: 622.63


In [12]:
from sklearn.linear_model import Ridge

# Regularisation strengths to compare on the zero-imputed data.
# NOTE(review): scikit-learn's Ridge documentation advises using
# LinearRegression rather than alpha=0 -- consider that for the first entry.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = []

# The inputs are identical for every alpha, so select them once up front
feature_cols = ['ram', 'storage', 'screen']
X_train_zero = df_train_zero[feature_cols]
y_train_zero = df_train_zero['final_price']
X_val_zero = df_val_zero[feature_cols]
y_val_zero = df_val_zero['final_price']

for r in r_values:
    ridge_model = Ridge(alpha=r).fit(X_train_zero, y_train_zero)
    y_pred_ridge = ridge_model.predict(X_val_zero)
    mse_ridge = mean_squared_error(y_val_zero, y_pred_ridge)
    rmse_ridge = round(np.sqrt(mse_ridge), 2)
    # Collect (alpha, validation RMSE) pairs in sweep order
    rmse_scores.append((r, rmse_ridge))

print(rmse_scores)


[(0, 622.51), (0.01, 622.51), (0.1, 622.51), (1, 622.51), (5, 622.51), (10, 622.5), (100, 622.35)]


In [13]:
# Measure how much the validation RMSE moves when only the split seed changes
seeds = list(range(10))
rmse_list = []
feature_cols = ['ram', 'storage', 'screen']

# NOTE: the loop rebinds df_train / df_temp / df_val / df_test (and the *_zero
# frames), so once this cell finishes they hold the split for the last seed (9).
for seed in seeds:
    df_train, df_temp = train_test_split(df_filtered, test_size=0.4, random_state=seed)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=seed)
    df_train_zero, df_val_zero = df_train.fillna(0), df_val.fillna(0)

    model = LinearRegression().fit(df_train_zero[feature_cols], df_train_zero['final_price'])
    y_pred = model.predict(df_val_zero[feature_cols])
    # Unrounded on purpose: rounding happens once, on the final spread
    rmse = np.sqrt(mean_squared_error(df_val_zero['final_price'], y_pred))
    rmse_list.append(rmse)

std_rmse = round(np.std(rmse_list), 3)
print(f"Standard deviation of RMSE: {std_rmse}")


Standard deviation of RMSE: 35.825


In [16]:
# Seed NumPy's global RNG as before; the explicit random_state arguments below
# are what actually pin the split and the model.
np.random.seed(9)

# Rebuild the seed-9 split explicitly so this cell does not depend on whatever
# df_train / df_val / df_test were last left holding by an earlier cell.
df_train, df_temp = train_test_split(df_filtered, test_size=0.4, random_state=9)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=9)

feature_cols = ['ram', 'storage', 'screen']

# Final training set = train + validation, with missing values set to 0
df_train_full = pd.concat([df_train, df_val]).fillna(0)
# Impute the test split the same way as the training data; without this a NaN
# in the test features makes predict() raise instead of returning a score.
df_test_filled = df_test.fillna(0)

ridge_final = Ridge(alpha=0.001, random_state=9)
ridge_final.fit(df_train_full[feature_cols], df_train_full['final_price'])
y_test_pred = ridge_final.predict(df_test_filled[feature_cols])

rmse_test = round(np.sqrt(mean_squared_error(df_test_filled['final_price'], y_test_pred)), 2)
print(f"RMSE on test set: {rmse_test}")


RMSE on test set: 602.43
