In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

# Load the data
df = pd.read_csv('laptops.csv')

# Normalize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select only required columns
columns_to_use = ['ram', 'storage', 'screen', 'final_price']
df_filtered = df[columns_to_use]

# Question 1: Check for missing values
print("\nMissing values:")
print(df_filtered.isnull().sum())

# Question 2: Find median of 'ram'
median_ram = df_filtered['ram'].median()
print(f"\nMedian RAM: {median_ram}")

# Prepare dataset for splitting
np.random.seed(42)
df_shuffled = df_filtered.sample(frac=1, random_state=42)

# Split the data
train_df, remaining = train_test_split(df_shuffled, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(remaining, test_size=0.5, random_state=42)

# Function to prepare X and y
def prepare_X_y(df, fill_na_with=0):
    X = df[['ram', 'storage', 'screen']].copy()
    # If fill_na_with is 'mean', we use the training mean
    if isinstance(fill_na_with, str) and fill_na_with == 'mean':
        fill_value = train_df['screen'].mean()
    else:
        fill_value = fill_na_with
    X['screen'] = X['screen'].fillna(fill_value)
    y = df.final_price.values
    return X, y

# Question 3: Compare filling NA with 0 vs mean
def train_linear_reg(X, y, r=0):
    model = Ridge(alpha=r, solver='sag', random_state=42)
    model.fit(X, y)
    return model

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Train and evaluate with 0
X_train_zero, y_train = prepare_X_y(train_df, fill_na_with=0)
X_val_zero, y_val = prepare_X_y(val_df, fill_na_with=0)
model_zero = train_linear_reg(X_train_zero, y_train)
y_pred_zero = model_zero.predict(X_val_zero)
rmse_zero = round(rmse(y_val, y_pred_zero), 2)

# Train and evaluate with mean
X_train_mean, y_train = prepare_X_y(train_df, fill_na_with='mean')
X_val_mean, y_val = prepare_X_y(val_df, fill_na_with='mean')
model_mean = train_linear_reg(X_train_mean, y_train)
y_pred_mean = model_mean.predict(X_val_mean)
rmse_mean = round(rmse(y_val, y_pred_mean), 2)

print(f"\nRMSE with 0: {rmse_zero}")
print(f"RMSE with mean: {rmse_mean}")

# Question 4: Try different regularization values
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_values:
    model = train_linear_reg(X_train_zero, y_train, r=r)
    y_pred = model.predict(X_val_zero)
    rmse_scores[r] = round(rmse(y_val, y_pred), 2)

best_r = min(rmse_scores, key=rmse_scores.get)
print(f"\nBest r value: {best_r} with RMSE: {rmse_scores[best_r]}")

# Question 5: Try different seeds
seeds = list(range(10))
seed_scores = []

for seed in seeds:
    np.random.seed(seed)
    train_df_seed, remaining_seed = train_test_split(df_shuffled, test_size=0.4, random_state=seed)
    val_df_seed, test_df_seed = train_test_split(remaining_seed, test_size=0.5, random_state=seed)
    
    X_train_seed, y_train_seed = prepare_X_y(train_df_seed, fill_na_with=0)
    X_val_seed, y_val_seed = prepare_X_y(val_df_seed, fill_na_with=0)
    
    model_seed = train_linear_reg(X_train_seed, y_train_seed)
    y_pred_seed = model_seed.predict(X_val_seed)
    seed_scores.append(rmse(y_val_seed, y_pred_seed))

std_score = round(np.std(seed_scores), 3)
print(f"\nStandard deviation of RMSE scores: {std_score}")

# Question 6: Final model evaluation
np.random.seed(9)
train_val_df, test_df_final = train_test_split(df_shuffled, test_size=0.2, random_state=9)

X_train_val, y_train_val = prepare_X_y(train_val_df, fill_na_with=0)
X_test, y_test = prepare_X_y(test_df_final, fill_na_with=0)

final_model = train_linear_reg(X_train_val, y_train_val, r=0.001)
y_pred_final = final_model.predict(X_test)
final_rmse = round(rmse(y_test, y_pred_final), 2)

print(f"\nFinal test RMSE: {final_rmse}")


Missing values:
ram            0
storage        0
screen         4
final_price    0
dtype: int64

Median RAM: 16.0

RMSE with 0: 674.76
RMSE with mean: 674.67





Best r value: 100 with RMSE: 674.74





Standard deviation of RMSE scores: 28.129

Final test RMSE: 552.65


