In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('laptops.csv')
print("{} : \n {} \n".format('total (rows)', len(df)))
display(df.head(2))

total (rows) : 
 2160 



Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0


In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
laptops = df.copy()

In [4]:
# Missing 157 values in total_bedrooms
laptops.isnull().sum()

laptop             0
status             0
brand              0
model              0
cpu                0
ram                0
storage            0
storage_type      42
gpu             1371
screen             4
touch              0
final_price        0
dtype: int64

In [5]:
print('ram median = ' , laptops['ram'].median())

ram median =  16.0


In [6]:

# Distributions of data
n = len(laptops)
n_train = int(0.6*n)
n_val = int(0.2*n)
#n_test = int(0.2*n)

# Shuffle data
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)
laptops_shuffled = laptops.iloc[idx] 

# Split data
X_train = laptops_shuffled.iloc[:n_train].copy()
X_val = laptops_shuffled.iloc[n_train:n_train + n_val].copy()
X_test = laptops_shuffled.iloc[n_train+n_val:].copy()

# Apply log transformation
Y_train = np.log1p( X_train['final_price'] ).values
Y_val = np.log1p( X_val['final_price'] ).values
Y_test = np.log1p( X_test['final_price']).values 

# To avoid accidentally using the target variable
del X_train['final_price']
del X_val['final_price']
del X_test['final_price']


In [7]:
def encode_categorical(data):
    label_encoders = {}
    for column in data.columns:
        if data[column].dtype == 'object':
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
            label_encoders[column] = le
    return data, label_encoders

In [8]:
# filling missing values with zeros and mean
def handle_nan(df, feature, fillnan_with):
    df_copy = df.copy()
    if fillnan_with == 'mean':
        df_copy[feature].fillna(value = df_copy[feature].mean())
    elif fillnan_with == 'zero':
        df_copy[feature].fillna(value = 0)
        
    return df_copy.values

# Root mean squared error
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    rmse = np.sqrt(mse)
    return rmse

In [9]:
def linear_regression(X, y):
    X = np.array(X, dtype=float)  # Ensure X is numeric
    y = np.array(y, dtype=float)  # Ensure y is numeric
    X_0 = np.ones(X.shape[0])
    X = np.column_stack([X_0, X])
    XTX = X.T.dot(X)
    XTX_inverse = np.linalg.inv(XTX)
    w = XTX_inverse.dot(X.T).dot(y)
    return X.dot(w)

In [10]:
# Encode categorical features
X_train, label_encoders_train = encode_categorical(X_train)
X_val, label_encoders_val = encode_categorical(X_val)

# filling missing values
X_train_zeros = handle_nan(X_train, 'screen', 'zero')
X_val_zeros = handle_nan(X_val, 'screen', 'zero')

X_train_mean = handle_nan(X_train, 'screen', 'mean')
X_val_mean = handle_nan(X_val, 'screen', 'mean')

# For training set
Y_pred = linear_regression(X_train_zeros, Y_train)
rmse_train_zeros = rmse(Y_train, Y_pred)
print('RMSE for train set with zeros: ',round(rmse_train_zeros, 2) )

Y_pred = linear_regression(X_train_mean, Y_train)
rmse_train_mean= rmse(Y_train, Y_pred)
print('RMSE for train set with mean: ',round(rmse_train_mean, 2) )

print('\n')
# For validation set
Y_pred = linear_regression(X_val_zeros, Y_val)
rmse_val_zeros = rmse(Y_val, Y_pred)
print('RMSE for validation set with zeros: ', round(rmse_train_mean, 2) )

Y_pred = linear_regression(X_val_mean, Y_val)
rmse_val_mean = rmse(Y_val, Y_pred)
print('RMSE for validation set with mean: ',round(rmse_train_mean, 2) )

RMSE for train set with zeros:  nan
RMSE for train set with mean:  nan


RMSE for validation set with zeros:  nan
RMSE for validation set with mean:  nan


In [11]:
def ridge_regression(X, y, r = 0.0):
    # adding ones in the dataset X
    X_0 = np.ones(X.shape[0])
    X = np.column_stack([X_0, X])

    XTX = X.T.dot(X)
    # add regularization term rI
    I = np.eye(XTX.shape[0])
    XTX_inverse = np.linalg.inv(XTX + r*I)
    w = XTX_inverse.dot(X.T).dot(y)

    Y_pred = X.dot(w)
    return Y_pred, w

In [12]:
# filling missing values with zeros
X_train = handle_nan(X_train, 'screen', 'zero')
X_val = handle_nan(X_val, 'screen', 'zero')


for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    Y_pred, _ = ridge_regression(X_val, Y_val, r=r)
    rmse_val = round( rmse(Y_val, Y_pred), 5)
    print('%06s %0.5f' % (r, rmse_val))


     0 nan
 1e-06 nan
0.0001 nan
 0.001 nan
  0.01 nan
   0.1 nan
     1 nan
     5 nan
    10 nan


In [13]:
def split_data(df, target_column, train_size = 0.6, 
               val_size = 0.2, seed = 42, log_transform = True):
    
    if train_size + val_size >= 1.0:
        raise ValueError("Value larger then 1")
    
    n = len(df)
    n_train = int(train_size*n)
    n_val = int(val_size*n)
    
    # Shuffle data
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    # Split data
    X_train = df_shuffled.iloc[:n_train].copy()
    X_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    X_test = df_shuffled.iloc[n_train + n_val:].copy()

    if log_transform:
        Y_train = np.log1p(X_train[target_column]).values
        Y_val = np.log1p(X_val[target_column]).values
        Y_test = np.log1p(X_test[target_column]).values
    else:
        Y_train = X_train[target_column].values
        Y_val = X_val[target_column].values
        Y_test = X_test[target_column].values

    del X_train[target_column]
    del X_val[target_column]
    del X_test[target_column]

    # Fill missing values with zeros
    X_train = handle_nan(X_train, 'screen', 'zero')
    X_val = handle_nan(X_val, 'screen', 'zero')
    X_test = handle_nan(X_test, 'screen', 'zero')

    X = {'train':X_train, 'val':X_val, 'test':X_test}
    Y = {'train': Y_train,'val':Y_val,'test': Y_test}

    return X,Y

In [14]:
display(laptops.head(2))

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

errors = []
for seed in seeds:
    X,Y = split_data(df = laptops, target_column= 'final_price', seed = seed)
    Y_pred = linear_regression(X['val'], Y['val'])
    error = rmse(Y['val'], Y_pred)

    print('%10s' %seed, round( error, 3) )
    errors.append( error )   

print('Std =', round(np.std(errors), 3))

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0


ValueError: could not convert string to float: 'Lenovo ThinkPad T15p Gen 2 Intel Core i7-11800H/16GB/512GB SSD/GTX 1650/15.6"'

In [47]:
# Split data and fill missing values with zeros
X,Y = split_data(df = laptops, target_column= 'final_price', seed = 9)

# Combine train and validation
X_train = np.concatenate([ X['train'], X['val']])
Y_train = np.concatenate([ Y['train'], Y['val']])

# Train model on train and validation and use in test set
_, w = ridge_regression(X_train, Y_train, r = 0.001)

# Regression with the trained weights
Y_pred = w[0] + X['test'].dot(w[1:])


print('RMSE on test set = ', round( rmse(Y['test'], Y_pred), 2))

TypeError: can't multiply sequence by non-int of type 'str'