In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load dataset
df = pd.read_csv("laptopPrice.csv")
df.head()

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0


# Data Exploration
Check the structure of our dataset, look for nulls, and understand the feature types.

In [2]:
df.info()
df.describe()
df.nunique()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              823 non-null    object
 1   processor_brand    823 non-null    object
 2   processor_name     823 non-null    object
 3   processor_gnrtn    823 non-null    object
 4   ram_gb             823 non-null    object
 5   ram_type           823 non-null    object
 6   ssd                823 non-null    object
 7   hdd                823 non-null    object
 8   os                 823 non-null    object
 9   os_bit             823 non-null    object
 10  graphic_card_gb    823 non-null    object
 11  weight             823 non-null    object
 12  warranty           823 non-null    object
 13  Touchscreen        823 non-null    object
 14  msoffice           823 non-null    object
 15  Price              823 non-null    int64 
 16  rating             823 non-null    object
 1

Unnamed: 0,0
brand,0
processor_brand,0
processor_name,0
processor_gnrtn,0
ram_gb,0
ram_type,0
ssd,0
hdd,0
os,0
os_bit,0


In [3]:
# Data Cleaning + Feature Engineering

# Clean RAM, SSD, HDD, GPU, etc.
def clean_memory(val):
    return int(val.split()[0])

df['ram_gb'] = df['ram_gb'].apply(clean_memory)
df['ssd'] = df['ssd'].apply(clean_memory)
df['hdd'] = df['hdd'].apply(clean_memory)
df['graphic_card_gb'] = df['graphic_card_gb'].apply(clean_memory)

# Dropping non-useful or subjective columns
df.drop(columns=['weight', 'warranty', 'rating'], inplace=True)

# Encode categorical values
le = LabelEncoder()
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [4]:
# Define input and target
X = df.drop(columns=["Price"])
y = df["Price"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Model Training

# Random Forest
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Gradient Boosting
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)

In [6]:
# Model Evaluation

def evaluate_model(model):
    y_pred = model.predict(X_test)
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

print("Random Forest:")
evaluate_model(rf_model)

print("\nGradient Boosting:")
evaluate_model(gb_model)

Random Forest:
MAE: 12199.762132683984
RMSE: 24813.383437472996

Gradient Boosting:
MAE: 13356.02371938398
RMSE: 25726.55367473458


In [7]:
# Cross-Validation

rf_scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_root_mean_squared_error')
gb_scores = cross_val_score(gb_model, X, y, cv=5, scoring='neg_root_mean_squared_error')

print("RF CV RMSE:", -rf_scores.mean())
print("GB CV RMSE:", -gb_scores.mean())

RF CV RMSE: 29960.931813912273
GB CV RMSE: 27339.625545961222
