In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier   
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import entropy

from tqdm import tqdm

In [2]:
computer_prices = pd.read_csv("../data/computer_prices.csv")

In [3]:
computer_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   100000 non-null  int64  
 1   device_type          100000 non-null  object 
 2   brand                100000 non-null  object 
 3   model                100000 non-null  object 
 4   release_year         100000 non-null  int64  
 5   os                   100000 non-null  object 
 6   form_factor          100000 non-null  object 
 7   cpu_brand            100000 non-null  object 
 8   cpu_model            100000 non-null  object 
 9   cpu_tier             100000 non-null  int64  
 10  cpu_cores            100000 non-null  int64  
 11  cpu_threads          100000 non-null  int64  
 12  cpu_base_ghz         100000 non-null  float64
 13  cpu_boost_ghz        100000 non-null  float64
 14  gpu_brand            100000 non-null  object 
 15  gpu_model         

In [4]:
computer_prices[['resolution_width', 'resolution_height']] = computer_prices['resolution'].str.split('x', expand=True).astype(int)

In [5]:
computer_prices['aspect_ratio'] = np.round(computer_prices['resolution_width'] / computer_prices['resolution_height'], 2)

In [6]:
computer_prices.drop('resolution', axis=1, inplace=True)

In [7]:
computer_prices.columns

Index(['ID', 'device_type', 'brand', 'model', 'release_year', 'os',
       'form_factor', 'cpu_brand', 'cpu_model', 'cpu_tier', 'cpu_cores',
       'cpu_threads', 'cpu_base_ghz', 'cpu_boost_ghz', 'gpu_brand',
       'gpu_model', 'gpu_tier', 'vram_gb', 'ram_gb', 'storage_type',
       'storage_gb', 'storage_drive_count', 'display_type', 'display_size_in',
       'refresh_hz', 'battery_wh', 'charger_watts', 'psu_watts', 'wifi',
       'bluetooth', 'weight_kg', 'warranty_months', 'price',
       'resolution_width', 'resolution_height', 'aspect_ratio'],
      dtype='object')

In [8]:
computer_prices.duplicated().sum()

np.int64(0)

In [9]:
for col in computer_prices.select_dtypes(include=['object']).columns:
    print(f"{col}: {computer_prices[col].nunique()}")

device_type: 2
brand: 10
model: 99036
os: 4
form_factor: 10
cpu_brand: 3
cpu_model: 26971
gpu_brand: 4
gpu_model: 49
storage_type: 4
display_type: 6
wifi: 4


In [10]:
def extract_cpu_type(cpu_model):
    parts = cpu_model.replace("-", " ").split()

    if parts[0] in ["AMD", "Intel"]:
        parts = parts[:-1]
        
    parts = parts[1:]

    return " ".join(parts).strip()

In [11]:
computer_prices['cpu_type'] = computer_prices['cpu_model'].apply(extract_cpu_type)

In [12]:
computer_prices['cpu_type'].unique()

array(['i5', 'i7', 'Ryzen 5', 'Ryzen 7', 'M2', 'M2 Pro', 'i9', 'Ryzen 3',
       'M3 Pro', 'i3', 'M1 Pro', 'M1 Max', 'Ryzen 9', 'M3', 'M1',
       'M2 Max', 'M3 Max'], dtype=object)

In [13]:
computer_prices.drop(['model', 'cpu_model'], axis=1, inplace=True)

In [14]:
computer_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 35 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   100000 non-null  int64  
 1   device_type          100000 non-null  object 
 2   brand                100000 non-null  object 
 3   release_year         100000 non-null  int64  
 4   os                   100000 non-null  object 
 5   form_factor          100000 non-null  object 
 6   cpu_brand            100000 non-null  object 
 7   cpu_tier             100000 non-null  int64  
 8   cpu_cores            100000 non-null  int64  
 9   cpu_threads          100000 non-null  int64  
 10  cpu_base_ghz         100000 non-null  float64
 11  cpu_boost_ghz        100000 non-null  float64
 12  gpu_brand            100000 non-null  object 
 13  gpu_model            100000 non-null  object 
 14  gpu_tier             100000 non-null  int64  
 15  vram_gb           

In [15]:
for col in computer_prices.select_dtypes(include="object"):
    print(computer_prices[col].value_counts(normalize=True), "\n")

device_type
Laptop     0.59844
Desktop    0.40156
Name: proportion, dtype: float64 

brand
Lenovo      0.15992
HP          0.14114
Dell        0.14005
Apple       0.11915
ASUS        0.10159
Acer        0.09925
Samsung     0.08066
MSI         0.07891
Gigabyte    0.04900
Razer       0.03033
Name: proportion, dtype: float64 

os
Windows     0.71817
macOS       0.18207
Linux       0.06109
ChromeOS    0.03867
Name: proportion, dtype: float64 

form_factor
Mainstream     0.17819
Gaming         0.16876
ATX            0.15597
Ultrabook      0.13236
Micro-ATX      0.08672
Full-Tower     0.07110
2-in-1         0.07049
SFF            0.05585
Workstation    0.04864
Mini-ITX       0.03192
Name: proportion, dtype: float64 

cpu_brand
Intel    0.52774
AMD      0.35311
Apple    0.11915
Name: proportion, dtype: float64 

gpu_brand
NVIDIA    0.54712
Apple     0.18922
AMD       0.15767
Intel     0.10599
Name: proportion, dtype: float64 

gpu_model
Apple Integrated    0.18922
RTX 40 70           0.05743


In [16]:
for col in computer_prices.select_dtypes(include="object"): 
    ratio = computer_prices[col].value_counts(normalize=True).max() 
    
    print(f"{col}: {ratio:.2f} {"( Imbalanced )" if  ratio > 0.75 else ""}")

device_type: 0.60 
brand: 0.16 
os: 0.72 
form_factor: 0.18 
cpu_brand: 0.53 
gpu_brand: 0.55 
gpu_model: 0.19 
storage_type: 0.45 
display_type: 0.32 
wifi: 0.46 
cpu_type: 0.25 


In [17]:
for col in computer_prices.select_dtypes(include="object"):
    dist = computer_prices[col].value_counts(normalize=True)
    ent = entropy(dist)
    
    print(f"{col}: {ent:.2f} {"( Imbalanced )" if  ent < 1.0 else ""}")

device_type: 0.67 ( Imbalanced )
brand: 2.22 
os: 0.84 ( Imbalanced )
form_factor: 2.17 
cpu_brand: 0.96 ( Imbalanced )
gpu_brand: 1.17 
gpu_model: 3.27 
storage_type: 1.27 
display_type: 1.66 
wifi: 1.23 
cpu_type: 2.21 


In [18]:
numeric_cols = computer_prices.select_dtypes(include=['int64', 'float64']).columns
object_cols = computer_prices.select_dtypes(include=['object']).columns

scaler = StandardScaler()
computer_prices[numeric_cols] = scaler.fit_transform(computer_prices[numeric_cols])

computer_prices = pd.get_dummies(computer_prices, columns=object_cols)

In [19]:
computer_prices.dtypes

ID                  float64
release_year        float64
cpu_tier            float64
cpu_cores           float64
cpu_threads         float64
                     ...   
cpu_type_Ryzen 9       bool
cpu_type_i3            bool
cpu_type_i5            bool
cpu_type_i7            bool
cpu_type_i9            bool
Length: 137, dtype: object

In [20]:
X = computer_prices.drop('price', axis=1)
y = computer_prices['price']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [21]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

def linear(x):
    return x

def mse(y, Y):
    return np.mean((y - Y) ** 2)

def mse_grad(y, Y):
    return (y - Y) / len(Y)

In [23]:
class MLP:
    def __init__(self, n_input, n_hidden):
        self.w1 = np.random.randn(n_input, n_hidden)
        self.b1 = np.zeros((1, n_hidden))
        self.w2 = np.random.randn(n_hidden, 1)
        self.b2 = np.zeros((1, 1))

    def forward_pass(self, X):
        self.z1 = X.dot(self.w1) + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = self.a1.dot(self.w2) + self.b2
        return linear(self.z2)

    def train(self, X_train, y_train, X_val, y_val, epochs, learning_rate=0.5, batch_size=8):
        n_samples = X_train.shape[0]
        
        pbar = tqdm(range(epochs), desc="Training", colour="magenta", bar_format="{l_bar}{bar:25}{r_bar}")

        for e in pbar:
            indices = np.random.permutation(n_samples)
            
            X_shuffled = X_train[indices]
            y_shuffled = y_train[indices]

            for start in range(0, n_samples, batch_size):
                end = start + batch_size

                X_batch = X_shuffled[start:end]
                y_batch = y_shuffled[start:end]

                y_pred = self.forward_pass(X_batch)
                error = mse_grad(y_pred, y_batch)

                dw2 = self.a1.T.dot(error)
                db2 = np.sum(error, axis=0, keepdims=True)

                da1 = error.dot(self.w2.T)
                dz1 = da1 * sigmoid_derivative(self.z1)
                dw1 = X_batch.T.dot(dz1)
                db1 = np.sum(dz1, axis=0, keepdims=True)

                self.w2 -= learning_rate * dw2
                self.b2 -= learning_rate * db2
                self.w1 -= learning_rate * dw1
                self.b1 -= learning_rate * db1

            train_loss = mse(self.forward_pass(X_train), y_train)
            postfix = {"train_loss": train_loss}

            val_loss = mse(self.forward_pass(X_val), y_val)
            postfix["val_loss"] = val_loss

            if e % 1000 == 0:
                pbar.set_postfix(postfix)

In [24]:
X_train_np = X_train.values.astype(np.float64)
y_train_np = y_train.values.reshape(-1, 1).astype(np.float64) 
X_val_np = X_val.values.astype(np.float64)
y_val_np = y_val.values.reshape(-1, 1).astype(np.float64)

mlp = MLP(n_input=X_train_np.shape[1], n_hidden=8)
mlp.train(X_train_np, y_train_np, X_val=X_val_np, y_val=y_val_np, epochs=1000, learning_rate=0.5, batch_size=8)

Training: 100%|[35m█████████████████████████[0m| 1000/1000 [11:18<00:00,  1.47it/s, train_loss=0.191, val_loss=0.149][0m


In [25]:
X_test_np = X_test.values.astype(np.float64)
y_test_np = y_test.values.reshape(-1, 1).astype(np.float64)

y_pred_test = mlp.forward_pass(X_test_np)
test_loss = mse(y_pred_test, y_test_np)

print("MSE:", test_loss)

MSE: 0.14817668356863953
