# Preprocessing

In [None]:
import pandas as pd

# Location of the raw laptop price dataset (the `/content` directory, as used by Colab).
path = "/content/laptop_prices.csv"

# Read the whole CSV into memory as the working DataFrame for every later cell.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Quick orientation: print the (rows, columns) tuple, then the first five rows.
print(df.shape, df.head(), sep="\n")

(1275, 23)
  Company      Product   TypeName  Inches  Ram     OS  Weight  Price_euros  \
0   Apple  MacBook Pro  Ultrabook    13.3    8  macOS    1.37      1339.69   
1   Apple  Macbook Air  Ultrabook    13.3    8  macOS    1.34       898.94   
2      HP       250 G6   Notebook    15.6    8  No OS    1.86       575.00   
3   Apple  MacBook Pro  Ultrabook    15.4   16  macOS    1.83      2537.45   
4   Apple  MacBook Pro  Ultrabook    13.3    8  macOS    1.37      1803.60   

     Screen  ScreenW  ...  RetinaDisplay CPU_company CPU_freq      CPU_model  \
0  Standard     2560  ...            Yes       Intel      2.3        Core i5   
1  Standard     1440  ...             No       Intel      1.8        Core i5   
2   Full HD     1920  ...             No       Intel      2.5  Core i5 7200U   
3  Standard     2880  ...            Yes       Intel      2.7        Core i7   
4  Standard     2560  ...            Yes       Intel      3.1        Core i5   

  PrimaryStorage  SecondaryStorage Prim

### Missing Value Check

In [None]:
# Count missing entries per column (`isna` is the canonical name for `isnull`).
print(df.isna().sum())

Company                 0
Product                 0
TypeName                0
Inches                  0
Ram                     0
OS                      0
Weight                  0
Price_euros             0
Screen                  0
ScreenW                 0
ScreenH                 0
Touchscreen             0
IPSpanel                0
RetinaDisplay           0
CPU_company             0
CPU_freq                0
CPU_model               0
PrimaryStorage          0
SecondaryStorage        0
PrimaryStorageType      0
SecondaryStorageType    0
GPU_company             0
GPU_model               0
dtype: int64


### One-Hot Encoding

In [None]:
# Gather every text-valued (object dtype) column; these are the categorical features.
object_cols = df.select_dtypes(include=['object']).columns

# Expand each categorical column into indicator columns. `drop_first=True`
# omits the first level of every column so the indicators are not redundant.
df_encoded = pd.get_dummies(data=df, columns=object_cols, drop_first=True)

# Preview the encoded frame.
display(df_encoded.head())

Unnamed: 0,Inches,Ram,Weight,Price_euros,ScreenW,ScreenH,CPU_freq,PrimaryStorage,SecondaryStorage,Company_Apple,...,GPU_model_Radeon R7 M440,GPU_model_Radeon R7 M445,GPU_model_Radeon R7 M460,GPU_model_Radeon R7 M465,GPU_model_Radeon R9 M385,GPU_model_Radeon RX 540,GPU_model_Radeon RX 550,GPU_model_Radeon RX 560,GPU_model_Radeon RX 580,GPU_model_UHD Graphics 620
0,13.3,8,1.37,1339.69,2560,1600,2.3,128,0,True,...,False,False,False,False,False,False,False,False,False,False
1,13.3,8,1.34,898.94,1440,900,1.8,128,0,True,...,False,False,False,False,False,False,False,False,False,False
2,15.6,8,1.86,575.0,1920,1080,2.5,256,0,False,...,False,False,False,False,False,False,False,False,False,False
3,15.4,16,1.83,2537.45,2880,1800,2.7,512,0,True,...,False,False,False,False,False,False,False,False,False,False
4,13.3,8,1.37,1803.6,2560,1600,3.1,256,0,True,...,False,False,False,False,False,False,False,False,False,False


### Train-Test Split

In [None]:
# Split the encoded frame into the feature matrix and the prediction target.
X = df_encoded.drop(columns=['Price_euros'])  # every column except the price
y = df_encoded['Price_euros']                 # the price column is the target

print(
    f"Shape of X (features): {X.shape}",
    f"Shape of y (target): {y.shape}",
    sep="\n",
)

Shape of X (features): (1275, 874)
Shape of y (target): (1275,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each of the four resulting pieces.
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

Shape of X_train: (1020, 874)
Shape of X_test: (255, 874)
Shape of y_train: (1020,)
Shape of y_test: (255,)


### Feature Scaling

In [None]:
# Names of the numeric (int64/float64) columns of the raw frame, with the
# target `Price_euros` removed first so it is never treated as a feature.
numerical_cols = (
    df.drop(columns=['Price_euros'])
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
print(numerical_cols)

Index(['Inches', 'Ram', 'Weight', 'ScreenW', 'ScreenH', 'CPU_freq',
       'PrimaryStorage', 'SecondaryStorage'],
      dtype='object')


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to scale: the numeric feature columns collected in `numerical_cols`.
# That index was built with 'Price_euros' already dropped, so the only guard
# needed here is that each column is actually present in the feature matrix X.
numeric_cols_for_scaling = [col for col in numerical_cols if col in X.columns]

# MinMaxScaler (default settings) maps each column to [0, 1] using the
# minimum and maximum observed during fitting.
scaler = MinMaxScaler()

# Work on copies so the unscaled X_train / X_test remain available and the
# one-hot columns keep their original position and values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit on the training rows only, so no test-set information leaks into the
# scaling parameters, then overwrite the numeric columns with the scaled values.
X_train_scaled[numeric_cols_for_scaling] = scaler.fit_transform(X_train[numeric_cols_for_scaling])

# Reuse the training-set parameters for the test rows; a test value beyond the
# training min/max will land outside [0, 1], which is expected.
X_test_scaled[numeric_cols_for_scaling] = scaler.transform(X_test[numeric_cols_for_scaling])

print("First 5 rows of X_train_scaled (numeric columns only scaled):")
display(X_train_scaled.head())

print("\nFirst 5 rows of X_test_scaled (numeric columns only scaled):")
display(X_test_scaled.head())

First 5 rows of X_train_scaled (numeric columns only scaled):


Unnamed: 0,Inches,Ram,Weight,ScreenW,ScreenH,CPU_freq,PrimaryStorage,SecondaryStorage,Company_Apple,Company_Asus,...,GPU_model_Radeon R7 M440,GPU_model_Radeon R7 M445,GPU_model_Radeon R7 M460,GPU_model_Radeon R7 M465,GPU_model_Radeon R9 M385,GPU_model_Radeon RX 540,GPU_model_Radeon RX 550,GPU_model_Radeon RX 560,GPU_model_Radeon RX 580,GPU_model_UHD Graphics 620
413,0.385542,0.096774,0.226933,0.223929,0.224138,0.592593,0.121569,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
778,0.46988,0.225806,0.314214,0.223929,0.224138,0.703704,0.247059,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
1107,0.662651,0.032258,0.351621,0.223929,0.224138,0.518519,0.121569,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
96,0.662651,0.096774,0.376559,0.223929,0.224138,0.666667,0.498039,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
309,0.662651,0.032258,0.291771,0.223929,0.224138,0.407407,0.498039,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False



First 5 rows of X_test_scaled (numeric columns only scaled):


Unnamed: 0,Inches,Ram,Weight,ScreenW,ScreenH,CPU_freq,PrimaryStorage,SecondaryStorage,Company_Apple,Company_Asus,...,GPU_model_Radeon R7 M440,GPU_model_Radeon R7 M445,GPU_model_Radeon R7 M460,GPU_model_Radeon R7 M465,GPU_model_Radeon R9 M385,GPU_model_Radeon RX 540,GPU_model_Radeon RX 550,GPU_model_Radeon RX 560,GPU_model_Radeon RX 580,GPU_model_UHD Graphics 620
1179,0.662651,0.032258,0.34414,0.0,0.0,0.518519,0.241176,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
342,0.662651,0.096774,0.351621,0.223929,0.224138,0.555556,0.498039,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
649,0.289157,0.096774,0.167082,0.223929,0.224138,0.666667,0.121569,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
772,0.46988,0.032258,0.264339,0.0,0.0,0.518519,0.058824,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
803,0.662651,0.096774,0.341646,1.0,1.0,0.592593,0.121569,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False


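You can verify that only the numerical columns (`Inches`, `Ram`, `Weight`, `ScreenW`, `ScreenH`, `CPU_freq`, `PrimaryStorage`, `SecondaryStorage`) have been scaled to values between 0 and 1, while the one-hot encoded columns (like `Company_Apple`, `Company_Asus`, etc.) retain their boolean `True`/`False` values. Because the scaler is fitted on the training set only, a test-set value can fall outside this range if it lies beyond the training minimum or maximum.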

In [None]:
# Summary statistics for the scaled numeric columns. The training split spans
# exactly 0..1 per column; the test split is scaled with the training
# parameters, so its min/max need not reach those bounds.
print("Descriptive Statistics of numerical columns in X_train_scaled:")
display(X_train_scaled.loc[:, numeric_cols_for_scaling].describe())

print("\nDescriptive Statistics of numerical columns in X_test_scaled:")
display(X_test_scaled.loc[:, numeric_cols_for_scaling].describe())

Descriptive Statistics of numerical columns in X_train_scaled:


Unnamed: 0,Inches,Ram,Weight,ScreenW,ScreenH,CPU_freq,PrimaryStorage,SecondaryStorage
count,1020.0,1020.0,1020.0,1020.0,1020.0,1020.0,1020.0,1020.0
mean,0.59094,0.101771,0.333894,0.215987,0.219752,0.516921,0.215882,0.083444
std,0.171975,0.081994,0.164334,0.205871,0.210205,0.187514,0.178642,0.197199
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.46988,0.032258,0.201372,0.094584,0.094828,0.407407,0.121569,0.0
50%,0.662651,0.096774,0.336658,0.223929,0.224138,0.592593,0.121569,0.0
75%,0.662651,0.096774,0.401496,0.223929,0.224138,0.666667,0.247059,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0



Descriptive Statistics of numerical columns in X_test_scaled:


Unnamed: 0,Inches,Ram,Weight,ScreenW,ScreenH,CPU_freq,PrimaryStorage,SecondaryStorage
count,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0
mean,0.601843,0.112334,0.34837,0.215363,0.219788,0.530428,0.206367,0.096078
std,0.173286,0.082755,0.176568,0.171521,0.177047,0.182911,0.181497,0.225326
min,0.144578,0.0,0.029925,0.0,0.0,0.074074,0.011765,0.0
25%,0.46988,0.096774,0.23192,0.223929,0.224138,0.407407,0.121569,0.0
50%,0.662651,0.096774,0.341646,0.223929,0.224138,0.592593,0.121569,0.0
75%,0.662651,0.096774,0.426434,0.223929,0.224138,0.666667,0.247059,0.0
max,0.86747,0.483871,0.930175,1.0,1.0,1.0,1.0,1.0


### Save to CSV files

In [None]:
# Write the four splits to CSV in the current working directory. The row index
# is omitted, so features and targets pair up by row position when reloaded.
X_train_scaled.to_csv(path_or_buf='X_train_scaled.csv', index=False)
X_test_scaled.to_csv(path_or_buf='X_test_scaled.csv', index=False)
y_train.to_csv(path_or_buf='y_train.csv', index=False)
y_test.to_csv(path_or_buf='y_test.csv', index=False)

# Confirm which files were produced.
print(
    "Files saved to Colab environment:",
    "- X_train_scaled.csv",
    "- X_test_scaled.csv",
    "- y_train.csv",
    "- y_test.csv",
    sep="\n",
)

print("\nYou can find these files in the file browser (folder icon on the left sidebar) and download them if needed, or mount Google Drive to save them there directly.")

Files saved to Colab environment:
- X_train_scaled.csv
- X_test_scaled.csv
- y_train.csv
- y_test.csv

You can find these files in the file browser (folder icon on the left sidebar) and download them if needed, or mount Google Drive to save them there directly.
