In [1]:
import os
from pathlib import Path
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from  xgboost import XGBClassifier

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix


# 1. Loading the Dataset

In [2]:
os.getcwd() # getiing the current working directory

'e:\\Laptop_Price_Predictor\\Laptop_Price_Predictor\\Experiment'

In [3]:
os.chdir("../") #changing the directory to one folder back 

In [4]:
BASE_PATH = os.getcwd() #now checking the changed current directory

In [5]:
DATA_PATH =os.path.join(BASE_PATH  , "Data" , "Raw" , "laptop_data.csv")

In [6]:
os.getcwd()

'e:\\Laptop_Price_Predictor\\Laptop_Price_Predictor'

In [16]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [17]:
df.drop(columns=["Unnamed: 0"] , axis=1 , inplace = True)


# 2. Basic Inspection

In [20]:
# Basic Inspection
print("--- Initial Info ---")
print(df.info())

--- Initial Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1303 non-null   object 
 1   TypeName          1303 non-null   object 
 2   Inches            1303 non-null   float64
 3   ScreenResolution  1303 non-null   object 
 4   Cpu               1303 non-null   object 
 5   Ram               1303 non-null   object 
 6   Memory            1303 non-null   object 
 7   Gpu               1303 non-null   object 
 8   OpSys             1303 non-null   object 
 9   Weight            1303 non-null   object 
 10  Price             1303 non-null   float64
dtypes: float64(2), object(9)
memory usage: 112.1+ KB
None


In [22]:
df.isnull().sum()

Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

### 2.1 Clean the Columns

#### Ram and Weights 

In [30]:
if df["Ram"].dtype == "object":
    df["Ram"] = df["Ram"].str.replace("GB" , " ").astype("int32")
if df["Weight"].dtype == "object":
    df["Weight"] = df["Weight"].str.replace("kg" , " ").astype("float32")
    
print(df["Ram"].dtype , df["Weight"].dtype)

int32 float32


#### ScreenResolution 

In [62]:
#creating new features like TouchScreen and IPS from the ScreenResolution Column

df["TouchScreen"] = df["ScreenResolution"].apply(lambda x: 1  if "TouchScreen" in x else 0)
df["IPS"] = df["ScreenResolution"].apply(lambda x : 1 if "IPS" in x else 0)

# Extract The ScreenResolution 
new = df["ScreenResolution"].str.split("x" )


In [48]:
# Extract The ScreenResolution 
new = df["ScreenResolution"].str.split("x" , expand = True , n=1)

In [60]:
# Extract resolution
new = df["ScreenResolution"].str.split("x" , n=1 , expand = True)
df["X_res"] = new[0].str.extract('(\d+)').astype("int32")
df["Y_res"] = new[1].astype("int32")

  df["X_res"] = new[0].str.extract('(\d+)').astype("int32")


#### CPU

In [64]:
# CPU Processor 
def fetch_processor(text):
    words = text.split()
    if words[0:3] == ['Intel', 'Core', 'i7'] or words[0:3] == ['Intel', 'Core', 'i5'] or words[0:3] == ['Intel', 'Core', 'i3']:
        return " ".join(words[0:3])
    else:
        if words[0] == 'Intel':
            return 'Other Intel Processor'
        else:
            return 'AMD Processor'
        
df['Cpu Brand'] = df['Cpu'].apply(fetch_processor)

#### Memory

In [65]:
df['Memory'] = df['Memory'].astype(str).replace('\.0', '', regex=True)
df["Memory"] = df["Memory"].str.replace('GB', '')
df["Memory"] = df["Memory"].str.replace('TB', '000')
new = df["Memory"].str.split("+", n=1, expand=True)

df["first"] = new[0].str.strip()
df["second"] = new[1].fillna("0")

df["Layer1HDD"] = df["first"].apply(lambda x: 1 if "HDD" in x else 0)
df["Layer1SSD"] = df["first"].apply(lambda x: 1 if "SSD" in x else 0)
df["Layer1Hybrid"] = df["first"].apply(lambda x: 1 if "Hybrid" in x else 0)
df["Layer1Flash_Storage"] = df["first"].apply(lambda x: 1 if "Flash Storage" in x else 0)

df['first'] = df['first'].str.extract('(\d+)').astype(int)

df["Layer2HDD"] = df["second"].apply(lambda x: 1 if "HDD" in x else 0)
df["Layer2SSD"] = df["second"].apply(lambda x: 1 if "SSD" in x else 0)
df["Layer2Hybrid"] = df["second"].apply(lambda x: 1 if "Hybrid" in x else 0)
df["Layer2Flash_Storage"] = df["second"].apply(lambda x: 1 if "Flash Storage" in x else 0)

df['second'] = df['second'].str.extract('(\d+)').astype(int)

df["HDD"]=(df["first"]*df["Layer1HDD"]+df["second"]*df["Layer2HDD"])
df["SSD"]=(df["first"]*df["Layer1SSD"]+df["second"]*df["Layer2SSD"])
df["Hybrid"]=(df["first"]*df["Layer1Hybrid"]+df["second"]*df["Layer2Hybrid"])
df["Flash_Storage"]=(df["first"]*df["Layer1Flash_Storage"]+df["second"]*df["Layer2Flash_Storage"])

  df['Memory'] = df['Memory'].astype(str).replace('\.0', '', regex=True)
  df['first'] = df['first'].str.extract('(\d+)').astype(int)
  df['second'] = df['second'].str.extract('(\d+)').astype(int)


In [66]:
df.drop(columns=['first', 'second', 'Layer1HDD', 'Layer1SSD', 'Layer1Hybrid',
       'Layer1Flash_Storage', 'Layer2HDD', 'Layer2SSD', 'Layer2Hybrid',
       'Layer2Flash_Storage'],inplace=True)

#### GPU

In [67]:
df['Gpu Brand'] = df['Gpu'].apply(lambda x: x.split()[0])
df = df[df['Gpu Brand'] != 'ARM'] # Remove rare ARM Gpu

#### OS

In [68]:
def cat_os(inp):
    if inp == 'Windows 10' or inp == 'Windows 7' or inp == 'Windows 10 S':
        return 'Windows'
    elif inp == 'macOS' or inp == 'Mac OS X':
        return 'Mac'
    else:
        return 'Others/No OS/Linux'

df['os'] = df['OpSys'].apply(cat_os)

#### Drop Redundant Columns

In [69]:
df_cleaned = df.drop(columns=['ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'OpSys'])

In [None]:
df_cleaned.columns

Index(['Company', 'TypeName', 'Inches', 'Ram', 'Weight', 'Price', 'X_res',
       'Y_res', 'TouchScreen', 'IPS', 'Cpu Brand', 'HDD', 'SSD', 'Hybrid',
       'Flash_Storage', 'Gpu Brand', 'os'],
      dtype='object')

In [74]:
df_cleaned.sample(5)

Unnamed: 0,Company,TypeName,Inches,Ram,Weight,Price,X_res,Y_res,TouchScreen,IPS,Cpu Brand,HDD,SSD,Hybrid,Flash_Storage,Gpu Brand,os
1226,Dell,Ultrabook,13.3,8,1.29,78694.56,3200,1800,0,0,Intel Core i7,0,256,0,0,Intel,Windows
601,HP,Notebook,15.6,4,2.1,35616.6144,1366,768,0,0,Intel Core i3,500,0,0,0,Intel,Windows
1126,Lenovo,2 in 1 Convertible,10.1,4,0.69,25521.12,1920,1200,0,1,Other Intel Processor,0,0,0,64,Intel,Windows
1067,Dell,2 in 1 Convertible,13.3,8,1.62,42517.9728,1920,1080,0,0,Intel Core i7,0,256,0,0,Intel,Windows
823,MSI,Gaming,15.6,8,2.4,63882.72,1920,1080,0,0,Intel Core i5,0,256,0,0,Nvidia,Windows


In [78]:
# Create Version 1 (Preprocessed without PPI engineering)
# Note: We still have X_res, Y_res, Inches which are the raw features.
v1_data = df_cleaned.copy()

# 3. Feature Engineering

In [80]:
# Feature Engineering for Version 2 (PPI)
df_cleaned['ppi'] = (((df_cleaned['X_res']**2) + (df_cleaned['Y_res']**2))**0.5 / df_cleaned['Inches']).astype('float')
v2_data = df_cleaned.drop(columns=['Inches', 'X_res', 'Y_res'])

# Final check
print("########### V1  and V2 Shape ###############")
print("V1 Shape:", v1_data.shape)
print("V2 Shape:", v2_data.shape)


########### V1  and V2 Shape ###############
V1 Shape: (1302, 18)
V2 Shape: (1302, 15)


# 4. Saving the Cleaned Dataset

In [None]:

v1_data.to_csv('Data/Processed/v1_preprocessed.csv', index=False) #18 columns without feature engineering
v2_data.to_csv('Data/Processed/v2_engineered.csv', index=False) #15 columns with engineered features