## Laptop Price Prediction (Data Cleaning)

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from pathlib import Path

### Load Data

In [None]:
# The CSV is expected at <parent of the working directory>/data/dataset.csv.
data_path = Path.cwd().parent / 'data' / 'dataset.csv'
dataset = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded data.
dataset.head()

In [None]:
def get_null_cols(dataset):
    """Print every column that has missing values and return all null counts.

    Only columns with at least one null are printed (name followed by count).

    Returns:
        A ``(columns, null_counts)`` pair of parallel lists that covers every
        column of ``dataset``, in column order.
    """
    column_names = list(dataset.columns)
    null_counts = list(dataset.isnull().sum())
    for name, missing in zip(column_names, null_counts):
        if missing > 0:
            print(name, missing)
    return column_names, null_counts

In [None]:
# Columns to discard: a hand-picked list, plus every column whose null count
# is greater than 1100.
cls, nvs = get_null_cols(dataset)
cols = [
    'Manufacturer',
    'Series',
    'Batteries Included',
    'Batteries Required',
    'Battery cell composition',
    'Device type',
    'Package Dimensions',
    'Mounting Hardware',
    'Product Dimensions',
    'Voltage',
    'Wattage',
    'Model',
    'Model Name',
    'Model Year',
    'Power Source',
    'Optical Drive Type',
    'Are Batteries Included',
    'Number of Lithium Ion Cells',
    'Lithium Battery Energy Content',
    'Item model number',
    'Graphics Card Description',
    'Graphics RAM Type',
    'Graphics Card Interface',
    'Connectivity Type',
    'Included Components',
    'Computer Memory Type',
    'Speaker Description',
    'Wireless Type',
    'Ram Memory Installed Size',
    'RAM memory maximum size',
    'Ram Memory Technology',
    'Processor model number',
    'Hardware Interface',
    'Hardware Platform',
    'Country of Origin',
    'Rear Webcam Resolution',
    'Hard Disk Rotational Speed',
    'Memory Storage Capacity',
    'Chipset Type',
    'Graphics Coprocessor',
    'Number of items',
    'Flash Memory Installed Size',
    'Total USB ports',
    'Audio Output Type',
    'Item Height',
    'Item Width',
]
cols.extend(name for name, missing in zip(cls, nvs) if missing > 1100)

In [None]:
# Remove every column collected in `cols`.
dataset = dataset.drop(columns=cols)

In [None]:
def standardize_color(color: str):
    """Map a free-form colour description to a canonical colour name.

    Missing values (``None`` or a float NaN) are treated as 'Black'.
    Otherwise the description is matched case-insensitively against known
    keywords; if nothing matches, the last space-separated word is returned
    capitalised.

    Args:
        color: Raw colour text, or a missing-value marker.

    Returns:
        The canonical colour name as a string.
    """
    # NaN never compares equal to anything (including itself), so a plain
    # `color == np.nan` test cannot detect it; check explicitly before
    # calling string methods.
    if color is None or (isinstance(color, float) and np.isnan(color)):
        return 'Black'

    lowered = color.lower()
    if 'black' in lowered or 'carbon' in lowered:
        return 'Black'
    if 'silver' in lowered or 'moon' in lowered:
        return 'Silver'
    if 'grey' in lowered or 'gray' in lowered or 'graphite' in lowered:
        return 'Grey'
    if 'blue' in lowered:
        return 'Blue'
    return color.split(' ')[-1].capitalize()

In [None]:
# Missing colours default to 'Black'; every value is then mapped to a canonical name.
dataset['Colour'] = dataset['Colour'].fillna('Black').apply(standardize_color)

In [None]:
def process_ff(ff: str):
    """Lower-case a form-factor label and merge known spelling variants.

    Labels without a known variant are returned lower-cased and otherwise
    unchanged.
    """
    aliases = {
        'thin and light': 'thin and light',
        'thin & light': 'thin and light',
        'thin & light laptop': 'thin and light',
        'laptop, chromebook': 'chromebook',
        'gaming laptop': 'gaming',
        'gaming': 'gaming',
    }
    normalized = ff.lower()
    return aliases.get(normalized, normalized)

In [None]:
# Drop all records whose Form Factor is 'Table Stand'. The explicit copy makes
# `dataset` an independent frame, so the column assignments below (and in
# later cells) do not raise pandas' SettingWithCopyWarning about writing to a
# slice of another DataFrame.
dataset = dataset[dataset['Form Factor'] != 'Table Stand'].copy()

# Missing form factors are treated as plain laptops; spelling variants are then merged.
dataset['Form Factor'] = dataset['Form Factor'].fillna('laptop')
dataset['Form Factor'] = dataset['Form Factor'].apply(process_ff)

In [None]:
# Inspect the distinct form-factor labels.
dataset['Form Factor'].unique()

In [None]:
# NOTE(review): rename() returns a new DataFrame and the result is not
# assigned, so `dataset` itself is unchanged by this statement. Later cells
# read 'Screen Resolution' and drop 'Standing screen display size' under their
# original names, so making this rename stick would require revisiting them.
dataset.rename(columns={'Standing screen display size': "Screen Resolution"})

In [None]:
# Rows without a screen resolution default to '720p'.
dataset['Screen Resolution'] = dataset['Screen Resolution'].fillna('720p')

In [None]:
# Inspect the distinct raw resolution strings.
dataset['Screen Resolution'].unique()

In [None]:
def process_screen_res(res: str):
    if '720p' in res:
        return 1280,720
    elif '1080p' in res:
        return 1920,1080
    elif 'x' in res:
        x,y = res.split('x', maxsplit=1)
        y = y.lstrip(' ').split(' ')[0].strip('_')
        x = x.strip('_').strip(' ')
        if '_' in y:
            y = y.split('_')[0]
        return int(x), int(y)
    elif '*' in res:
        x,y = res.split('*', maxsplit=1)
        y = y.lstrip(' ').split(' ')[0].strip('_')
        x = x.strip('_').strip(' ')
        return int(x), int(y)
    return int(res), int(res)

In [None]:
# Split every resolution string into separate pixel-width / pixel-height columns.
X, Y = [], []
for resolution in dataset['Screen Resolution']:
    width, height = process_screen_res(resolution)
    X.append(width)
    Y.append(height)

dataset['Screen_Resolution_X'] = X
dataset['Screen_Resolution_Y'] = Y

# The raw display columns and 'Batteries' are dropped once the numeric columns exist.
for unused in ('Screen Resolution', 'Standing screen display size', 'Resolution', 'Batteries'):
    dataset.drop(unused, axis=1, inplace=True)

In [None]:
def process_proc_type(tp: str):
    """Collapse a processor description into its family name.

    Matching is case-insensitive and the first family keyword found wins.
    Descriptions that match no known family are returned unchanged.
    """
    families = (
        ('core i3', 'Core i3'),
        ('core i5', 'Core i5'),
        ('core i7', 'Core i7'),
        ('core i9', 'Core i9'),
        ('celeron', 'Celeron'),
        ('ryzen 3', 'Ryzen 3'),
        ('ryzen 5', 'Ryzen 5'),
        ('ryzen 7', 'Ryzen 7'),
        ('ryzen 9', 'Ryzen 9'),
        ('athlon', 'Athlon'),
        # 'other' also covers 'others', since it is a substring of it.
        ('other', 'Other'),
    )
    lowered = tp.lower()
    for keyword, family in families:
        if keyword in lowered:
            return family
    return tp

In [None]:
# Unknown processor brand/type becomes 'Other'; types are then collapsed into families.
dataset['Processor Brand'] = dataset['Processor Brand'].fillna('Other')
dataset['Processor Type'] = (
    dataset['Processor Type'].fillna('Other').apply(process_proc_type)
)

In [None]:
def process_proc_speed(sp):
    """Return a processor speed as a float.

    For strings, only the text before the first space is converted (any
    trailing unit text is ignored); other values are passed to ``float``
    directly.
    """
    number = sp.split(' ')[0] if isinstance(sp, str) else sp
    return float(number)

In [None]:
# Parse speeds to floats; missing entries are temporarily encoded as 0.0.
proc_speeds = (
    dataset['Processor Speed']
    .astype('object')
    .fillna('0.0')
    .apply(process_proc_speed)
)
# Impute the placeholders with the mean of the *known* speeds only. Averaging
# over the whole column would count every 0.0 placeholder and drag the
# imputed value down.
mean_known_speed = round(proc_speeds[proc_speeds != 0.0].mean(), 2)
dataset['Processor Speed'] = proc_speeds.replace(0.0, mean_known_speed)

In [None]:
# Rows without a processor count are filled with 1.
dataset['Processor Count'] = dataset['Processor Count'].fillna(1)

In [None]:
# Missing RAM sizes default to '8'; the leading number of each entry is kept as an int.
dataset['RAM Size'] = (
    dataset['RAM Size']
    .fillna('8')
    .astype('object')
    .apply(lambda size: int(size.split(' ')[0]))
)

In [None]:
# Missing memory technology defaults to 'DDR4'; any 'lpddr 5' spelling becomes 'LPDDR5'.
dataset['Memory Technology'] = dataset['Memory Technology'].fillna('DDR4').apply(
    lambda tech: 'LPDDR5' if 'lpddr 5' in tech.lower() else tech
)

In [None]:
# 'Maximum Memory Supported' is not used as a feature.
dataset = dataset.drop(columns='Maximum Memory Supported')

In [None]:
def process_mem_speed(spd):
    """Convert a memory clock speed to an integer.

    Non-string values are converted with ``int`` directly. For strings only
    the first space-separated token is parsed:

    * labelled 'GHz' with a four-digit whole number (e.g. '2666 GHz'): the
      number is returned as-is -- presumably a MHz figure with the wrong unit;
    * labelled 'GHz' otherwise (e.g. '3.2 GHz', '2.67 GHz'): multiplied by
      1000 and rounded;
    * any other string (e.g. '2400 MHz'): the number truncated to an int.

    Raises:
        ValueError: if the leading token is not numeric.
    """
    if not isinstance(spd, str):
        return int(spd)

    token = spd.split(' ')[0]
    if 'ghz' not in spd.lower():
        return int(float(token))

    # Only a whole number is taken literally; a four-character decimal such
    # as '2.67' is a real GHz value and must be scaled, not passed to int().
    if len(token) == 4 and token.isdigit():
        return int(token)
    # round() guards against float error such as 2669.9999999999995.
    return int(round(float(token) * 1000))

In [None]:
dataset['Memory Clock Speed'].unique()  # evaluated for inspection; the value is not kept
# Missing clock speeds default to '2666'; every entry is then converted to an int.
dataset['Memory Clock Speed'] = (
    dataset['Memory Clock Speed'].fillna('2666').apply(process_mem_speed)
)

In [None]:
# Inspect the distinct clock speeds after conversion.
dataset['Memory Clock Speed'].unique()

In [None]:
def process_hdd_size(size):
    """Convert a drive size to an integer, scaling terabytes by 1000.

    Non-string values are converted with ``int`` directly. For strings the
    first space-separated token is parsed as a number; if the text mentions
    'tb' (case-insensitive) the number is multiplied by 1000.

    Raises:
        ValueError: if the leading token is not numeric.
    """
    if not isinstance(size, str):
        return int(size)

    # Parse via float so fractional sizes such as '1.5 TB' do not crash int().
    amount = float(size.split(' ')[0])
    if 'tb' in size.lower():
        return int(round(amount * 1000))
    return int(amount)

In [None]:
# Missing drive sizes default to '256 GB'; all sizes are converted to ints.
dataset['Hard Drive Size'] = (
    dataset['Hard Drive Size'].fillna('256 GB').apply(process_hdd_size)
)
dataset['Hard Drive Size'].unique()

In [None]:
# Inspect the distinct hard-disk descriptions.
dataset['Hard Disk Description'].unique()

In [None]:
# Derive storage-type flags from the description (missing means 'HDD');
# an 'sshd' entry sets both flags. The text column is dropped afterwards.
disk_description = dataset['Hard Disk Description'].fillna('HDD')
for flag, tags in (('is_SSD', ('ssd', 'sshd')), ('is_HDD', ('hdd', 'sshd'))):
    dataset[flag] = disk_description.apply(
        lambda text, tags=tags: 1 if any(tag in text.lower() for tag in tags) else 0
    )
dataset = dataset.drop(columns='Hard Disk Description')

In [None]:
# 'Hard Drive Interface' is not used as a feature.
dataset = dataset.drop(columns='Hard Drive Interface')

In [None]:
# Missing audio details default to 'Headphones, Speakers'; only a headphone flag is kept.
audio_details = dataset['Audio Details'].fillna('Headphones, Speakers')
dataset['HeadphoneJack'] = audio_details.apply(
    lambda details: 1 if 'headphone' in details.lower() else 0
)
dataset = dataset.drop(columns='Audio Details')

In [None]:
# One 0/1 column per connector keyword found (case-insensitively) in
# 'Connector Type'; missing entries default to 'Wi-Fi, USB, Bluetooth'.
connectors = dataset['Connector Type'].fillna('Wi-Fi, USB, Bluetooth')
connector_keywords = {
    'Wifi': 'wi-fi',
    'Bluetooth': 'bluetooth',
    'HDMI': 'hdmi',
    'USB-C': 'usb-c',
    'Ethernet': 'ethernet',
    'Thunderbolt': 'thunderbolt',
}
for flag, keyword in connector_keywords.items():
    dataset[flag] = connectors.apply(
        lambda text, keyword=keyword: 1 if keyword in text.lower() else 0
    )
dataset = dataset.drop(columns='Connector Type')

In [None]:
dataset['Graphics Chipset Brand'].unique()  # evaluated for inspection; the value is not kept
# Missing brands count as 'Integrated'. 'iris' sets both flags.
gpu_brand = dataset['Graphics Chipset Brand'].fillna('Integrated')
gpu_keywords = {
    'DedicatedGraphics': ('nvidia', 'iris'),
    'IntegratedGraphics': ('intel', 'amd', 'iris', 'integrated'),
}
for flag, keywords in gpu_keywords.items():
    dataset[flag] = gpu_brand.apply(
        lambda brand, keywords=keywords: 1 if any(k in brand.lower() for k in keywords) else 0
    )
dataset.drop(columns='Graphics Chipset Brand', inplace=True)

In [None]:
# A missing port count is taken to mean the laptop has none of that port.
for port_column in ('Number of USB 2.0 Ports', 'Number of USB 3.0 Ports', 'Number of HDMI Ports'):
    dataset[port_column] = dataset[port_column].fillna(0)

In [None]:
# Missing OS defaults to 'Windows'. Each entry is mapped to the first matching
# family (checked in this order); anything else becomes 'Other'.
os_families = (
    ('windows', 'Windows'),
    ('macos', 'MacOS'),
    ('chrome', 'ChromeOS'),
    ('linux', 'Linux'),
)
dataset['Operating System'] = dataset['Operating System'].fillna('Windows').apply(
    lambda name: next(
        (family for keyword, family in os_families if keyword in name.lower()),
        'Other',
    )
)
dataset['Operating System'].unique()

In [None]:
def process_graphics_ram(gram):
    """Convert a graphics-memory value to a float, scaling gigabytes by 1024.

    Non-string values are multiplied by 1024. For strings only the first
    space-separated token is parsed:

    * labelled 'GB' with a three-digit whole number (e.g. '512 GB'): returned
      unscaled -- presumably a MB figure with the wrong unit;
    * labelled 'GB' otherwise (e.g. '4 GB', '2.0 GB'): multiplied by 1024;
    * any other string (e.g. '512 MB'): the number as-is.

    Raises:
        ValueError: if the leading token is not numeric.
    """
    if not isinstance(gram, str):
        return float(gram) * 1024

    token = gram.split(' ')[0]
    if 'gb' not in gram.lower():
        return float(token)

    # Only whole numbers get the unscaled shortcut; three-character decimals
    # such as '2.0' or '1.5' are real GB values and must be scaled.
    if len(token) == 3 and token.isdigit():
        return float(token)
    return float(token) * 1024

In [None]:
dataset['Graphics Card Ram Size'].unique()  # evaluated for inspection; the value is not kept
# Where graphics RAM is missing, the row's 'RAM Size' value is used instead
# before everything is converted to floats.
dataset['Graphics Card Ram Size'] = (
    dataset['Graphics Card Ram Size']
    .fillna(dataset['RAM Size'])
    .apply(process_graphics_ram)
)

In [None]:
# Shorten the column name in place.
dataset.rename(columns={'Graphics Card Ram Size': 'GraphicsCardRAM'}, inplace=True)

In [None]:
# Inspect the distinct display types.
dataset['Display Type'].unique()

In [None]:
# Rows without a display type default to 'LCD'.
dataset['Display Type'] = dataset['Display Type'].fillna('LCD')

In [None]:
dataset['Average Battery Life (in hours)'].unique()  # evaluated for inspection; the value is not kept
# Parse the leading number of each entry; missing entries are temporarily
# encoded as 0.0.
battery_hours = (
    dataset['Average Battery Life (in hours)']
    .fillna('0')
    .apply(lambda x: float(x.split(' ')[0]))
)
# Impute the placeholders with the mean of the *known* values only. Averaging
# over the whole column would count every 0.0 placeholder and drag the
# imputed value down.
mean_known_battery = round(battery_hours[battery_hours != 0.0].mean(), 2)
dataset['Average Battery Life (in hours)'] = battery_hours.replace(0.0, mean_known_battery)
dataset.rename(columns={'Average Battery Life (in hours)': 'BatteryLife'}, inplace=True)

In [None]:
# Inspect the distinct bundled-software descriptions.
dataset['Software included'].unique()

In [None]:
# One 0/1 column per feature, set when any of its keywords appears
# (case-insensitively) in 'Special Features'; missing text sets no flag.
special_features = dataset['Special Features'].fillna('')
feature_keywords = {
    'Touchscreen': ('touchscreen',),
    'Fingerprint': ('fingerprint',),
    'Webcam': ('webcam', 'camera'),
    'SDCard': ('memory card',),
}
for flag, keywords in feature_keywords.items():
    dataset[flag] = special_features.apply(
        lambda text, keywords=keywords: 1 if any(k in text.lower() for k in keywords) else 0
    )
dataset = dataset.drop(columns='Special Features')

In [None]:
# One 0/1 column per bundled-software category, set when any of its keywords
# appears (case-insensitively) in 'Software included'; missing text sets no flag.
bundled_software = dataset['Software included'].fillna('')
software_keywords = {
    'MSOffice': ('office',),
    'Antivirus': ('antivirus', 'security', 'mcafee'),
    'XboxGamePass': ('xbox',),
}
for flag, keywords in software_keywords.items():
    dataset[flag] = bundled_software.apply(
        lambda text, keywords=keywords: 1 if any(k in text.lower() for k in keywords) else 0
    )
dataset = dataset.drop(columns='Software included')

In [None]:
# Missing keyboard descriptions default to 'Standard' (neither flag is set).
keyboard_description = dataset['Keyboard Description'].fillna('Standard')
for flag, keyword in (('BacklitKeyboard', 'backlit'), ('RGBKeyboard', 'rgb')):
    dataset[flag] = keyboard_description.apply(
        lambda text, keyword=keyword: 1 if keyword in text.lower() else 0
    )
dataset = dataset.drop(columns='Keyboard Description')

In [None]:
# Missing interface descriptions default to 'Keyboard, Microphone'.
dataset['Device interface - primary'] = dataset['Device interface - primary'].fillna('Keyboard, Microphone')

touch_from_interface = dataset['Device interface - primary'].apply(lambda x: 1 if 'touchscreen' in x.lower() else 0)
if 'Touchscreen' in dataset.columns:
    # A 'Touchscreen' flag was already derived from 'Special Features'.
    # Combine the two sources with OR instead of overwriting, so a laptop
    # flagged there is not reset to 0 here.
    dataset['Touchscreen'] = dataset['Touchscreen'] | touch_from_interface
else:
    dataset['Touchscreen'] = touch_from_interface

dataset['Stylus'] = dataset['Device interface - primary'].apply(lambda x: 1 if 'stylus' in x.lower() else 0)
dataset['Microphone'] = dataset['Device interface - primary'].apply(lambda x: 1 if 'microphone' in x.lower() else 0)
dataset['Numpad'] = dataset['Device interface - primary'].apply(lambda x: 1 if 'numeric keypad' in x.lower() else 0)
dataset = dataset.drop('Device interface - primary', axis=1)

In [None]:
def process_price(p: str):
    """Return a price as a float, removing commas from string inputs first.

    Non-string values are passed to ``float`` unchanged.
    """
    cleaned = p.replace(',', '') if isinstance(p, str) else p
    return float(cleaned)

In [None]:
# Convert every price to a float (process_price removes commas from strings).
dataset['Price'] = dataset['Price'].apply(process_price)

### Final Cleaned Dataset

In [None]:
# Preview the first rows of the cleaned data.
dataset.head()