In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/laptop-price-explorer-the-ml-model/laptops.csv


In [2]:
# Load the laptop listings, then normalise every header to lowercase so the
# rest of the notebook can reference columns without caring about capitalisation.
df = pd.read_csv('/kaggle/input/laptop-price-explorer-the-ml-model/laptops.csv')
df.columns = list(map(str.lower, df.columns))
df.head()

Unnamed: 0,companyname,typeoflaptop,inches,screenresolution,cpu,ram,memory,gpu,opsys,weight,price
0,MSI,Business Laptop,17.04068,IPS Panel Retina Display 2560x1600,Intel Core i7,12GB,512GB SSD,Intel Iris Xe Graphics,Linux,2.064834,35844.099371
1,Chuwi,2 in 1 Convertible,16.542395,Full HD,Intel Core i5,12GB,128GB PCIe SSD,Intel Iris Xe Graphics,No OS,4.060656,37019.059051
2,hp,WorkStation,17.295294,Full HD,Intel Xeon E3-1505M,8GB,1TB HDD,Intel Iris Xe Graphics,Linux,2.901689,33329.360341
3,MSI,2 in 1 Convertible,11.526203,2K,Intel Core i7,16GB,512GB NVMe SSD,Intel Iris Xe Graphics,Windows 10,2.914843,68631.102486
4,Microsoft,Gaming,12.649634,Full HD,Intel Core i5,8GB,512GB SSD,AMD Radeon RX 5600M,Windows 10,4.341995,33842.479566


In [3]:
# Summarise column dtypes and non-null counts to spot text columns that still need parsing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   companyname       1000 non-null   object 
 1   typeoflaptop      1000 non-null   object 
 2   inches            1000 non-null   float64
 3   screenresolution  1000 non-null   object 
 4   cpu               1000 non-null   object 
 5   ram               1000 non-null   object 
 6   memory            1000 non-null   object 
 7   gpu               1000 non-null   object 
 8   opsys             1000 non-null   object 
 9   weight            1000 non-null   float64
 10  price             1000 non-null   float64
dtypes: float64(3), object(8)
memory usage: 86.1+ KB


# Initial data notes
## Column changes
- ~Change ram column name to include GB and make numerical~
- ~Break apart memory into:~
    - ~numerical size (some are GB, some TB)~
    - ~SSD/HDD~
    - ~(optional) additional info - PCIe, NVMe, etc~
- ~GPU/CPU columns~
    - ~Extract make, model into separate columns~
- ~Screen resolution~
    - ~try to convert to pixel height/width columns or combined resolution if few variations~
- Find out how many unique CompanyName/TypeOfLaptop values there are

In [4]:
# inspect the distinct ram labels before converting the column to a number
df['ram'].drop_duplicates()

0     12GB
2      8GB
3     16GB
11     4GB
Name: ram, dtype: object

In [5]:
# Pull the digits out of labels such as "12GB" and store them as an integer
# column. The pattern is a raw string so the `\d` escape reaches the regex
# engine verbatim (a plain '...' literal triggers an invalid-escape warning),
# and expand=False makes extract return a Series rather than a one-column frame.
df['ram_gb'] = df['ram'].str.extract(r'(\d+)', expand=False).astype(int)
# The text column is redundant once the numeric one exists.
df = df.drop(columns='ram')
# Show the distinct parsed values as a quick sanity check.
df['ram_gb'].drop_duplicates()

0     12
2      8
3     16
11     4
Name: ram_gb, dtype: int64

In [6]:
# Preview the memory labels before splitting them up.
# Plan for the numeric size: first extract the leading number, then convert
# terabyte-sized entries (e.g. "1TB") to GB.
df['memory'].head()

0         512GB SSD
1    128GB PCIe SSD
2           1TB HDD
3    512GB NVMe SSD
4         512GB SSD
Name: memory, dtype: object

In [7]:
# Read the leading number of each label together with an optional "TB" unit
# directly after it, e.g. "512GB SSD" -> ("512", NaN) and "1TB HDD" -> ("1", "TB").
# Tying the unit to that first number means the conversion is decided by the
# unit, not by the value happening to equal 1, so "2TB" becomes 2000 and a
# hypothetical "1GB" stays 1.
storage_parts = df['memory'].str.extract(r'(?i)(\d+)\s*(tb)?')
storage_size = storage_parts[0].astype(int)
is_terabytes = storage_parts[1].notna()

# Normalise everything to GB using decimal terabytes (1 TB = 1000 GB).
df['storage_gb'] = storage_size.where(~is_terabytes, storage_size * 1000)

In [8]:
# take the trailing word of the memory label (e.g. "SSD", "HDD") as storage_type;
# strip first so trailing whitespace cannot defeat the end-of-string anchor,
# and use a raw string so `\w` is passed to the regex engine unchanged
df['storage_type'] = df['memory'].str.strip().str.extract(r'(\w+)$', expand=False)

In [9]:
# Tag rows whose memory label names a storage interface. The markers are
# applied in order, so a label containing both ends up tagged with the later one.
memory_label = df['memory']
for controller in ('PCIe', 'NVMe'):
    df.loc[memory_label.str.contains(controller), 'storage_controller'] = controller

# Every piece of the memory label now lives in its own column.
df = df.drop(columns='memory')

In [10]:
# GPU - inspect the distinct labels before extracting make and model into separate columns
df['gpu'].drop_duplicates()

0     Intel Iris Xe Graphics
4        AMD Radeon RX 5600M
5    NVIDIA GeForce GTX 1650
Name: gpu, dtype: object

In [11]:
# Split each GPU label at its first space: the leading word is the make and
# everything after it is the model. The original gpu column is kept.
gpu_parts = df['gpu'].str.split(' ', n=1, expand=True)
df['gpu_make'] = gpu_parts[0]
df['gpu_model'] = gpu_parts[1]

In [12]:
# inspect the distinct screen resolution labels to decide how to map them to pixel dimensions
df['screenresolution'].drop_duplicates()

0            IPS Panel Retina Display 2560x1600
1                                       Full HD
3                                            2K
5                                 HD 1920x1080 
7     IPS Panel Full HD / Touchscreen 1920x1080
16                                           4K
Name: screenresolution, dtype: object

# Resolution notes
- 2K = 2048x1080 (DCI 2K)
- Full HD = 1920x1080
- 4K = 4096x2160 (DCI 4K; consumer "4K UHD" panels are usually 3840x2160)

In [13]:
# Single lookup table for every screen label seen in the data:
#   label -> (horizontal px, vertical px, is IPS panel, is touchscreen)
# Keeping all four attributes on one row stops the per-attribute mappings
# from drifting apart when a label is added or edited.
# NOTE(review): '2K' and '4K' use the 2048- and 4096-pixel-wide variants;
# confirm that is the intended reading of those labels for this dataset.
screen_specs = {
    'IPS Panel Retina Display 2560x1600': (2560, 1600, True, False),
    'Full HD': (1920, 1080, False, False),
    '2K': (2048, 1080, False, False),
    'HD 1920x1080': (1920, 1080, False, False),
    'IPS Panel Full HD / Touchscreen 1920x1080': (1920, 1080, True, True),
    '4K': (4096, 2160, False, False),
}
spec_columns = ['h_screen_resolution', 'v_screen_resolution', 'ips', 'touchscreen']

# The raw data contains a label with a trailing space ('HD 1920x1080 '), so
# normalise whitespace before the lookup instead of baking it into the keys.
screen_label = df['screenresolution'].str.strip()

# Fail loudly on labels the table does not cover rather than letting NaN
# leak into the derived columns.
unmapped = screen_label[~screen_label.isin(list(screen_specs))].unique().tolist()
if unmapped:
    raise ValueError(f'screenresolution values missing from screen_specs: {unmapped}')

# Fan the table out into one column per attribute.
for position, column in enumerate(spec_columns):
    df[column] = screen_label.map(
        {label: spec[position] for label, spec in screen_specs.items()}
    )

# The free-text label is fully represented by the derived columns.
df = df.drop(columns='screenresolution')

In [14]:
# preview the frame after the column transformations in the preceding cells
df.head()

Unnamed: 0,companyname,typeoflaptop,inches,cpu,gpu,opsys,weight,price,ram_gb,storage_gb,storage_type,storage_controller,gpu_make,gpu_model,h_screen_resolution,v_screen_resolution,ips,touchscreen
0,MSI,Business Laptop,17.04068,Intel Core i7,Intel Iris Xe Graphics,Linux,2.064834,35844.099371,12,512,SSD,,Intel,Iris Xe Graphics,2560,1600,True,False
1,Chuwi,2 in 1 Convertible,16.542395,Intel Core i5,Intel Iris Xe Graphics,No OS,4.060656,37019.059051,12,128,SSD,PCIe,Intel,Iris Xe Graphics,1920,1080,False,False
2,hp,WorkStation,17.295294,Intel Xeon E3-1505M,Intel Iris Xe Graphics,Linux,2.901689,33329.360341,8,1000,HDD,,Intel,Iris Xe Graphics,1920,1080,False,False
3,MSI,2 in 1 Convertible,11.526203,Intel Core i7,Intel Iris Xe Graphics,Windows 10,2.914843,68631.102486,16,512,SSD,NVMe,Intel,Iris Xe Graphics,2048,1080,False,False
4,Microsoft,Gaming,12.649634,Intel Core i5,AMD Radeon RX 5600M,Windows 10,4.341995,33842.479566,8,512,SSD,,AMD,Radeon RX 5600M,1920,1080,False,False
