# Data Cleaning with Pandas

In [1]:
import pandas as pd

In [2]:
laptops = pd.read_csv('https://raw.githubusercontent.com/aaronsang/guided-data-science-projects/master/data/laptops.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 4: invalid continuation byte

Encodings are systems for representing characters in binary. This data set needs to be encoded using 'Latin-1'.

In [3]:
laptops = pd.read_csv('https://raw.githubusercontent.com/aaronsang/guided-data-science-projects/master/data/laptops.csv', encoding='Latin-1')

In [4]:
print(laptops.head())

Manufacturer   Model Name   Category Screen Size  \
0        Apple  MacBook Pro  Ultrabook       13.3"   
1        Apple  Macbook Air  Ultrabook       13.3"   
2           HP       250 G6   Notebook       15.6"   
3        Apple  MacBook Pro  Ultrabook       15.4"   
4        Apple  MacBook Pro  Ultrabook       13.3"   

                               Screen                         CPU   RAM  \
0  IPS Panel Retina Display 2560x1600        Intel Core i5 2.3GHz   8GB   
1                            1440x900        Intel Core i5 1.8GHz   8GB   
2                   Full HD 1920x1080  Intel Core i5 7200U 2.5GHz   8GB   
3  IPS Panel Retina Display 2880x1800        Intel Core i7 2.7GHz  16GB   
4  IPS Panel Retina Display 2560x1600        Intel Core i5 3.1GHz   8GB   

               Storage                           GPU Operating System  \
0            128GB SSD  Intel Iris Plus Graphics 640            macOS   
1  128GB Flash Storage        Intel HD Graphics 6000            macOS   
2      

In [5]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [6]:
def column_cleaner(a_string):# Tidies up a column name
    a_string = a_string.strip()
    a_string = a_string.replace('Operating System', 'os')
    a_string = a_string.replace(" ", "_")
    a_string = a_string.replace("(","")
    a_string = a_string.replace(")","")
    a_string = a_string.lower()
    return a_string

new_columns = []
for column in laptops.columns:
    new_col = column_cleaner(column)
    new_columns.append(new_col)

laptops.columns = new_columns
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [7]:
laptops['screen_size'].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [8]:
laptops['ram'].unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)

In [9]:
laptops['ram'] = laptops['ram'].str.replace("GB","")
laptops['ram'].unique()

array(['8', '16', '4', '2', '12', '6', '32', '24', '64'], dtype=object)

In [10]:
laptops['ram'] = laptops['ram'].astype(int)
laptops['ram'].dtype

dtype('int32')

In [11]:
laptops.rename({'screen_size':'screen_size_inches'}, axis=1, inplace=True)

In [12]:
laptops.rename({'ram':'ram_gb'}, axis=1, inplace=True)

In [13]:
laptops['ram_gb'].describe()

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

In [14]:
laptops['gpu'].head()

0    Intel Iris Plus Graphics 640
1          Intel HD Graphics 6000
2           Intel HD Graphics 620
3              AMD Radeon Pro 455
4    Intel Iris Plus Graphics 650
Name: gpu, dtype: object

In [15]:
laptops['gpu'] = laptops['gpu'].str.split().str[0]
laptops['gpu'].value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu, dtype: int64

In [16]:
laptops['cpu'].head()

0          Intel Core i5 2.3GHz
1          Intel Core i5 1.8GHz
2    Intel Core i5 7200U 2.5GHz
3          Intel Core i7 2.7GHz
4          Intel Core i5 3.1GHz
Name: cpu, dtype: object

In [17]:
laptops['cpu_manufacturer'] = laptops['cpu'].str.split().str[0]
laptops['cpu_manufacturer'].value_counts()

Intel      1240
AMD          62
Samsung       1
Name: cpu_manufacturer, dtype: int64

In [18]:
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size_inches',
       'screen', 'cpu', 'ram_gb', 'storage', 'gpu', 'os', 'os_version',
       'weight', 'price_euros', 'cpu_manufacturer'],
      dtype='object')

In [19]:
laptops['os'].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [20]:
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}

laptops['os'] = laptops['os'].map(mapping_dict)
laptops['os'].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: os, dtype: int64

In [21]:
laptops.isnull().sum()

manufacturer            0
model_name              0
category                0
screen_size_inches      0
screen                  0
cpu                     0
ram_gb                  0
storage                 0
gpu                     0
os                      0
os_version            170
weight                  0
price_euros             0
cpu_manufacturer        0
dtype: int64

In [22]:
laptops['os_version'].value_counts(dropna=False)

10      1072
NaN      170
7         45
X          8
10 S       8
Name: os_version, dtype: int64

In [23]:
# View the os value counts for rows where the os version is NaN.
laptops.loc[laptops['os_version'].isnull(), 'os'].value_counts()

No OS        66
Linux        62
Chrome OS    27
macOS        13
Android       2
Name: os, dtype: int64

In [24]:
# Assign all macOS os to "X" os version
# Assign "Version Unknown" to "No OS"
laptops.loc[laptops['os'] == 'macOS', 'os_version'] = "X"
laptops.loc[laptops['os'] == 'No OS', 'os_version'] = 'Version Unknown'

In [25]:
laptops['os_version'].value_counts(dropna=False)

10                 1072
NaN                  91
Version Unknown      66
7                    45
X                    21
10 S                  8
Name: os_version, dtype: int64

In [27]:
laptops['weight'] = laptops['weight'].str.replace('kg',"").str.replace("s","").astype(float)
laptops.rename({'weight':'weight_kg'},axis=1,inplace=True)

In [28]:
laptops['weight_kg']

0       1.37
1       1.34
2       1.86
3       1.83
4       1.37
        ... 
1298    1.80
1299    1.30
1300    1.50
1301    2.19
1302    2.20
Name: weight_kg, Length: 1303, dtype: float64

In [29]:
laptops['price_euros'].head()

0    1339,69
1     898,94
2     575,00
3    2537,45
4    1803,60
Name: price_euros, dtype: object

In [30]:
# Remove comma then convert into float
laptops['price_euros'] = laptops['price_euros'].str.replace(",","").astype(int)

In [31]:
laptops['price_euros'].head()

0    133969
1     89894
2     57500
3    253745
4    180360
Name: price_euros, dtype: int32

In [32]:
laptops['screen'].value_counts()

Full HD 1920x1080                                507
1366x768                                         281
IPS Panel Full HD 1920x1080                      230
IPS Panel Full HD / Touchscreen 1920x1080         53
Full HD / Touchscreen 1920x1080                   47
1600x900                                          23
Touchscreen 1366x768                              16
Quad HD+ / Touchscreen 3200x1800                  15
IPS Panel 4K Ultra HD 3840x2160                   12
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     11
4K Ultra HD / Touchscreen 3840x2160               10
IPS Panel 1366x768                                 7
4K Ultra HD 3840x2160                              7
Touchscreen 2560x1440                              7
IPS Panel Retina Display 2304x1440                 6
IPS Panel Retina Display 2560x1600                 6
Touchscreen 2256x1504                              6
IPS Panel Quad HD+ / Touchscreen 3200x1800         6
IPS Panel Touchscreen 2560x1440               

In [41]:
laptops.iloc[1299]

manufacturer                                              Lenovo
model_name                                        Yoga 900-13ISK
category                                      2 in 1 Convertible
screen_size_inches                                         13.3"
screen                IPS Panel Quad HD+ / Touchscreen 3200x1800
cpu                                   Intel Core i7 6500U 2.5GHz
ram_gb                                                        16
storage                                                512GB SSD
gpu                                                        Intel
os                                                       Windows
os_version                                                    10
weight_kg                                                    1.3
price_euros                                               149900
cpu_manufacturer                                           Intel
Name: 1299, dtype: object

In [46]:
laptops['screen_resolutions'] = laptops['screen'].str.extract(r'(\d{3,5}x\d{3,5})')

In [50]:
laptops['screen_resolutions'].value_counts(dropna=False)

1920x1080    841
1366x768     308
3840x2160     43
3200x1800     27
2560x1440     23
1600x900      23
2560x1600      6
2304x1440      6
2256x1504      6
1920x1200      5
2400x1600      4
1440x900       4
2880x1800      4
2160x1440      2
2736x1824      1
Name: screen_resolutions, dtype: int64

In [52]:
laptops['cpu'].head()

0          Intel Core i5 2.3GHz
1          Intel Core i5 1.8GHz
2    Intel Core i5 7200U 2.5GHz
3          Intel Core i7 2.7GHz
4          Intel Core i5 3.1GHz
Name: cpu, dtype: object

In [57]:
laptops['cpu_speed'] = laptops['cpu'].str.extract(r'(\d.\d\D\D\D)')

In [58]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram_gb,storage,gpu,os,os_version,weight_kg,price_euros,cpu_manufacturer,screen_resolutions,cpu_speed
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel,macOS,X,1.37,133969,Intel,2560x1600,2.3GHz
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel,macOS,X,1.34,89894,Intel,1440x900,1.8GHz
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel,No OS,Version Unknown,1.86,57500,Intel,1920x1080,2.5GHz
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD,macOS,X,1.83,253745,Intel,2880x1800,2.7GHz
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel,macOS,X,1.37,180360,Intel,2560x1600,3.1GHz
