# Data Cleaning

### ü™° Merge raw_data files into one dataframe

In [1]:
import pandas as pd

raw_data_files = [pd.read_csv('raw_data/cellphoneS.csv'),
                  pd.read_csv('raw_data/hoangHa1.csv'),
                  pd.read_csv('raw_data/hoangHa2.csv'),
                  pd.read_csv('raw_data/thegioididong.csv')]

data = pd.concat(raw_data_files, ignore_index=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1622 entries, 0 to 1621
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         1622 non-null   int64 
 1   Name               1622 non-null   object
 2   Screen size        875 non-null    object
 3   Screen type        1427 non-null   object
 4   Rear camera        360 non-null    object
 5   Front camera       359 non-null    object
 6   Chip               1364 non-null   object
 7   RAM                1466 non-null   object
 8   ROM                1454 non-null   object
 9   Batery             1336 non-null   object
 10  SIM                1357 non-null   object
 11  OS                 1375 non-null   object
 12  Screen resolution  1341 non-null   object
 13  Screen feature     188 non-null    object
 14  CPU type           222 non-null    object
 15  Weight             245 non-null    object
 16  Bluetooth          258 non-null    object


### ‚úÇÔ∏è Drop the unnecessary columns

In [2]:
data.drop(['Unnamed: 0', 'Name', 'SIM', 'Weight'], axis=1, inplace=True)

### üö´ Check missing values

In [3]:
data.isnull().sum()

Screen size           747
Screen type           195
Rear camera          1262
Front camera         1263
Chip                  258
RAM                   156
ROM                   168
Batery                286
OS                    247
Screen resolution     281
Screen feature       1434
CPU type             1400
Bluetooth            1364
Refesh rate          1448
Price                   9
Camera                539
Mobile network        676
Screen               1526
dtype: int64

### ‚úÇÔ∏è Remove rows with *'Price'* is *NaN*

In [4]:
data = data[data['Price'].notnull()]

### ‚úÇÔ∏è Remove rows with *'Mobile network'* has *2G*

In [5]:
data = data[data['Mobile network'].str.contains('2G') == False]

### ‚úÇÔ∏è Drop columns with more than 1200 missing values

In [6]:
data.drop(['Rear camera', 'Front camera', 'Screen feature', 'CPU type', 'Bluetooth', 'Refesh rate', 'Screen'], axis=1, inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 898 entries, 268 to 1524
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Screen size        401 non-null    object
 1   Screen type        898 non-null    object
 2   Chip               858 non-null    object
 3   RAM                898 non-null    object
 4   ROM                898 non-null    object
 5   Batery             770 non-null    object
 6   OS                 842 non-null    object
 7   Screen resolution  890 non-null    object
 8   Price              898 non-null    object
 9   Camera             890 non-null    object
 10  Mobile network     898 non-null    object
dtypes: object(11)
memory usage: 84.2+ KB


### üßπ ['Screen size']

In [8]:
data['Screen size'].unique()

array(['6.67"', nan, '6.1"', '6.6"', '6.8", T·∫ßn s·ªë qu√©t: 1 - 120 Hz',
       '6.4"', '6.43" - T·∫ßn s·ªë qu√©t 90 Hz', '6.67" - T·∫ßn s·ªë qu√©t 120 Hz',
       '6.5" - T·∫ßn s·ªë qu√©t 60 Hz', '5.4"', '6.8"', '6.43"', '6.5"',
       '6.67‚Äù', '6.52 inches, 102.6 cm2 (~81.2% screen-to-body ratio)',
       "Hole punch screen 6.43''", '6.56" - T·∫ßn s·ªë qu√©t 90 Hz', "6.4'",
       '4.7" - T·∫ßn s·ªë qu√©t H√£ng kh√¥ng c√¥ng b·ªë', '1.77"', '6,6 inch',
       '6.28 inches', '2.4"', '6.6" - T·∫ßn s·ªë qu√©t 60 Hz',
       '6.4" - T·∫ßn s·ªë qu√©t 90 Hz', '6.5‚Äù', '6.53"', '5"'], dtype=object)

In [9]:
import re

def filter_screen_size(string):
    string = str(string)
    string = string.strip()

    # if string is empty, return NaN
    if string == '':
        return None

    # replace all commas with dots
    string = string.replace(',', '.')

    # remove all inch characters
    string = string.replace('"', '')

    # find all string with format 'x.x'
    matched_values = re.findall('\d+\.\d+', string)

    # convert the first matched value to float
    if len(matched_values) > 0:
        return float(matched_values[0])

    return None

data['Screen size'] = data['Screen size'].apply(filter_screen_size)

data['Screen size'].unique()

array([6.67,  nan, 6.1 , 6.6 , 6.8 , 6.4 , 6.43, 6.5 , 5.4 , 6.52, 6.56,
       4.7 , 1.77, 6.28, 2.4 , 6.53])

### üßπ ['Screen type']

In [10]:
data['Screen type'].unique()

array(['AMOLED', 'Dynamic AMOLED 2X',
       'Dynamic AMOLED 2X, 10 - 120 Hz, Infinity O', 'Super AMOLED',
       'OLED', 'IPS LCD', 'PLS LCD', 'Thi·∫øt k·∫ø hi·ªán ƒë·∫°i m√†n h√¨nh l·ªó',
       '720 x 1600 pixels, 20:9 ratio (~269 ppi density)',
       'QQVGA, M√†u m√†n h√¨nh: 262K', 'IPS LCD, 90Hz',
       'AMOLED, 1 t·ª∑ m√†u, HDR10+, 120Hz, t·ª∑ l·ªá 20:9, 6.28 inches, Full HD+ (1080 x 2400 pixels), Corning Gorilla Glass Victus, C·∫£m bi·∫øn v√¢n tay trong m√†n h√¨nh, C·∫£m ·ª©ng ƒëi·ªán dung ƒëa ƒëi·ªÉm',
       'LCD', 'ƒêang c·∫≠p nh·∫≠t', 'TFT LCD',
       '20:9 m√†n h√¨nh gi·ªçt n∆∞·ªõc | T·ªëc ƒë·ªô l√†m t∆∞∆°i m√†n h√¨nh 90Hz'],
      dtype=object)

In [11]:
def filter_screen_type(string):
    string = str(string)
    
    # get all types of screen
    screen_types = ['OLED', 'LCD']
    
    if string in screen_types:
        return string

    # convert string to one of the screen types
    for screen_type in screen_types:
        if screen_type in string:
            return screen_type

    return None

data['Screen type'] = data['Screen type'].apply(filter_screen_type)

data['Screen type'].value_counts()

OLED    481
LCD     345
Name: Screen type, dtype: int64

### üßπ ['Chip']

In [12]:
data['Chip'].unique()

array(['Snapdragon 732G 8 nh√¢n', 'Snapdragon 888 8 nh√¢n',
       'Snapdragon¬Æ 8 Gen 1 (4nm)', 'MediaTek Helio G80 8 nh√¢n',
       'Apple A15 Bionic 6 nh√¢n', 'MediaTek MT6765 8 nh√¢n',
       'Snapdragon 680 8 nh√¢n', 'MediaTek Helio G96 8 nh√¢n',
       'Snapdragon 695 5G 8 nh√¢n', 'Apple A13 Bionic 6 nh√¢n',
       'Apple A14 Bionic 6 nh√¢n', 'MediaTek Helio G35 8 nh√¢n',
       'Snapdragon 662 8 nh√¢n', nan, 'Exynos 2100 8 nh√¢n',
       'MediaTek Helio G95 8 nh√¢n', 'Snapdragon 765G 8 nh√¢n',
       'Snapdragon 439 8 nh√¢n', 'MediaTek Helio G95',
       'MediaTek Dimensity 700 8 nh√¢n 5G',
       'MediaTek Dimensity 800U 5G 8 nh√¢n', 'Spreadtrum T610 8 nh√¢n',
       'MediaTek Helio G88 8 nh√¢n', 'Spreadtrum SC9863A 8 nh√¢n',
       'Snapdragon 778G 5G 8 nh√¢n', 'MediaTek Helio P35 8 nh√¢n',
       'Unisoc T606 8 nh√¢n', 'MediaTek Dimensity 810 5G 8 nh√¢n',
       'Unisoc T618 8 nh√¢n', 'Mediatek Helio G85, Antutu 205,950',
       'MediaTek MT6762D Helio A25 (12 nm)',
       'M

In [13]:
def filter_chip(string):
    string = str(string)

    # find all string with format 'Ax'
    chip_apple_values = re.findall('A\d+', string)

    # convert the first matched value to 'Apple'
    if len(chip_apple_values) > 0:
        string = 'Apple'

    # convert all to lower case
    string = string.lower()

    # get all types of chip
    chip_types = ['snapdragon', 'apple', 'mediatek', 'exynos']

    if string in chip_types:
        return string

    # convert string to one of the chip types
    for chip_type in chip_types:
        if chip_type in string:
            return chip_type

    return None

data['Chip'] = data['Chip'].apply(filter_chip)

data['Chip'].value_counts()

mediatek      249
snapdragon    225
apple         208
exynos         24
Name: Chip, dtype: int64

### üßπ ['RAM'] - ['ROM']

In [14]:
data['RAM'].unique()

array(['8 GB', '8GB', '12GB', '12 GB', '6 GB', '4 GB', '3 GB', '2 GB',
       '4GB', '48MB', '4 GB ~ 7GB ( Bao g·ªìm c·∫£ RAM ·∫£o)', '64 MB', '48 MB',
       '16 MB', '6GB', 'Kh√¥ng', 'ƒêang c·∫≠p nh·∫≠t'], dtype=object)

In [15]:
data['ROM'].unique()

array(['128 GB', '128GB', '256GB', '512GB', '256 GB', '512 GB', '1 TB',
       '32 GB', '64 GB', '64GB', '128MB', '128 MB', '24 MB', 'Kh√¥ng',
       '4 GB', '16 GB'], dtype=object)

In [16]:
def filter_ram_rom(string):
    string = str(string)
    
    # remove all spaces
    string = string.replace(' ', '')

    # find all string with format 'xGB' or 'xMB' or 'xTB
    regex = re.compile('\d+GB|\d+MB|\d+TB')
    number = regex.findall(string)

    # convert TB, GB to MB and convert to int
    if len(number) > 0:
        if 'TB' in number[0]:
            return int(number[0].replace('TB', '')) * 1024 * 1024
        elif 'GB' in number[0]:
            return int(number[0].replace('GB', '')) * 1024
        elif 'MB' in number[0]:
            return int(number[0].replace('MB', ''))

    return None

data['RAM'] = data['RAM'].apply(filter_ram_rom)
data['ROM'] = data['ROM'].apply(filter_ram_rom)

In [17]:
data['RAM'].unique()

array([ 8192., 12288.,  6144.,  4096.,  3072.,  2048.,    48.,    64.,
          16.,    nan])

In [18]:
data['ROM'].unique()

array([1.310720e+05, 2.621440e+05, 5.242880e+05, 1.048576e+06,
       3.276800e+04, 6.553600e+04, 1.280000e+02, 2.400000e+01,
                nan, 4.096000e+03, 1.638400e+04])

### üßπ ['Battery']

In [19]:
data['Batery'].unique()

array(['5020 mAh', '3300 mAh',
       '3700 mAh, H·ªó tr·ª£ S·∫°c t·ªëi ƒëa: 25W, S·∫°c k√®m m√°y: Kh√¥ng c√≥',
       '3700 mAh',
       '4500 mAh, C√¥ng su·∫•t h·ªó tr·ª£: 45W, S·∫°c k√®m m√°y: Kh√¥ng c√≥',
       '5000 mAh', '4400 mAh', nan, '4230 mAh', '3110 mAh', '2815 mAh',
       '2227 mAh', '2950 mAh', '4300 mAh, S·∫°c si√™u nhanh VOOC2.0 65W',
       '4000 mAh', '4500 mAh', '4310 mAh', '6000 mAh', '4050 mAh',
       '5000mAh', '5000mAH', '4200 mAh', '4250 mAh', '4500 mAh - 65W',
       '1000mAh', '5.000 mAh',
       'Li-Po 4500mAh, S·∫°c nhanh 67W, 100% trong 39ph (qu·∫£ng c√°o), S·∫°c nhanh kh√¥ng d√¢y 50W, 100% trong 53ph (qu·∫£ng c√°o), S·∫°c ng∆∞·ª£c kh√¥ng d√¢y 10W, Power Delivery 3.0, Quick Charge 4+',
       '1900 mAh', '5050 mAh', '1000 mAh', '2000 mAh',
       '5000 mAh, 25 W, S·∫°c pin nhanh',
       '5050 mAh4, S·∫°c T∆∞∆°ng thiÃÅch s·∫°c nhanh 18W chu·∫©n USB PD 3.0 (b·ªô s·∫°c ƒëi k√®m trong h·ªôp m√°y 10W)',
       '1150 mAh'], dtype=object)

In [20]:
def filter_battery(string):
    string = str(string)
    
    # remove all spaces
    string = string.replace(' ', '')

    # remove all commas
    string = string.replace(',', '')

    # remove all dots
    string = string.replace('.', '')

    # find all string with format 'xmAh'
    regex = re.compile('\d+mAh')
    number = regex.findall(string)

    # get the first number and convert to int
    if len(number) > 0:
        return int(number[0].replace('mAh', ''))
    
    return None

data['Batery'] = data['Batery'].apply(filter_battery)

data['Batery'].unique()

array([5020., 3300., 3700., 4500., 5000., 4400.,   nan, 4230., 3110.,
       2815., 2227., 2950., 4300., 4000., 4310., 6000., 4050., 4200.,
       4250., 1000., 1900., 5050., 2000., 1150.])

### üßπ ['OS']

In [21]:
data['OS'].unique()

array(['Android 11', 'Android 12', 'iOS 15', 'iOS 14',
       'Android 11 (Go Edition)', nan, 'BOS - based on Android 11',
       'Android 10 (Go edition), HIOS 6.2', 'ColorOS 12', 'IOS 15',
       'Kh√¥ng c√≥', 'Android 10'], dtype=object)

In [22]:
def filter_os(string):
    string = str(string)

    # covert all to lower case
    string = string.lower()
    
    # get all types of os
    os_types = ['android', 'ios']
    
    if string in os_types:
        return string

    for os_type in os_types:
        if os_type in string:
            return os_type

    return None

data['OS'] = data['OS'].apply(filter_os)

data['OS'].value_counts()

android    618
ios        192
Name: OS, dtype: int64

### üßπ ['Screen resolution']

In [23]:
data['Screen resolution'].unique()

array(['Full HD+ (1080 x 2400 Pixels)', 'Full HD+ (1080 x 2640 Pixels)',
       '2340 x 1080', '3088 x 1440', 'Full HD+ (1768 x 2208 Pixels)',
       '1170 x 2532 Pixels', 'HD+ (720 x 1600 Pixels)',
       '1284 x 2778 Pixels', '828 x 1792 Pixels',
       'HD+ (700 x 1600 Pixels)', 'Full HD+ (1080 x 2340 Pixels)',
       '2K+ (1440 x 3200 Pixels)', 'Full HD+ (1080 x 2408 Pixel)',
       'ƒêang c√¢Ã£p nh√¢Ã£t', 'FHD', 'Full HD+ (1080 x 2408 Pixels)', nan,
       '6,44" FHD+', 'Full HD+ (1080 x 2412 Pixels)',
       '2400 x 1080 FHD+, 90Hz AMOLED', 'HD+ (720 x 1612 Pixels)',
       '2400X1080', 'HD (750 x 1334 Pixels)', '128*160',
       '720 x 1612 pixel', '1080 x 2400 pixels',
       'QVGA (240 x 320 Pixels)', '128 x 160 Pixels', '176 x 220 Pixels',
       'HD+', 'HD (720 x 1280 Pixels)'], dtype=object)

In [24]:
def filter_screen_resolution(string):
    string = str(string)

    # remove all spaces
    string = string.replace(' ', '')

    # find all string with format 'axb'
    regex = re.compile('\d+x\d+')
    number = regex.findall(string)

    # multiply the first number by the second number and convert to int
    if len(number) > 0:
        return int(number[0].split('x')[0]) * int(number[0].split('x')[1])

    return None

data['Screen resolution'] = data['Screen resolution'].apply(filter_screen_resolution)

data['Screen resolution'].unique()

array([2592000., 2851200., 2527200., 4446720., 3903744., 2962440.,
       1152000., 3566952., 1483776., 1120000., 4608000., 2600640.,
            nan, 2604960., 1160640., 1000500.,   76800.,   20480.,
         38720.,  921600.])

### üßπ ['Camera']

In [25]:
data['Camera'].unique()

array(['Ch√≠nh 108 MP & Ph·ª• 8 MP, 5 MP, 2 MP, 16 MP',
       '2 camera 12 MP, 10 MP',
       '12MP (UW) + 50MP (W) + 10MP (Tele), 10MP',
       '12MP (UW) + 108MP (W) + 12MP (Tele3x) + 12MP (Tele10x), 40MP',
       '3 camera 12 MP, 10 MP & 4 MP',
       'Ch√≠nh 64 MP & Ph·ª• 8 MP, 5MP, 5MP, 20 MP', '2 camera 12 MP, 12 MP',
       '3 camera 12 MP, 12 MP', '13 MP, 13 MP',
       'Ch√≠nh 50  MP & Ph·ª• 8 MP, 2 MP, 2 MP, 13 MP',
       'Ch√≠nh 108 MP & Ph·ª• 8 MP, 2 MP, 2 MP, 16 MP',
       'Ch√≠nh 108 MP & Ph·ª• 8 MP, 2 MP, 16 MP',
       'Ch√≠nh 50 MP & Ph·ª• 2 MP, 2 MP, 16 MP',
       'Ch√≠nh 48 MP & Ph·ª• 2 MP, 2 MP, 16 MP', '5 MP, 5 MP',
       'Ch√≠nh 108 MP & Ph·ª• 12 MP, 10 MP, 10 MP, 40 MP',
       'Ch√≠nh 64 MP & Ph·ª• 8 MP, 2 MP, 2 MP, 13 MP',
       'Ch√≠nh 64 MP & Ph·ª• 8 MP, 2 MP, 2 MP, 32 MP',
       'Ch√≠nh 13 MP & Ph·ª• 2 MP, 8 MP',
       'Ch√≠nh 64 MP & Ph·ª• 8 MP, 2 MP, 2 MP, 16 MP',
       'Ch√≠nh 64 MP & Ph·ª• 8 MP, 2 MP, 16 MP',
       'Ch√≠nh 64 MP & Ph·ª• 8 MP, 2

In [26]:
def filter_camera_count(string):
    string = str(string)

    # count the number of 'MP' in the string
    return string.count('MP')

data['Camera count'] = data['Camera'].apply(filter_camera_count)

data['Camera count'].unique()

array([5, 2, 4, 3, 1, 0], dtype=int64)

In [27]:
def filter_camera_max_mp(string):
    string = str(string)

    # remove all spaces
    string = string.replace(' ', '')

    # find all string with format 'xMP'
    regex = re.compile('\d+MP')
    matched_values = regex.findall(string)

    # remove 'MP' from matched values, convert to int and return the max number
    if len(matched_values) > 0:
        return max(map(lambda x: int(x.replace('MP', '')), matched_values))

    return None

data['Camera max MP'] = data['Camera'].apply(filter_camera_max_mp)

data['Camera max MP'].unique()

array([108.,  12.,  50.,  64.,  13.,  48.,   5.,  32.,  16.,  nan,   8.,
         3.])

In [28]:
# drop the column 'Camera'
data.drop(columns=['Camera'], inplace=True)

### üßπ ['Mobile network']

In [29]:
data['Mobile network'].unique()

array(['H·ªó tr·ª£ 4G', 'H·ªó tr·ª£ 5G', '5G', '4GB', '4G',
       '"GSM: 900/1800MHz  WCMA: 900/2100MHz, 4G-LTE: B1/3/7/ 8/20"',
       '1, 2, 3, 4, 5, 7, 8, 12, 17, 18, 19, 26, 34, 38, 39, 40, 41, 42',
       'H·ªó tr·ª£ 4G VoLTE',
       'T·ªëc ƒë·ªô m·∫°ng, GSM: 850, 900, 1800, 1900, WCDMA: 1, 5, 8, LTE FDD: 1, 3, 5, 7, 8, 20, 28, 38, 40, 41 (full), LTE CAT4'],
      dtype=object)

In [30]:
def filter_mobile_network(string):
    string = str(string)

    # find all string with format 'xG'
    regex = re.compile('\d+G')
    matched_values = regex.findall(string)

    # return the first matched value and remove 'G'
    if len(matched_values) > 0:
        return int(matched_values[0].replace('G', ''))

    return None

data['Mobile network'] = data['Mobile network'].apply(filter_mobile_network)

data['Mobile network'].value_counts()

4.0    466
5.0    416
Name: Mobile network, dtype: int64

### üßπ ['Price']

In [31]:
data['Price'].unique()

array(['5,990,000 ‚Ç´', '20,990,000 ‚Ç´', '16,990,000 ‚Ç´', '18,490,000 ‚Ç´',
       '22,490,000 ‚Ç´', '25,990,000 ‚Ç´', '28,990,000 ‚Ç´', '31,990,000 ‚Ç´',
       '18,690,000 ‚Ç´', '32,890,000 ‚Ç´', '35,790,000 ‚Ç´', '20,590,000 ‚Ç´',
       '5,550,000 ‚Ç´', '20,490,000 ‚Ç´', '23,690,000 ‚Ç´', '25,980,000 ‚Ç´',
       '26,750,000 ‚Ç´', '29,790,000 ‚Ç´', '33,890,000 ‚Ç´', '39,490,000 ‚Ç´',
       '3,150,000 ‚Ç´', '4,490,000 ‚Ç´', '5,790,000 ‚Ç´', '7,790,000 ‚Ç´',
       '6,750,000 ‚Ç´', '28,950,000 ‚Ç´', '31,750,000 ‚Ç´', '36,990,000 ‚Ç´',
       '42,890,000 ‚Ç´', '11,390,000 ‚Ç´', '13,090,000 ‚Ç´', '17,590,000 ‚Ç´',
       '22,390,000 ‚Ç´', '15,890,000 ‚Ç´', '4,150,000 ‚Ç´', '14,990,000 ‚Ç´',
       '15,990,000 ‚Ç´', '19,950,000 ‚Ç´', '1,980,000 ‚Ç´', '19,590,000 ‚Ç´',
       '5,090,000 ‚Ç´', '18,640,000 ‚Ç´', '8,990,000 ‚Ç´', '2,790,000 ‚Ç´',
       '5,890,000 ‚Ç´', '5,290,000 ‚Ç´', '7,990,000 ‚Ç´', '18,750,000 ‚Ç´',
       '6,590,000 ‚Ç´', '3,590,000 ‚Ç´', '3,890,000 ‚Ç´', '2,680,00

In [32]:
def filter_price(string):
    string = str(string)
    string = string.strip()

    string = string.replace('.', '')
    string = string.replace(',', '')
    string = string.replace('000\xa0‚Ç´', '')
    string = string.replace('000‚Ç´', '')
    string = string.replace('000 ‚Ç´', '')
    string = string.replace('000 *', '')
    string = string.replace(' *', '')

    return int(string)

data['Price'] = data['Price'].apply(filter_price)

data['Price'].unique()

array([ 5990, 20990, 16990, 18490, 22490, 25990, 28990, 31990, 18690,
       32890, 35790, 20590,  5550, 20490, 23690, 25980, 26750, 29790,
       33890, 39490,  3150,  4490,  5790,  7790,  6750, 28950, 31750,
       36990, 42890, 11390, 13090, 17590, 22390, 15890,  4150, 14990,
       15990, 19950,  1980, 19590,  5090, 18640,  8990,  2790,  5890,
        5290,  7990, 18750,  6590,  3590,  3890,  2680,  7690, 21490,
       13390,  3750,  3490,  4650,  2950,  6990,  2650,  5190,  3950,
        2280,  3790,  2490, 10990,  5350,  7490,  8890, 12490, 14490,
         450,  2690, 15590,   650,  2890,   470,   690,  3990,  6850,
        8190,  3580,  9890,  8590, 11990, 26790,   850,  2450,  2090,
        2640,  3090,   890,   790], dtype=int64)

### üîñ Data after converting to the correct data type

In [33]:
data

Unnamed: 0,Screen size,Screen type,Chip,RAM,ROM,Batery,OS,Screen resolution,Price,Mobile network,Camera count,Camera max MP
268,6.67,OLED,snapdragon,8192.0,131072.0,5020.0,android,2592000.0,5990,4.0,5,108.0
269,,OLED,snapdragon,8192.0,131072.0,3300.0,android,2851200.0,20990,5.0,2,12.0
270,6.10,OLED,snapdragon,8192.0,131072.0,3700.0,android,2527200.0,16990,5.0,4,50.0
271,6.10,OLED,snapdragon,8192.0,262144.0,3700.0,android,2527200.0,18490,5.0,4,50.0
272,6.60,OLED,snapdragon,8192.0,262144.0,4500.0,android,2527200.0,20990,5.0,4,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1516,6.53,LCD,mediatek,2048.0,32768.0,5000.0,android,1120000.0,2090,4.0,2,13.0
1517,6.53,LCD,mediatek,3072.0,65536.0,5000.0,android,1152000.0,2640,4.0,4,13.0
1518,6.53,LCD,mediatek,4096.0,131072.0,5000.0,android,1152000.0,2890,4.0,4,13.0
1523,2.40,LCD,,,4096.0,1900.0,,76800.0,890,4.0,1,3.0


### üö´ Fill missing values

In [34]:
data.isnull().sum()

Screen size          505
Screen type           72
Chip                 192
RAM                   16
ROM                    8
Batery               136
OS                    88
Screen resolution     72
Price                  0
Mobile network        16
Camera count           0
Camera max MP         32
dtype: int64

##### Mean inputation

In [35]:
def mean_imputation(data, column):
  data[column] = data[column].fillna(data[column].mean())
  return data

mean_imputation(data, 'Screen size')
mean_imputation(data, 'RAM')
mean_imputation(data, 'ROM')
mean_imputation(data, 'Batery')
mean_imputation(data, 'Screen resolution')
mean_imputation(data, 'Camera max MP')
mean_imputation(data, 'Mobile network')

Unnamed: 0,Screen size,Screen type,Chip,RAM,ROM,Batery,OS,Screen resolution,Price,Mobile network,Camera count,Camera max MP
268,6.670000,OLED,snapdragon,8192.000000,131072.0,5020.0,android,2592000.0,5990,4.0,5,108.0
269,6.001272,OLED,snapdragon,8192.000000,131072.0,3300.0,android,2851200.0,20990,5.0,2,12.0
270,6.100000,OLED,snapdragon,8192.000000,131072.0,3700.0,android,2527200.0,16990,5.0,4,50.0
271,6.100000,OLED,snapdragon,8192.000000,262144.0,3700.0,android,2527200.0,18490,5.0,4,50.0
272,6.600000,OLED,snapdragon,8192.000000,262144.0,4500.0,android,2527200.0,20990,5.0,4,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1516,6.530000,LCD,mediatek,2048.000000,32768.0,5000.0,android,1120000.0,2090,4.0,2,13.0
1517,6.530000,LCD,mediatek,3072.000000,65536.0,5000.0,android,1152000.0,2640,4.0,4,13.0
1518,6.530000,LCD,mediatek,4096.000000,131072.0,5000.0,android,1152000.0,2890,4.0,4,13.0
1523,2.400000,LCD,,5681.197279,4096.0,1900.0,,76800.0,890,4.0,1,3.0


##### Arbitrary imputation 

In [36]:
import numpy as np

def arbitrary_imputation(data, column, arbitrary_list):
    data[column] = data[column].fillna(arbitrary_list[np.random.randint(0, len(arbitrary_list))])
    return data

screen_type_list = data['Screen type'].dropna().unique().tolist()
arbitrary_imputation(data, 'Screen type', screen_type_list)

os_list = data['OS'].dropna().unique().tolist()
arbitrary_imputation(data, 'OS', os_list)

Unnamed: 0,Screen size,Screen type,Chip,RAM,ROM,Batery,OS,Screen resolution,Price,Mobile network,Camera count,Camera max MP
268,6.670000,OLED,snapdragon,8192.000000,131072.0,5020.0,android,2592000.0,5990,4.0,5,108.0
269,6.001272,OLED,snapdragon,8192.000000,131072.0,3300.0,android,2851200.0,20990,5.0,2,12.0
270,6.100000,OLED,snapdragon,8192.000000,131072.0,3700.0,android,2527200.0,16990,5.0,4,50.0
271,6.100000,OLED,snapdragon,8192.000000,262144.0,3700.0,android,2527200.0,18490,5.0,4,50.0
272,6.600000,OLED,snapdragon,8192.000000,262144.0,4500.0,android,2527200.0,20990,5.0,4,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1516,6.530000,LCD,mediatek,2048.000000,32768.0,5000.0,android,1120000.0,2090,4.0,2,13.0
1517,6.530000,LCD,mediatek,3072.000000,65536.0,5000.0,android,1152000.0,2640,4.0,4,13.0
1518,6.530000,LCD,mediatek,4096.000000,131072.0,5000.0,android,1152000.0,2890,4.0,4,13.0
1523,2.400000,LCD,,5681.197279,4096.0,1900.0,ios,76800.0,890,4.0,1,3.0


##### If OS is 'ios', then Chip is 'apple'

In [37]:
def chip_with_os_ios(data):
    data.loc[data['OS'] == 'ios', 'Chip'] = 'apple'

chip_with_os_ios(data)

chip_list = data['Chip'].dropna().unique().tolist()
chip_list = list(filter(lambda x: x != 'apple', chip_list))
arbitrary_imputation(data, 'Chip', chip_list)

Unnamed: 0,Screen size,Screen type,Chip,RAM,ROM,Batery,OS,Screen resolution,Price,Mobile network,Camera count,Camera max MP
268,6.670000,OLED,snapdragon,8192.000000,131072.0,5020.0,android,2592000.0,5990,4.0,5,108.0
269,6.001272,OLED,snapdragon,8192.000000,131072.0,3300.0,android,2851200.0,20990,5.0,2,12.0
270,6.100000,OLED,snapdragon,8192.000000,131072.0,3700.0,android,2527200.0,16990,5.0,4,50.0
271,6.100000,OLED,snapdragon,8192.000000,262144.0,3700.0,android,2527200.0,18490,5.0,4,50.0
272,6.600000,OLED,snapdragon,8192.000000,262144.0,4500.0,android,2527200.0,20990,5.0,4,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1516,6.530000,LCD,mediatek,2048.000000,32768.0,5000.0,android,1120000.0,2090,4.0,2,13.0
1517,6.530000,LCD,mediatek,3072.000000,65536.0,5000.0,android,1152000.0,2640,4.0,4,13.0
1518,6.530000,LCD,mediatek,4096.000000,131072.0,5000.0,android,1152000.0,2890,4.0,4,13.0
1523,2.400000,LCD,apple,5681.197279,4096.0,1900.0,ios,76800.0,890,4.0,1,3.0


### üìù Export data to csv

In [38]:
data.to_csv('clean_data/clean_data.csv', index=False)