In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
pd.options.mode.chained_assignment = None

In [3]:
df = pd.read_json("scout_car.json", lines = True)

In [4]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_info_columns', 500)
pd.set_option('display.max_info_rows', 500)
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.width', 2000)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 54 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   url                            object 
 1   make_model                     object 
 2   short_description              object 
 3   body_type                      object 
 4   price                          int64  
 5   vat                            object 
 6   km                             object 
 7   registration                   object 
 8   prev_owner                     object 
 9   kW                             float64
 10  hp                             object 
 11  Type                           object 
 12  Previous Owners                object 
 13  Next Inspection                object 
 14  Inspection new                 object 
 15  Warranty                       object 
 16  Full Service                   object 
 17  Non-smoking Vehicle            object 
 18  null  

In [6]:
missing_vals = (df.isnull().sum()/df.shape[0])*100
missing_vals

url                                0.000000
make_model                         0.000000
short_description                  0.288963
body_type                          0.376908
price                              0.000000
vat                               28.349771
km                                 0.000000
registration                       0.000000
prev_owner                        42.892141
kW                               100.000000
hp                                 0.000000
Type                               0.012564
Previous Owners                   41.711163
Next Inspection                   77.793831
Inspection new                    75.299956
Warranty                          34.047365
Full Service                      48.395000
Non-smoking Vehicle               54.915510
null                               0.000000
Make                               0.000000
Model                              0.000000
Offer Number                      19.944720
First Registration              

In [7]:
def show_nan(dataframe, limit):
    miss = (df.isnull().sum()/df.shape[0])*100
    return miss[miss>limit]

In [8]:
show_nan(df, 35)

prev_owner                        42.892141
kW                               100.000000
Previous Owners                   41.711163
Next Inspection                   77.793831
Inspection new                    75.299956
Full Service                      48.395000
Non-smoking Vehicle               54.915510
Paint Type                        36.258559
Model Code                        68.729192
Cylinders                         35.680633
Weight                            43.809285
Drive chain                       43.080596
Emission Label                    74.967021
Country version                   52.346253
Electricity consumption           99.139393
Last Service Date                 96.444500
Other Fuel Types                  94.472015
Availability                      96.011056
Last Timing Belt Service Date     99.899491
Available from                    98.291350
dtype: float64

In [9]:
df.columns

Index(['url', 'make_model', 'short_description', 'body_type', 'price', 'vat', 'km', 'registration', 'prev_owner', 'kW', 'hp', 'Type', 'Previous Owners', 'Next Inspection', 'Inspection new', 'Warranty', 'Full Service', 'Non-smoking Vehicle', 'null', 'Make', 'Model', 'Offer Number', 'First Registration', 'Body Color', 'Paint Type', 'Body Color Original', 'Upholstery', 'Body', 'Nr. of Doors', 'Nr. of Seats', 'Model Code', 'Gearing Type', 'Displacement', 'Cylinders', 'Weight', 'Drive chain', 'Fuel', 'Consumption', 'CO2 Emission', 'Emission Class', '\nComfort & Convenience\n', '\nEntertainment & Media\n', '\nExtras\n', '\nSafety & Security\n', 'description', 'Emission Label', 'Gears', 'Country version', 'Electricity consumption', 'Last Service Date', 'Other Fuel Types', 'Availability', 'Last Timing Belt Service Date', 'Available from'], dtype='object')

In [10]:
col_nm = ['url',
 'make_model',
 'short_description',
 'body_type',
 'price',
 'vat',
 'km',
 'registration',
 'prev_owner',
 'kw',
 'hp',
 'type',
 'previous_owners',
 'next_inspection',
 'inspection_new',
 'warranty',
 'full_service',
 'non-smoking_vehicle',
 'null',
 'make',
 'model',
 'offer_number',
 'first_registration',
 'body_color',
 'paint_type',
 'body_color_original',
 'upholstery',
 'body',
 'nr_of_doors',
 'nr_of_seats',
 'model_code',
 'gearing_type',
 'displacement',
 'cylinders',
 'weight',
 'drive_chain',
 'fuel',
 'consumption',
 'co2_emission',
 'emission_class',
 'comfort&convenience',
 'entertainment&media',
 'extras',
 'safety&security',
 'description',
 'emission_label',
 'gears',
 'country_version',
 'electricity_consumption',
 'last_service_date',
 'other_fuel_types',
 'availability',
 'last_timing_belt_service_date',
 'available_from']

In [11]:
len(list(df.columns))

54

In [12]:
df.set_axis(col_nm, axis = 1, inplace = True)

In [13]:
df.columns

Index(['url', 'make_model', 'short_description', 'body_type', 'price', 'vat', 'km', 'registration', 'prev_owner', 'kw', 'hp', 'type', 'previous_owners', 'next_inspection', 'inspection_new', 'warranty', 'full_service', 'non-smoking_vehicle', 'null', 'make', 'model', 'offer_number', 'first_registration', 'body_color', 'paint_type', 'body_color_original', 'upholstery', 'body', 'nr_of_doors', 'nr_of_seats', 'model_code', 'gearing_type', 'displacement', 'cylinders', 'weight', 'drive_chain', 'fuel', 'consumption', 'co2_emission', 'emission_class', 'comfort&convenience', 'entertainment&media', 'extras', 'safety&security', 'description', 'emission_label', 'gears', 'country_version', 'electricity_consumption', 'last_service_date', 'other_fuel_types', 'availability', 'last_timing_belt_service_date', 'available_from'], dtype='object')

In [14]:
drop_cols = ['availability', 'available_from', 'electricity_consumption', 'last_service_date', 
             'last_timing_belt_service_date', 'other_fuel_types', 'kw']

In [15]:
df.drop(drop_cols, axis = 1, inplace = True)

In [16]:
def fill_most(df, group_col, col_name):
    for group in list(df[group_col].unique()):
        grp_inx = list(df[df[group_col] == group][col_name].index)
        df[col_name].iloc[grp_inx] = df[col_name].iloc[grp_inx].fillna(df[df[group_col] == group][col_name].mode()[0])

In [17]:
def list_extractor(lst):
    return ','.join([i.strip() if type(lst)==list else i.strip() for i in lst])

In [18]:
df["km"] = df["km"].str.replace("km", "")

In [19]:
df["km"] = df["km"].str.replace(",", ".")

In [20]:
df["km"] = df["km"].where(df["km"] != "- ")

In [21]:
df["km"] = df["km"].astype(float)

In [22]:
pd.pandas.set_option('display.max_columns', None)

In [23]:
df.drop("url", axis = 1, inplace = True)

In [24]:
def name_splitter(name):
    return name.split()[0] if len(name.split())>1 else name

In [25]:
df["make"] = df["make_model"].apply(name_splitter)

In [26]:
df.drop("null", axis = 1, inplace = True)

In [27]:
def model_extractor(model_list):
    try:
        return model_list[1]
    except TypeError:
        return model_list

In [28]:
df["model"] = df["model"].apply(model_extractor)

In [30]:
df["body_color"] = df["body_color"].apply(model_extractor)

In [31]:
df.previous_owners.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


\n1\n                                                                                        8101
NaN                                                                                          6640
\n2\n                                                                                         766
\n0\n                                                                                         163
\n3\n                                                                                          17
                                                                                             ... 
[\n1\n, \n96 g CO2/km (comb)\n]                                                                 1
[\n1\n, \n181 g CO2/km (comb)\n]                                                                1
[\n1\n, \nEuro 6\n]                                                                             1
[\n1\n, \n, 6.7 l/100 km (comb), \n, 8.6 l/100 km (city), \n, 5.6 l/100 km (country), \n]       1
[\n1\n, \n102 g CO2/

In [32]:
def prev_ext(item):
    return item[0].strip("\n") if type(item) == list else item[1]

In [33]:
df.prev_owner.value_counts(dropna = False)

1 previous owner     8294
NaN                  6828
2 previous owners     778
3 previous owners      17
4 previous owners       2
Name: prev_owner, dtype: int64

In [34]:
def owner_extractor(data):
    return data[0][1] if type(data) == list else data[1]

In [35]:
df["previous_owners"] = df.previous_owners[df.previous_owners.notnull()].apply(owner_extractor)

In [36]:
df.previous_owners.value_counts(dropna = False)

1      8294
NaN    6640
2       778
0       188
3        17
4         2
Name: previous_owners, dtype: int64

In [37]:
df["previous_owners"] = df["previous_owners"].astype('float')

In [38]:
df.previous_owners.value_counts(dropna = False)

1.0    8294
NaN    6640
2.0     778
0.0     188
3.0      17
4.0       2
Name: previous_owners, dtype: int64

In [39]:
df.upholstery.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nCloth, Black\n]           5821
NaN                          3720
[\nPart leather, Black\n]    1121
[\nCloth\n]                  1005
[\nCloth, Grey\n]             891
[\nCloth, Other\n]            639
[\nFull leather, Black\n]     575
[\nBlack\n]                   491
[\nGrey\n]                    273
[\nOther, Other\n]            182
[\nPart leather\n]            140
[\nFull leather\n]            139
[\nFull leather, Brown\n]     116
[\nPart leather, Grey\n]      116
[\nOther, Black\n]            110
[\nFull leather, Other\n]      72
[\nFull leather, Grey\n]       67
[\nPart leather, Other\n]      65
[\nOther\n]                    56
[\nPart leather, Brown\n]      50
[\nalcantara, Black\n]         47
[\nVelour, Black\n]            36
[\nFull leather, Beige\n]      36
[\nCloth, Brown\n]             28
[\nVelour\n]                   16
[\nOther, Grey\n]              15
[\nCloth, Beige\n]             13
[\nCloth, Blue\n]              12
[\nBrown\n]                    12
[\nVelour, Gre

In [40]:
def uphols_ext(data):
    new_lst = []
    for i in data:
        if i != '':
            new_lst.append(i.strip().strip("\n"))
    return ",".join(new_lst)

In [41]:
df["upholstery"] = df.upholstery[df["upholstery"].notnull()].apply(uphols_ext)

In [42]:
df.upholstery.value_counts(dropna = False)

Cloth, Black           5821
NaN                    3720
Part leather, Black    1121
Cloth                  1005
Cloth, Grey             891
Cloth, Other            639
Full leather, Black     575
Black                   491
Grey                    273
Other, Other            182
Part leather            140
Full leather            139
Full leather, Brown     116
Part leather, Grey      116
Other, Black            110
Full leather, Other      72
Full leather, Grey       67
Part leather, Other      65
Other                    56
Part leather, Brown      50
alcantara, Black         47
Velour, Black            36
Full leather, Beige      36
Cloth, Brown             28
Velour                   16
Other, Grey              15
Cloth, Beige             13
Cloth, Blue              12
Brown                    12
Cloth, White              8
Velour, Grey              8
alcantara, Grey           6
Cloth, Red                5
Other, Yellow             4
Part leather, Red         3
Beige               

In [43]:
df.type.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[, Used, , Diesel (Particulate Filter)]                                                                                  3475
[, Used, , Diesel]                                                                                                       2516
[, Used, , Gasoline]                                                                                                     2367
[, Used, , Super 95]                                                                                                     1818
[, Pre-registered, , Super 95]                                                                                            500
                                                                                                                         ... 
[, Used, , Regular/Benzine E10 91 / Super E10 95 / Super Plus E10 98 / Super Plus 98 / Super 95 / Regular/Benzine 91]       1
[, Used, , Regular/Benzine E10 91 / Regular/Benzine 91 / Super 95 / Super Plus 98 / Super E10 95 / Super Plus E10 98] 

In [44]:
def type_extractor(data):
    new_lst = []
    for i in data:
        if i != '':
            new_lst.append(i.strip().strip("\n"))
    return ",".join(new_lst)

In [45]:
df.type.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[, Used, , Diesel (Particulate Filter)]                                                                                  3475
[, Used, , Diesel]                                                                                                       2516
[, Used, , Gasoline]                                                                                                     2367
[, Used, , Super 95]                                                                                                     1818
[, Pre-registered, , Super 95]                                                                                            500
                                                                                                                         ... 
[, Used, , Regular/Benzine E10 91 / Super E10 95 / Super Plus E10 98 / Super Plus 98 / Super 95 / Regular/Benzine 91]       1
[, Used, , Regular/Benzine E10 91 / Regular/Benzine 91 / Super 95 / Super Plus 98 / Super E10 95 / Super Plus E10 98] 

In [46]:
df["type"] = df.type[df["type"].notnull()].apply(type_extractor)

In [47]:
df.type

0                          Used,Diesel (Particulate Filter)
1                                             Used,Gasoline
2                          Used,Diesel (Particulate Filter)
3                          Used,Diesel (Particulate Filter)
4                          Used,Diesel (Particulate Filter)
                                ...                        
15914                       New,Diesel (Particulate Filter)
15915    Used,Super 95 / Super Plus 98 (Particulate Filter)
15916                                 Pre-registered,Diesel
15917                                 Pre-registered,Diesel
15918                                Demonstration,Super 95
Name: type, Length: 15919, dtype: object

In [48]:
df.previous_owners

0        2.0
1        NaN
2        1.0
3        1.0
4        1.0
        ... 
15914    NaN
15915    1.0
15916    1.0
15917    NaN
15918    1.0
Name: previous_owners, Length: 15919, dtype: float64

In [49]:
df.prev_owner = df.prev_owner.str[0]

In [50]:
df.prev_owner.fillna(method = 'bfill').value_counts(dropna = False)

1    14692
2     1187
3       38
4        2
Name: prev_owner, dtype: int64

In [51]:
df.previous_owners.fillna(method = 'bfill').value_counts(dropna = False)

1.0    14169
2.0     1179
0.0      531
3.0       38
4.0        2
Name: previous_owners, dtype: int64

In [52]:
df.previous_owners.fillna(method = 'bfill', inplace = True)

In [53]:
df.drop(columns=['prev_owner', 'weight', 'non-smoking_vehicle'], inplace = True)

In [54]:
df.body_type.value_counts(dropna = False)

Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

In [55]:
df.body.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Sedans, \n]           7903
[\n, Station wagon, \n]    3553
[\n, Compact, \n]          3153
[\n, Van, \n]               783
[\n, Other, \n]             290
[\n, Transporter, \n]        88
NaN                          60
[\n, Off-Road, \n]           56
[\n, Coupe, \n]              25
[\n, Convertible, \n]         8
Name: body, dtype: int64

In [56]:
df.groupby('make_model')["body_type"].value_counts(dropna = False)

make_model      body_type    
Audi A1         Sedans           1538
                Compact          1039
                Station wagon      21
                Other              13
                Coupe               2
                Van                 1
Audi A2         Off-Road            1
Audi A3         Sedans           2598
                Station wagon     282
                Compact           182
                Other              16
                Convertible         8
                NaN                 7
                Coupe               4
Opel Astra      Station wagon    1211
                Sedans           1053
                Compact           185
                Other              67
                NaN                 7
                Coupe               2
                Off-Road            1
Opel Corsa      Compact          1230
                Sedans            875
                Other              87
                Coupe              13
                Tran

In [57]:
fill_most(df, "make_model", "body_type")

In [58]:
df.body_type.value_counts(dropna = False)

Sedans           7925
Station wagon    3563
Compact          3155
Van               809
Other             290
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

In [59]:
df.drop(columns='body', inplace = True)

In [60]:
df.inspection_new.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                            11987
[\nYes\n, \nEuro 6\n]                                                                            523
\nYes\n                                                                                          362
[\nYes\n, \n102 g CO2/km (comb)\n]                                                               174
[\nYes\n, \n4 (Green)\n]                                                                         166
                                                                                               ...  
[\nYes\n, \n, 5.1 l/100 km (comb), \n, 6.4 l/100 km (city), \n, 4.3 l/100 km (country), \n]        1
[\nYes\n, \n, 6.3 l/100 km (comb), \n, \n, \n]                                                     1
[\nYes\n, \n89 g CO2/km (comb)\n]                                                                  1
[\nYes\n, \n, 4.7 l/100 km (comb), \n, \n, \n]                                             

In [61]:
df.inspection_new[15900]

'\nYes\n'

In [62]:
def inspec_ext(item):
    return item[0].strip("\n") if type(item) == list else item.strip().strip('\n')

In [63]:
df["inspection_new"] = df.inspection_new[df.inspection_new.notnull()].apply(inspec_ext)

In [64]:
df.inspection_new.value_counts(dropna = False)

NaN    11987
Yes     3932
Name: inspection_new, dtype: int64

In [65]:
df.inspection_new.fillna(value = "No", inplace = True)

In [66]:
df['inspection_new'] = df.inspection_new.map({'Yes':1, 'No':'0'})

In [67]:
df.inspection_new.value_counts(dropna = False)

0    11987
1     3932
Name: inspection_new, dtype: int64

In [68]:
df.next_inspection.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                                12384
\n04/2022\n                                                                                           62
\n03/2021\n                                                                                           38
\n03/2022\n                                                                                           36
\n06/2021\n                                                                                           34
                                                                                                   ...  
[\n04/2020\n, \n123 g CO2/km (comb)\n]                                                                 1
[\n08/2021\n, \n110 g CO2/km (comb)\n]                                                                 1
[\n04/2021\n, \n137 g CO2/km (comb)\n]                                                                 1
[\n04/2021\n, \n123 g CO2/km (comb)\n]                 

In [69]:
def lst_to_str(data):
    new_lst = []
    if type(data) == list:
        for i in data:
            new_lst.append(i.strip())
        return ",".join(new_lst)
    else:
        return data

In [70]:
df.next_inspection = df.next_inspection[df.next_inspection.notnull()].apply(lst_to_str)

In [71]:
df.next_inspection.value_counts(dropna = False)

NaN                                                                           12384
\n04/2022\n                                                                      62
\n03/2021\n                                                                      38
\n03/2022\n                                                                      36
\n06/2021\n                                                                      34
                                                                              ...  
12/2020,0 kWh/100 km (comb)                                                       1
03/2022,,6.6 l/100 km (comb),,8.3 l/100 km (city),,5.6 l/100 km (country),        1
01/2022,,5.7 l/100 km (comb),,7.3 l/100 km (city),,4.8 l/100 km (country),        1
01/2020,122 g CO2/km (comb)                                                       1
05/2019,,4.9 l/100 km (comb),,6.2 l/100 km (city),,4.2 l/100 km (country),        1
Name: next_inspection, Length: 1400, dtype: int64

In [72]:
df['next_inspection'] = df.next_inspection.str.extract("(\d\d/\d\d\d\d)")

In [73]:
df.next_inspection.value_counts(dropna = False)

NaN        12384
06/2021      471
03/2021      210
05/2021      180
04/2021      171
           ...  
01/2018        1
01/1999        1
04/2016        1
12/2017        1
01/1921        1
Name: next_inspection, Length: 78, dtype: int64

In [74]:
df.next_inspection = pd.to_datetime(df.next_inspection, format = '%m/%Y')

In [75]:
(df.paint_type.isnull().sum()/len(df.paint_type))*100

36.25855895470821

In [76]:
df.paint_type.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nMetallic\n]       9794
NaN                  5772
[\nUni/basic\n]       347
[\nPerl effect\n]       6
Name: paint_type, dtype: int64

In [77]:
df.paint_type.str[0].str.strip('\n').value_counts(dropna = False)

Metallic       9794
NaN            5772
Uni/basic       347
Perl effect       6
Name: paint_type, dtype: int64

In [78]:
df.paint_type = df.paint_type.str[0].str.strip('\n')

In [79]:
df.paint_type.value_counts(dropna = False)

Metallic       9794
NaN            5772
Uni/basic       347
Perl effect       6
Name: paint_type, dtype: int64

In [80]:
(df.country_version.isnull().sum()/len(df.country_version))*100

52.346252905333245

In [81]:
df.country_version.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                     8333
[\nGermany\n]           4502
[\nItaly\n]             1038
[\nEuropean Union\n]     507
[\nNetherlands\n]        464
[\nSpain\n]              325
[\nBelgium\n]            314
[\nAustria\n]            208
[\nCzech Republic\n]      52
[\nPoland\n]              49
[\nFrance\n]              38
[\nDenmark\n]             33
[\nHungary\n]             28
[\nJapan\n]                8
[\nSlovakia\n]             4
[\nCroatia\n]              4
[\nSweden\n]               3
[\nRomania\n]              2
[\nBulgaria\n]             2
[\nSerbia\n]               1
[\nLuxembourg\n]           1
[\nSwitzerland\n]          1
[\nSlovenia\n]             1
[\nEgypt\n]                1
Name: country_version, dtype: int64

In [82]:
df.country_version = df.country_version.str[0].str.strip('\n')

In [83]:
df.country_version.value_counts(dropna = False)

NaN               8333
Germany           4502
Italy             1038
European Union     507
Netherlands        464
Spain              325
Belgium            314
Austria            208
Czech Republic      52
Poland              49
France              38
Denmark             33
Hungary             28
Japan                8
Croatia              4
Slovakia             4
Sweden               3
Romania              2
Bulgaria             2
Serbia               1
Slovenia             1
Switzerland          1
Egypt                1
Luxembourg           1
Name: country_version, dtype: int64

In [84]:
(df.cylinders.isnull().sum()/len(df.cylinders))*100

35.68063320560337

In [85]:
df.cylinders.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n4\n]    8105
NaN        5680
[\n3\n]    2104
[\n5\n]      22
[\n6\n]       3
[\n2\n]       2
[\n8\n]       2
[\n1\n]       1
Name: cylinders, dtype: int64

In [86]:
df.cylinders.str[0].str.strip('\n').value_counts(dropna = False)

4      8105
NaN    5680
3      2104
5        22
6         3
8         2
2         2
1         1
Name: cylinders, dtype: int64

In [87]:
df.cylinders = df.cylinders.str[0].str.strip('\n')

In [88]:
df.cylinders.value_counts(dropna = False)

4      8105
NaN    5680
3      2104
5        22
6         3
8         2
2         2
1         1
Name: cylinders, dtype: int64

In [89]:
df.emission_label.isnull().sum()/df.shape[0]

0.749670205414913

In [90]:
df.emission_label.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                     11934
[\n4 (Green)\n]          3553
[\n1 (No sticker)\n]      381
[[], [], []]               40
[\n5 (Blue)\n]              8
[\n3 (Yellow)\n]            2
[\n2 (Red)\n]               1
Name: emission_label, dtype: int64

In [91]:
df.emission_label.str[0].str.strip('\n').value_counts(dropna = False)

NaN               11974
4 (Green)          3553
1 (No sticker)      381
5 (Blue)              8
3 (Yellow)            2
2 (Red)               1
Name: emission_label, dtype: int64

In [92]:
df.emission_label = df.emission_label.str[0].str.strip('\n')

In [93]:
df.emission_label.value_counts(dropna = False)

NaN               11974
4 (Green)          3553
1 (No sticker)      381
5 (Blue)              8
3 (Yellow)            2
2 (Red)               1
Name: emission_label, dtype: int64

In [94]:
df.full_service.isnull().sum()/df.shape[0]

0.4839499968590992

In [95]:
df.full_service.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                           7704
[\n, \n, \n4 (Green)\n]                                                                       2235
[\n, \n, \nEuro 6\n]                                                                          2097
[\n, \n]                                                                                      1702
[\n, \n, \nEuro 6d-TEMP\n]                                                                     399
                                                                                              ... 
[\n, \n, \n, 6 l/100 km (comb), \n, 7.5 l/100 km (city), \n, 5.2 l/100 km (country), \n]         1
[\n, \n, \n, 5.5 l/100 km (comb), \n, 7.2 l/100 km (city), \n, 4.5 l/100 km (country), \n]       1
[\n, \n, \n, 5.8 l/100 km (comb), \n, 7.4 l/100 km (city), \n, 4.9 l/100 km (country), \n]       1
[\n, \n, \n, 5.4 l/100 km (comb), \n, 7 l/100 km (city), \n, 4.5 l/100 km (country), \n]         1
[\n, \n, \

In [96]:
df.drop(columns = 'full_service', inplace = True)

In [97]:
df["comfort&convenience"].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                                                                                                                                                                                                                                                                                                     920
[Air conditioning, Electrical side mirrors, Hill Holder, Power windows]                                                                                                                                                                                                                                                                                                 216
[Air conditioning, Electrical side mirrors, Power windows]                                                                                                                                                                                                                      

In [98]:
def unique_values(data, column_name):
    unique_vals = set()
    for rows in data[column_name]:
        unique_vals.update(rows)
    return list(unique_vals)

In [99]:
df[df['comfort&convenience'].isnull()]['comfort&convenience'].map({np.nan:[]})

9        []
30       []
53       []
168      []
191      []
         ..
15791    []
15804    []
15805    []
15854    []
15864    []
Name: comfort&convenience, Length: 920, dtype: object

In [100]:
def NaN_to_list(data, column_name):
    NaN_rows = data[column_name].isnull()
    data.loc[NaN_rows, column_name] = data.loc[NaN_rows, column_name].apply(lambda x: [])

In [101]:
NaN_to_list(df, 'comfort&convenience')
unique_values(df, 'comfort&convenience')

['Tinted windows',
 'Multi-function steering wheel',
 'Electric tailgate',
 'Panorama roof',
 'Lumbar support',
 'Heated steering wheel',
 'Sunroof',
 'Massage seats',
 'Park Distance Control',
 'Light sensor',
 'Split rear seats',
 'Parking assist system camera',
 'Parking assist system self-steering',
 'Seat ventilation',
 'Air suspension',
 'Start-stop system',
 'Parking assist system sensors rear',
 'Electrically adjustable seats',
 'Power windows',
 'Wind deflector',
 'Windshield',
 'Leather seats',
 'Cruise control',
 'Keyless central door lock',
 'Electric Starter',
 'Air conditioning',
 'Navigation system',
 'Rain sensor',
 'Electrically heated windshield',
 'Armrest',
 'Auxiliary heating',
 'Electrical side mirrors',
 'Hill Holder',
 'Automatic climate control',
 'Leather steering wheel',
 'Heads-up display',
 'Parking assist system sensors front',
 'Seat heating']

In [102]:
nxtins_list = [",".join(item).strip() if type(item) == list else item for item in df['comfort&convenience']]

In [103]:
df['comfort&convenience'] = nxtins_list

In [104]:
df['comfort&convenience'].value_counts(dropna = False)

                                                                                                                                                                                                                                                                                                                                                      920
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                    216
Air conditioning,Electrical side mirrors,Power windows                                                                                                                                                                                                                                                              

In [105]:
df['comfort&convenience'] = df['comfort&convenience'].mask(df['comfort&convenience'] == '')

In [106]:
df['comfort&convenience'].isnull().sum()

920

In [107]:
df['comfort&convenience'].value_counts(dropna = False)

NaN                                                                                                                                                                                                                                                                                                                                                   920
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                    216
Air conditioning,Electrical side mirrors,Power windows                                                                                                                                                                                                                                                              

In [108]:
fill_most(df, 'make_model', 'comfort&convenience')

In [109]:
df['comfort&convenience'].isnull().sum()

0

In [110]:
df['comfort&convenience'].str.get_dummies(sep=',')

Unnamed: 0,Air conditioning,Air suspension,Armrest,Automatic climate control,Auxiliary heating,Cruise control,Electric Starter,Electric tailgate,Electrical side mirrors,Electrically adjustable seats,Electrically heated windshield,Heads-up display,Heated steering wheel,Hill Holder,Keyless central door lock,Leather seats,Leather steering wheel,Light sensor,Lumbar support,Massage seats,Multi-function steering wheel,Navigation system,Panorama roof,Park Distance Control,Parking assist system camera,Parking assist system self-steering,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Seat ventilation,Split rear seats,Start-stop system,Sunroof,Tinted windows,Wind deflector,Windshield
0,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0
2,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0
3,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0
4,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,1,0,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,0,1,0,0
15915,1,0,0,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0
15916,1,0,1,1,0,1,0,1,1,0,1,1,1,0,0,0,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,0,0,0
15917,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [111]:
df = df.join(df['comfort&convenience'].str.get_dummies(sep=',').add_prefix('cc_') )

In [112]:
df.shape

(15919, 78)

In [113]:
df['entertainment&media'].isnull().sum()/df.shape[0]

0.0863119542684842

In [114]:
df['entertainment&media'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                      1374
[Bluetooth, Hands-free equipment, On-board computer, Radio, USB]                         1282
[Bluetooth, Hands-free equipment, MP3, On-board computer, Radio, USB]                     982
[Bluetooth, CD player, Hands-free equipment, MP3, On-board computer, Radio, USB]          783
[On-board computer, Radio]                                                                487
                                                                                         ... 
[Bluetooth, MP3, On-board computer, Sound system, USB]                                      1
[CD player, Hands-free equipment, On-board computer, Radio, Sound system, Television]       1
[Bluetooth, Digital radio, On-board computer, Sound system, USB]                            1
[CD player, Hands-free equipment, MP3, Radio, USB]                                          1
[CD player, Digital radio, Radio]                           

In [115]:
NaN_to_list(df, 'entertainment&media')

In [116]:
unique_values(df, 'entertainment&media')

['On-board computer',
 'USB',
 'Radio',
 'Sound system',
 'MP3',
 'Television',
 'Bluetooth',
 'CD player',
 'Hands-free equipment',
 'Digital radio']

In [117]:
ent_med_list = [",".join(item).strip() if type(item) == list else item for item in df['entertainment&media']]

In [118]:
df['entertainment&media'] = pd.DataFrame(ent_med_list)

In [119]:
df['entertainment&media'].value_counts(dropna = False)

                                                                               1374
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                     1282
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB                  982
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB        783
On-board computer,Radio                                                         487
                                                                               ... 
Bluetooth,Sound system,USB                                                        1
Bluetooth,Digital radio,Hands-free equipment,Sound system,USB                     1
Bluetooth,CD player,USB                                                           1
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Sound system       1
CD player,Digital radio,Hands-free equipment,MP3,Radio                            1
Name: entertainment&media, Length: 347, dtype: int64

In [120]:
df['entertainment&media'] = df['entertainment&media'].mask(df['entertainment&media'] == '')

In [121]:
df['entertainment&media'].value_counts(dropna = False)

NaN                                                                            1374
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                     1282
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB                  982
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB        783
On-board computer,Radio                                                         487
                                                                               ... 
Bluetooth,Sound system,USB                                                        1
Bluetooth,Digital radio,Hands-free equipment,Sound system,USB                     1
Bluetooth,CD player,USB                                                           1
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Sound system       1
CD player,Digital radio,Hands-free equipment,MP3,Radio                            1
Name: entertainment&media, Length: 347, dtype: int64

In [122]:
fill_most(df, 'make_model', 'entertainment&media')

In [123]:
df['entertainment&media'].value_counts(dropna = False)

Bluetooth,Hands-free equipment,On-board computer,Radio,USB                                          2015
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB                                      1206
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB                            1049
Bluetooth,Hands-free equipment,On-board computer,Radio                                               529
On-board computer,Radio                                                                              487
                                                                                                    ... 
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,Sound system,Television,USB       1
MP3,Radio,Sound system,USB                                                                             1
Bluetooth,Digital radio,Hands-free equipment,MP3,Radio                                                 1
Bluetooth,CD player,Digital radio,On-board computer,Rad

In [124]:
df['entertainment&media'].str.get_dummies(sep = ',')

Unnamed: 0,Bluetooth,CD player,Digital radio,Hands-free equipment,MP3,On-board computer,Radio,Sound system,Television,USB
0,1,0,0,1,0,1,1,0,0,0
1,1,0,0,1,0,1,1,1,0,0
2,0,0,0,0,1,1,0,0,0,0
3,1,1,0,1,1,1,1,1,0,1
4,1,1,0,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
15914,1,0,1,1,0,1,1,0,0,1
15915,1,0,1,1,0,0,1,0,0,1
15916,1,0,0,1,0,1,1,0,0,0
15917,1,0,1,0,0,0,1,0,0,1


In [125]:
df = df.join(df['entertainment&media'].str.get_dummies(sep = ',').add_prefix('em_'))

In [126]:
df.head()

Unnamed: 0,make_model,short_description,body_type,price,vat,km,registration,hp,type,previous_owners,next_inspection,inspection_new,warranty,make,model,offer_number,first_registration,body_color,paint_type,body_color_original,upholstery,nr_of_doors,nr_of_seats,model_code,gearing_type,displacement,cylinders,drive_chain,fuel,consumption,co2_emission,emission_class,comfort&convenience,entertainment&media,extras,safety&security,description,emission_label,gears,country_version,cc_Air conditioning,cc_Air suspension,cc_Armrest,cc_Automatic climate control,cc_Auxiliary heating,cc_Cruise control,cc_Electric Starter,cc_Electric tailgate,cc_Electrical side mirrors,cc_Electrically adjustable seats,cc_Electrically heated windshield,cc_Heads-up display,cc_Heated steering wheel,cc_Hill Holder,cc_Keyless central door lock,cc_Leather seats,cc_Leather steering wheel,cc_Light sensor,cc_Lumbar support,cc_Massage seats,cc_Multi-function steering wheel,cc_Navigation system,cc_Panorama roof,cc_Park Distance Control,cc_Parking assist system camera,cc_Parking assist system self-steering,cc_Parking assist system sensors front,cc_Parking assist system sensors rear,cc_Power windows,cc_Rain sensor,cc_Seat heating,cc_Seat ventilation,cc_Split rear seats,cc_Start-stop system,cc_Sunroof,cc_Tinted windows,cc_Wind deflector,cc_Windshield,em_Bluetooth,em_CD player,em_Digital radio,em_Hands-free equipment,em_MP3,em_On-board computer,em_Radio,em_Sound system,em_Television,em_USB
0,Audi A1,Sportback 1.4 TDI S-tronic Xenon Navi Klima,Sedans,15770,VAT deductible,56.013,01/2016,66 kW,"Used,Diesel (Particulate Filter)",2.0,2021-06-01,1,"[\n, \n, \n4 (Green)\n]",Audi,A1,[\nLR-062483\n],2016,Black,Metallic,[\nMythosschwarz\n],"Cloth, Black",[\n5\n],[\n5\n],[\n0588/BDF\n],"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Armrest,Automatic climate control,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Start-stop system","Bluetooth,Hands-free equipment,On-board computer,Radio","[Alloy wheels, Catalytic Converter, Voice Control]","[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,,,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0
1,Audi A1,1.8 TFSI sport,Sedans,14500,Price negotiable,80.0,03/2017,141 kW,"Used,Gasoline",1.0,NaT,0,,Audi,A1,,2017,Red,,,"Cloth, Grey",[\n3\n],[\n4\n],[\n0588/BCY\n],"[\n, Automatic, \n]","[\n1,798 cc\n]",4.0,[\nfront\n],"[\n, Gasoline, \n]","[[5.6 l/100 km (comb)], [7.1 l/100 km (city)], [4.7 l/100 km (country)]]",[\n129 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Automatic climate control,Hill Holder,Leather steering wheel,Lumbar support,Parking assist system sensors rear,Power windows,Start-stop system,Tinted windows","Bluetooth,Hands-free equipment,On-board computer,Radio,Sound system","[Alloy wheels, Sport seats, Sport suspension, Voice Control]","[ABS, Central door lock, Central door lock with remote control, Daytime running lights, Driver-side airbag, Electronic stability control, Head airbag, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\nLangstreckenfahrzeug daher die hohe Kilometerleistung. , Bremsen neu bei 62000 km. , Gut gepflegter A1\n]",4 (Green),[\n7\n],,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0
2,Audi A1,Sportback 1.6 TDI S tronic Einparkhilfe plus+music,Sedans,14640,VAT deductible,83.45,02/2016,85 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,0,"[\n, \n, \n99 g CO2/km (comb)\n]",Audi,A1,[\nAM-95365\n],2016,Black,Metallic,[\nmythosschwarz metallic\n],"Cloth, Black",[\n4\n],[\n4\n],,"[\n, Automatic, \n]","[\n1,598 cc\n]",,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.4 l/100 km (city)], [3.4 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Seat heating,Start-stop system","MP3,On-board computer","[Alloy wheels, Voice Control]","[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control]","[\n, Fahrzeug-Nummer: AM-95365, , Ehem. UPE 24.640 EUR, , Komfort, , Klimaanlage, , Elektrische Fensterheber, , Sitzheizung, , Tempomat, , Zentralverriegelung, , Leder-Lenkrad, , Elektrische Spiegel, , Wärmedämmendes Glas, , Anfahrassistent, , Sicherheit, , Window/Kopfairbags, , ESP (el. Stabilitäts Programm), , Elektronische Wegfahrsperre, , Reifendruckkontrolle, , Tagfahrlicht, , Exterieur, , Alufelgen 16 Zoll, , Einparkhilfe vo + hi, , Interieur, , Stoff Zeitgeist, ...",4 (Green),,,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0
3,Audi A1,1.4 TDi Design S tronic,Sedans,14500,,73.0,08/2016,66 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,0,,Audi,A1,,2016,Brown,Metallic,,,[\n3\n],[\n4\n],,"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,,"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air suspension,Armrest,Auxiliary heating,Electrical side mirrors,Heads-up display,Hill Holder,Leather steering wheel,Light sensor,Lumbar support,Multi-function steering wheel,Navigation system,Power windows,Rain sensor,Seat heating,Split rear seats,Start-stop system","Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,Sound system,USB","[Alloy wheels, Sport seats, Voice Control]","[ABS, Alarm system, Central door lock with remote control, Driver drowsiness detection, Driver-side airbag, Electronic stability control, Emergency system, Head airbag, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system]","[\nAudi A1: , - 1e eigenaar , - Perfecte staat: schade/ongevalvrij , - Onderhoudsboekje Audi garage , - Full option!, - S-line opties, - Automaat, lederen bekleding, navi-pro, Xenon lampen, zetelverwarming, automatische airco,...\n]",,[\n6\n],,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,1,1,0,1,1,1,1,1,0,1
4,Audi A1,"Sportback 1.4 TDI S-Tronic S-Line Ext. admired, Xe",Sedans,16790,,16.2,05/2016,66 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,1,"[\n, \n, \nEuro 6\n]",Audi,A1,[\nC1626\n],2016,Black,Metallic,[\nMythosschwarz Metallic\n],"Cloth, Black",[\n5\n],[\n5\n],[\n0588/BDF\n],"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[4.1 l/100 km (comb)], [4.6 l/100 km (city)], [3.8 l/100 km (country)]]",[\n109 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Armrest,Automatic climate control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Start-stop system","Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB","[Alloy wheels, Sport package, Sport suspension, Voice Control]","[ABS, Central door lock, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Technik & Sicherheit:, Xenon plus, Klimaautomatik, Einparkhilfe hinten, Scheinwerferreinigungsanlage, Nebelscheinwerfer, Zentralverriegelung mit Fernbedienung, Außenspiegel elek. einstellbar, Fensterheber elek, Start/Stop Anlage, 6 Airbag inkl. Kopfairbag, Euro 6, Multimedia:, Radio CD Concert, audi music interface, Handyvorbereitung (Bluetooth), Sprachsteuerung, USB Anschluss, Assistenzsysteme:, Lichtsensor, Regensensor, Berganfahrassistent, Reifendruckkontrolle, ESP inkl. ABS, Interie...",,,Germany,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,0,0,1


In [127]:
df['extras'].isnull().sum()/df.shape[0]

0.18606696400527672

In [128]:
df["extras"] = df[df["extras"].notnull()]["extras"].apply(list_extractor)

In [129]:
df['extras'].value_counts(dropna = False)

Alloy wheels                                                                                                   3245
NaN                                                                                                            2962
Alloy wheels,Touch screen                                                                                       697
Alloy wheels,Voice Control                                                                                      577
Alloy wheels,Touch screen,Voice Control                                                                         541
                                                                                                               ... 
Alloy wheels,Cab or rented Car,Catalytic Converter,Sport package,Sport seats,Sport suspension,Voice Control       1
Alloy wheels,Catalytic Converter,Sport seats,Touch screen,Trailer hitch,Voice Control                             1
Alloy wheels,Catalytic Converter,Shift paddles,Trailer hitch,Voice Contr

In [130]:
fill_most(df, 'make_model', 'extras')

In [131]:
df.extras.value_counts(dropna = False)

Alloy wheels                                                                        6174
Alloy wheels,Touch screen                                                            697
Alloy wheels,Voice Control                                                           577
Alloy wheels,Touch screen,Voice Control                                              541
Alloy wheels,Roof rack                                                               385
                                                                                    ... 
Alloy wheels,Roof rack,Ski bag,Sport suspension,Touch screen,Voice Control             1
Alloy wheels,Handicapped enabled,Ski bag,Voice Control,Winter tyres                    1
Alloy wheels,Catalytic Converter,Sport package,Trailer hitch                           1
Shift paddles,Sport seats,Trailer hitch                                                1
Alloy wheels,Catalytic Converter,Roof rack,Sport seats,Touch screen,Winter tyres       1
Name: extras, Length:

In [132]:
df = df.join(df['extras'].str.get_dummies(sep = ',').add_prefix('ex_'))

In [133]:
df.head()

Unnamed: 0,make_model,short_description,body_type,price,vat,km,registration,hp,type,previous_owners,next_inspection,inspection_new,warranty,make,model,offer_number,first_registration,body_color,paint_type,body_color_original,upholstery,nr_of_doors,nr_of_seats,model_code,gearing_type,displacement,cylinders,drive_chain,fuel,consumption,co2_emission,emission_class,comfort&convenience,entertainment&media,extras,safety&security,description,emission_label,gears,country_version,cc_Air conditioning,cc_Air suspension,cc_Armrest,cc_Automatic climate control,cc_Auxiliary heating,cc_Cruise control,cc_Electric Starter,cc_Electric tailgate,cc_Electrical side mirrors,cc_Electrically adjustable seats,cc_Electrically heated windshield,cc_Heads-up display,cc_Heated steering wheel,cc_Hill Holder,cc_Keyless central door lock,cc_Leather seats,cc_Leather steering wheel,cc_Light sensor,cc_Lumbar support,cc_Massage seats,cc_Multi-function steering wheel,cc_Navigation system,cc_Panorama roof,cc_Park Distance Control,cc_Parking assist system camera,cc_Parking assist system self-steering,cc_Parking assist system sensors front,cc_Parking assist system sensors rear,cc_Power windows,cc_Rain sensor,cc_Seat heating,cc_Seat ventilation,cc_Split rear seats,cc_Start-stop system,cc_Sunroof,cc_Tinted windows,cc_Wind deflector,cc_Windshield,em_Bluetooth,em_CD player,em_Digital radio,em_Hands-free equipment,em_MP3,em_On-board computer,em_Radio,em_Sound system,em_Television,em_USB,ex_Alloy wheels,ex_Cab or rented Car,ex_Catalytic Converter,ex_Handicapped enabled,ex_Right hand drive,ex_Roof rack,ex_Shift paddles,ex_Ski bag,ex_Sliding door,ex_Sport package,ex_Sport seats,ex_Sport suspension,ex_Touch screen,ex_Trailer hitch,ex_Tuned car,ex_Voice Control,ex_Winter tyres
0,Audi A1,Sportback 1.4 TDI S-tronic Xenon Navi Klima,Sedans,15770,VAT deductible,56.013,01/2016,66 kW,"Used,Diesel (Particulate Filter)",2.0,2021-06-01,1,"[\n, \n, \n4 (Green)\n]",Audi,A1,[\nLR-062483\n],2016,Black,Metallic,[\nMythosschwarz\n],"Cloth, Black",[\n5\n],[\n5\n],[\n0588/BDF\n],"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Armrest,Automatic climate control,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Start-stop system","Bluetooth,Hands-free equipment,On-board computer,Radio","Alloy wheels,Catalytic Converter,Voice Control","[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,,,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Audi A1,1.8 TFSI sport,Sedans,14500,Price negotiable,80.0,03/2017,141 kW,"Used,Gasoline",1.0,NaT,0,,Audi,A1,,2017,Red,,,"Cloth, Grey",[\n3\n],[\n4\n],[\n0588/BCY\n],"[\n, Automatic, \n]","[\n1,798 cc\n]",4.0,[\nfront\n],"[\n, Gasoline, \n]","[[5.6 l/100 km (comb)], [7.1 l/100 km (city)], [4.7 l/100 km (country)]]",[\n129 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Automatic climate control,Hill Holder,Leather steering wheel,Lumbar support,Parking assist system sensors rear,Power windows,Start-stop system,Tinted windows","Bluetooth,Hands-free equipment,On-board computer,Radio,Sound system","Alloy wheels,Sport seats,Sport suspension,Voice Control","[ABS, Central door lock, Central door lock with remote control, Daytime running lights, Driver-side airbag, Electronic stability control, Head airbag, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\nLangstreckenfahrzeug daher die hohe Kilometerleistung. , Bremsen neu bei 62000 km. , Gut gepflegter A1\n]",4 (Green),[\n7\n],,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0
2,Audi A1,Sportback 1.6 TDI S tronic Einparkhilfe plus+music,Sedans,14640,VAT deductible,83.45,02/2016,85 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,0,"[\n, \n, \n99 g CO2/km (comb)\n]",Audi,A1,[\nAM-95365\n],2016,Black,Metallic,[\nmythosschwarz metallic\n],"Cloth, Black",[\n4\n],[\n4\n],,"[\n, Automatic, \n]","[\n1,598 cc\n]",,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.4 l/100 km (city)], [3.4 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Seat heating,Start-stop system","MP3,On-board computer","Alloy wheels,Voice Control","[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control]","[\n, Fahrzeug-Nummer: AM-95365, , Ehem. UPE 24.640 EUR, , Komfort, , Klimaanlage, , Elektrische Fensterheber, , Sitzheizung, , Tempomat, , Zentralverriegelung, , Leder-Lenkrad, , Elektrische Spiegel, , Wärmedämmendes Glas, , Anfahrassistent, , Sicherheit, , Window/Kopfairbags, , ESP (el. Stabilitäts Programm), , Elektronische Wegfahrsperre, , Reifendruckkontrolle, , Tagfahrlicht, , Exterieur, , Alufelgen 16 Zoll, , Einparkhilfe vo + hi, , Interieur, , Stoff Zeitgeist, ...",4 (Green),,,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,Audi A1,1.4 TDi Design S tronic,Sedans,14500,,73.0,08/2016,66 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,0,,Audi,A1,,2016,Brown,Metallic,,,[\n3\n],[\n4\n],,"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,,"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air suspension,Armrest,Auxiliary heating,Electrical side mirrors,Heads-up display,Hill Holder,Leather steering wheel,Light sensor,Lumbar support,Multi-function steering wheel,Navigation system,Power windows,Rain sensor,Seat heating,Split rear seats,Start-stop system","Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,Sound system,USB","Alloy wheels,Sport seats,Voice Control","[ABS, Alarm system, Central door lock with remote control, Driver drowsiness detection, Driver-side airbag, Electronic stability control, Emergency system, Head airbag, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system]","[\nAudi A1: , - 1e eigenaar , - Perfecte staat: schade/ongevalvrij , - Onderhoudsboekje Audi garage , - Full option!, - S-line opties, - Automaat, lederen bekleding, navi-pro, Xenon lampen, zetelverwarming, automatische airco,...\n]",,[\n6\n],,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,1,1,0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
4,Audi A1,"Sportback 1.4 TDI S-Tronic S-Line Ext. admired, Xe",Sedans,16790,,16.2,05/2016,66 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,1,"[\n, \n, \nEuro 6\n]",Audi,A1,[\nC1626\n],2016,Black,Metallic,[\nMythosschwarz Metallic\n],"Cloth, Black",[\n5\n],[\n5\n],[\n0588/BDF\n],"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[4.1 l/100 km (comb)], [4.6 l/100 km (city)], [3.8 l/100 km (country)]]",[\n109 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Armrest,Automatic climate control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Start-stop system","Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB","Alloy wheels,Sport package,Sport suspension,Voice Control","[ABS, Central door lock, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Technik & Sicherheit:, Xenon plus, Klimaautomatik, Einparkhilfe hinten, Scheinwerferreinigungsanlage, Nebelscheinwerfer, Zentralverriegelung mit Fernbedienung, Außenspiegel elek. einstellbar, Fensterheber elek, Start/Stop Anlage, 6 Airbag inkl. Kopfairbag, Euro 6, Multimedia:, Radio CD Concert, audi music interface, Handyvorbereitung (Bluetooth), Sprachsteuerung, USB Anschluss, Assistenzsysteme:, Lichtsensor, Regensensor, Berganfahrassistent, Reifendruckkontrolle, ESP inkl. ABS, Interie...",,,Germany,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0


In [134]:
df['safety&security'].isnull().sum()/df.shape[0]

0.06168729191532132

In [135]:
df['safety&security'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                                                                                                                                                                                                                                                                                                                        982
[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control]                                                                                                                                                 538
[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control]            

In [136]:
df['safety&security'] = df[df['safety&security'].notnull()]['safety&security'].apply(list_extractor)

In [137]:
df['safety&security'].value_counts(dropna = False)

NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                           982
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                                                                                                                  538
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobil

In [138]:
fill_most(df, 'make_model', 'safety&security')

In [139]:
df['safety&security'].value_counts(dropna = False)

ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                                                                                                                  825
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                                                                                                                             480
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lig

In [140]:
df['safety&security'].str.get_dummies(sep = ',')

Unnamed: 0,ABS,Adaptive Cruise Control,Adaptive headlights,Alarm system,Blind spot monitor,Central door lock,Central door lock with remote control,Daytime running lights,Driver drowsiness detection,Driver-side airbag,Electronic stability control,Emergency brake assistant,Emergency system,Fog lights,Head airbag,Immobilizer,Isofix,LED Daytime Running Lights,LED Headlights,Lane departure warning system,Night view assist,Passenger-side airbag,Power steering,Rear airbag,Side airbag,Tire pressure monitoring system,Traction control,Traffic sign recognition,Xenon headlights
0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1
1,1,0,0,0,0,1,1,1,0,1,1,0,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,0,1
2,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0
3,1,0,0,1,0,0,1,0,1,1,1,0,1,0,1,1,1,0,0,0,0,1,1,0,1,1,0,0,0
4,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,1,0,0,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0
15915,1,1,0,0,1,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,0,1,1,0,1,1,1,1,0
15916,1,1,0,0,1,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,0,1,1,0,1,0,1,1,0
15917,1,0,0,0,1,0,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0


In [141]:
df = df.join(df['safety&security'].str.get_dummies(sep = ',').add_prefix('ss_'))

In [142]:
df.head()

Unnamed: 0,make_model,short_description,body_type,price,vat,km,registration,hp,type,previous_owners,next_inspection,inspection_new,warranty,make,model,offer_number,first_registration,body_color,paint_type,body_color_original,upholstery,nr_of_doors,nr_of_seats,model_code,gearing_type,displacement,cylinders,drive_chain,fuel,consumption,co2_emission,emission_class,comfort&convenience,entertainment&media,extras,safety&security,description,emission_label,gears,country_version,cc_Air conditioning,cc_Air suspension,cc_Armrest,cc_Automatic climate control,cc_Auxiliary heating,cc_Cruise control,cc_Electric Starter,cc_Electric tailgate,cc_Electrical side mirrors,cc_Electrically adjustable seats,cc_Electrically heated windshield,cc_Heads-up display,cc_Heated steering wheel,cc_Hill Holder,cc_Keyless central door lock,cc_Leather seats,cc_Leather steering wheel,cc_Light sensor,cc_Lumbar support,cc_Massage seats,cc_Multi-function steering wheel,cc_Navigation system,cc_Panorama roof,cc_Park Distance Control,cc_Parking assist system camera,cc_Parking assist system self-steering,cc_Parking assist system sensors front,cc_Parking assist system sensors rear,cc_Power windows,cc_Rain sensor,cc_Seat heating,cc_Seat ventilation,cc_Split rear seats,cc_Start-stop system,cc_Sunroof,cc_Tinted windows,cc_Wind deflector,cc_Windshield,em_Bluetooth,em_CD player,em_Digital radio,em_Hands-free equipment,em_MP3,em_On-board computer,em_Radio,em_Sound system,em_Television,em_USB,ex_Alloy wheels,ex_Cab or rented Car,ex_Catalytic Converter,ex_Handicapped enabled,ex_Right hand drive,ex_Roof rack,ex_Shift paddles,ex_Ski bag,ex_Sliding door,ex_Sport package,ex_Sport seats,ex_Sport suspension,ex_Touch screen,ex_Trailer hitch,ex_Tuned car,ex_Voice Control,ex_Winter tyres,ss_ABS,ss_Adaptive Cruise Control,ss_Adaptive headlights,ss_Alarm system,ss_Blind spot monitor,ss_Central door lock,ss_Central door lock with remote control,ss_Daytime running lights,ss_Driver drowsiness detection,ss_Driver-side airbag,ss_Electronic stability control,ss_Emergency brake assistant,ss_Emergency system,ss_Fog lights,ss_Head airbag,ss_Immobilizer,ss_Isofix,ss_LED Daytime Running Lights,ss_LED Headlights,ss_Lane departure warning system,ss_Night view assist,ss_Passenger-side airbag,ss_Power steering,ss_Rear airbag,ss_Side airbag,ss_Tire pressure monitoring system,ss_Traction control,ss_Traffic sign recognition,ss_Xenon headlights
0,Audi A1,Sportback 1.4 TDI S-tronic Xenon Navi Klima,Sedans,15770,VAT deductible,56.013,01/2016,66 kW,"Used,Diesel (Particulate Filter)",2.0,2021-06-01,1,"[\n, \n, \n4 (Green)\n]",Audi,A1,[\nLR-062483\n],2016,Black,Metallic,[\nMythosschwarz\n],"Cloth, Black",[\n5\n],[\n5\n],[\n0588/BDF\n],"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Armrest,Automatic climate control,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Start-stop system","Bluetooth,Hands-free equipment,On-board computer,Radio","Alloy wheels,Catalytic Converter,Voice Control","ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control,Xenon headlights","[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,,,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1
1,Audi A1,1.8 TFSI sport,Sedans,14500,Price negotiable,80.0,03/2017,141 kW,"Used,Gasoline",1.0,NaT,0,,Audi,A1,,2017,Red,,,"Cloth, Grey",[\n3\n],[\n4\n],[\n0588/BCY\n],"[\n, Automatic, \n]","[\n1,798 cc\n]",4.0,[\nfront\n],"[\n, Gasoline, \n]","[[5.6 l/100 km (comb)], [7.1 l/100 km (city)], [4.7 l/100 km (country)]]",[\n129 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Automatic climate control,Hill Holder,Leather steering wheel,Lumbar support,Parking assist system sensors rear,Power windows,Start-stop system,Tinted windows","Bluetooth,Hands-free equipment,On-board computer,Radio,Sound system","Alloy wheels,Sport seats,Sport suspension,Voice Control","ABS,Central door lock,Central door lock with remote control,Daytime running lights,Driver-side airbag,Electronic stability control,Head airbag,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control,Xenon headlights","[\nLangstreckenfahrzeug daher die hohe Kilometerleistung. , Bremsen neu bei 62000 km. , Gut gepflegter A1\n]",4 (Green),[\n7\n],,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,1,1,0,1,1,0,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,0,1
2,Audi A1,Sportback 1.6 TDI S tronic Einparkhilfe plus+music,Sedans,14640,VAT deductible,83.45,02/2016,85 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,0,"[\n, \n, \n99 g CO2/km (comb)\n]",Audi,A1,[\nAM-95365\n],2016,Black,Metallic,[\nmythosschwarz metallic\n],"Cloth, Black",[\n4\n],[\n4\n],,"[\n, Automatic, \n]","[\n1,598 cc\n]",,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.4 l/100 km (city)], [3.4 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Seat heating,Start-stop system","MP3,On-board computer","Alloy wheels,Voice Control","ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control","[\n, Fahrzeug-Nummer: AM-95365, , Ehem. UPE 24.640 EUR, , Komfort, , Klimaanlage, , Elektrische Fensterheber, , Sitzheizung, , Tempomat, , Zentralverriegelung, , Leder-Lenkrad, , Elektrische Spiegel, , Wärmedämmendes Glas, , Anfahrassistent, , Sicherheit, , Window/Kopfairbags, , ESP (el. Stabilitäts Programm), , Elektronische Wegfahrsperre, , Reifendruckkontrolle, , Tagfahrlicht, , Exterieur, , Alufelgen 16 Zoll, , Einparkhilfe vo + hi, , Interieur, , Stoff Zeitgeist, ...",4 (Green),,,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0
3,Audi A1,1.4 TDi Design S tronic,Sedans,14500,,73.0,08/2016,66 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,0,,Audi,A1,,2016,Brown,Metallic,,,[\n3\n],[\n4\n],,"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,,"[\n, Diesel (Particulate Filter), \n]","[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]",[\n99 g CO2/km (comb)\n],[\nEuro 6\n],"Air suspension,Armrest,Auxiliary heating,Electrical side mirrors,Heads-up display,Hill Holder,Leather steering wheel,Light sensor,Lumbar support,Multi-function steering wheel,Navigation system,Power windows,Rain sensor,Seat heating,Split rear seats,Start-stop system","Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,Sound system,USB","Alloy wheels,Sport seats,Voice Control","ABS,Alarm system,Central door lock with remote control,Driver drowsiness detection,Driver-side airbag,Electronic stability control,Emergency system,Head airbag,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system","[\nAudi A1: , - 1e eigenaar , - Perfecte staat: schade/ongevalvrij , - Onderhoudsboekje Audi garage , - Full option!, - S-line opties, - Automaat, lederen bekleding, navi-pro, Xenon lampen, zetelverwarming, automatische airco,...\n]",,[\n6\n],,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,1,1,0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,1,0,1,1,1,0,0,0,0,1,1,0,1,1,0,0,0
4,Audi A1,"Sportback 1.4 TDI S-Tronic S-Line Ext. admired, Xe",Sedans,16790,,16.2,05/2016,66 kW,"Used,Diesel (Particulate Filter)",1.0,NaT,1,"[\n, \n, \nEuro 6\n]",Audi,A1,[\nC1626\n],2016,Black,Metallic,[\nMythosschwarz Metallic\n],"Cloth, Black",[\n5\n],[\n5\n],[\n0588/BDF\n],"[\n, Automatic, \n]","[\n1,422 cc\n]",3.0,[\nfront\n],"[\n, Diesel (Particulate Filter), \n]","[[4.1 l/100 km (comb)], [4.6 l/100 km (city)], [3.8 l/100 km (country)]]",[\n109 g CO2/km (comb)\n],[\nEuro 6\n],"Air conditioning,Armrest,Automatic climate control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Start-stop system","Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB","Alloy wheels,Sport package,Sport suspension,Voice Control","ABS,Central door lock,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control,Xenon headlights","[\n, Technik & Sicherheit:, Xenon plus, Klimaautomatik, Einparkhilfe hinten, Scheinwerferreinigungsanlage, Nebelscheinwerfer, Zentralverriegelung mit Fernbedienung, Außenspiegel elek. einstellbar, Fensterheber elek, Start/Stop Anlage, 6 Airbag inkl. Kopfairbag, Euro 6, Multimedia:, Radio CD Concert, audi music interface, Handyvorbereitung (Bluetooth), Sprachsteuerung, USB Anschluss, Assistenzsysteme:, Lichtsensor, Regensensor, Berganfahrassistent, Reifendruckkontrolle, ESP inkl. ABS, Interie...",,,Germany,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1


In [143]:
df.shape

(15919, 134)

In [144]:
df.consumption.head()

0    [[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]
1    [[5.6 l/100 km (comb)], [7.1 l/100 km (city)], [4.7 l/100 km (country)]]
2    [[3.8 l/100 km (comb)], [4.4 l/100 km (city)], [3.4 l/100 km (country)]]
3    [[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]
4    [[4.1 l/100 km (comb)], [4.6 l/100 km (city)], [3.8 l/100 km (country)]]
Name: consumption, dtype: object

In [145]:
df['consumption'].isnull().sum()/df.shape[0]

0.11973113889063383

In [146]:
df.consumption.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                   1906
[[3.9 l/100 km (comb)], [4.1 l/100 km (city)], [3.7 l/100 km (country)]]               304
[[4.2 l/100 km (comb)], [5 l/100 km (city)], [3.7 l/100 km (country)]]                 276
[[5.4 l/100 km (comb)], [6.8 l/100 km (city)], [4.5 l/100 km (country)]]               257
[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]               253
                                                                                      ... 
[[3.6 l/100 km (comb)], [], [4.4 l/100 km (country)]]                                    1
[\n, 4.8 l/100 km (comb), \n, 5.6 l/100 km (city), \n, 4.3 l/100 km (country), \n]       1
[[7.6 l/100 km (comb)], [], []]                                                          1
[[5.6 l/100 km (comb)], [7.6 l/100 km (city)], [4.4 l/100 km (country)]]                 1
[\n, 4.7 l/100 km (comb), \n, \n, \n]                                                    1

In [147]:
df.consumption.sample(55)

11187              [[4.3 l/100 km (comb)], [5.5 l/100 km (city)], [3.6 l/100 km (country)]]
2156               [[4.8 l/100 km (comb)], [5.8 l/100 km (city)], [4.2 l/100 km (country)]]
815                  [[4.2 l/100 km (comb)], [5 l/100 km (city)], [3.7 l/100 km (country)]]
5211               [[3.9 l/100 km (comb)], [4.1 l/100 km (city)], [3.7 l/100 km (country)]]
2479                 [[4.9 l/100 km (comb)], [6 l/100 km (city)], [4.2 l/100 km (country)]]
1350               [[3.7 l/100 km (comb)], [4.2 l/100 km (city)], [3.4 l/100 km (country)]]
13393              [[3.5 l/100 km (comb)], [3.9 l/100 km (city)], [3.3 l/100 km (country)]]
9957               [[6.2 l/100 km (comb)], [7.9 l/100 km (city)], [5.3 l/100 km (country)]]
2798               [[3.9 l/100 km (comb)], [4.1 l/100 km (city)], [3.7 l/100 km (country)]]
8531                 [[4.7 l/100 km (comb)], [5.8 l/100 km (city)], [4 l/100 km (country)]]
4522               [[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km 

In [148]:
df.consumption.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                   1906
[[3.9 l/100 km (comb)], [4.1 l/100 km (city)], [3.7 l/100 km (country)]]               304
[[4.2 l/100 km (comb)], [5 l/100 km (city)], [3.7 l/100 km (country)]]                 276
[[5.4 l/100 km (comb)], [6.8 l/100 km (city)], [4.5 l/100 km (country)]]               257
[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]               253
                                                                                      ... 
[[3.6 l/100 km (comb)], [], [4.4 l/100 km (country)]]                                    1
[\n, 4.8 l/100 km (comb), \n, 5.6 l/100 km (city), \n, 4.3 l/100 km (country), \n]       1
[[7.6 l/100 km (comb)], [], []]                                                          1
[[5.6 l/100 km (comb)], [7.6 l/100 km (city)], [4.4 l/100 km (country)]]                 1
[\n, 4.7 l/100 km (comb), \n, \n, \n]                                                    1

In [149]:
df.consumption.str[0].str[0].str.extract("(\d.\d|\d)")[0].unique()

array(['3.8', '5.6', '4.1', '3.5', '3.7', nan, '4', '4.9', '4.2', '4.5',
       '4.4', '4.3', '3', '3.6', '3.4', '3.9', '5.1', '5.2', '4.6', '4.8',
       '5.8', '4.7', '5', '1', '6', '5.9', '5.3', '5.5', '3.3', '3.2',
       '6.6', '8.3', '6.5', '7.1', '8.1', '5.4', '6.4', '6.7', '6.2',
       '7.3', '6.3', '5.7', '6.1', '6.8', '7.5', '7.4', '0', '7.8', '3.1',
       '6.9', '7', '7.2', '8', '9.1', '8.6', '8.7', '7.9', '1.2', '7.6'],
      dtype=object)

In [150]:
df.consumption.str[0].str[0].str.extract("(^\d+.\d*)")[0].unique()

array(['3.8', '5.6', '4.1', '3.5', '3.7', nan, '4 ', '4.9', '4.2', '4.5',
       '4.4', '4.3', '3 ', '3.6', '3.4', '3.9', '5.1', '5.2', '4.6',
       '4.8', '5.8', '4.7', '5 ', '43 ', '13.8', '6 ', '5.9', '51 ',
       '5.3', '5.5', '10 ', '3.3', '3.2', '6.6', '8.3', '6.5', '7.1',
       '8.1', '5.4', '40 ', '38 ', '6.4', '6.7', '6.2', '7.3', '6.3',
       '5.7', '6.1', '6.8', '7.5', '7.4', '0 ', '7.8', '3.1', '6.9', '7 ',
       '7.2', '8 ', '11 ', '9.1', '8.6', '8.7', '7.9', '55 ', '54 ',
       '1.2', '32 ', '33 ', '50 ', '1 ', '46 ', '7.6'], dtype=object)

In [151]:
df['consumption_comb'] = df.consumption.str[0].str[0].str.extract("(\d.\d|\d)")

In [152]:
df.consumption_comb.value_counts(dropna = False)

NaN    2883
3.9     732
4       716
5.4     663
5.1     630
4.4     595
3.8     585
5.6     568
4.7     555
4.8     523
5       520
4.5     511
5.2     429
4.6     420
4.2     408
5.3     375
3.7     369
4.9     348
5.5     342
4.1     341
5.9     308
3.3     307
5.7     302
4.3     295
3.5     288
6       278
3.6     194
6.2     184
5.8     142
6.3     141
6.1     135
6.8     134
6.6     119
3.4     106
3        73
6.4      64
7.4      62
7.1      38
1        37
6.5      36
6.7      33
3.2      25
6.9      21
8.3      20
7.6      14
7         9
3.1       7
7.2       6
7.8       6
8         5
8.7       3
8.6       3
7.3       2
8.1       2
0         2
7.9       2
1.2       1
9.1       1
7.5       1
Name: consumption_comb, dtype: int64

In [153]:
df.consumption.str[1].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                       1906
[5 l/100 km (city)]        642
[5.8 l/100 km (city)]      597
[4.5 l/100 km (city)]      577
[]                         516
                          ... 
[10.4 l/100 km (city)]       1
[4.4 kg/100 km (city)]       1
[66 l/100 km (city)]         1
7 l/100 km (comb)            1
[6.5 kg/100 km (city)]       1
Name: consumption, Length: 130, dtype: int64

In [154]:
df.consumption.str[1].str.extract("(\d.\d|\d)")[0].value_counts(dropna = False)

NaN    15069
5.4      107
5.9       61
6         53
5.6       50
4.7       47
4.9       45
5.7       40
6.1       40
6.3       40
5.5       38
6.2       32
5         32
6.6       29
4.4       28
5.1       27
4.2       27
5.2       25
4.8       23
5.8       22
4.5       12
4.3       12
6.4       11
6.7       10
6.5        7
4.6        6
6.9        6
5.3        5
7.4        4
1.6        3
6.8        2
4          1
3.9        1
4.1        1
7.9        1
7          1
8.6        1
Name: 0, dtype: int64

In [155]:
consumption1 = df.consumption.str[1].str.extract("(\d.\d|\d)")

In [156]:
df.consumption_comb.fillna(consumption1[0], inplace = True)

In [157]:
df.consumption.str[-2].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                        1906
[5 l/100 km (city)]         642
[5.8 l/100 km (city)]       597
[4.5 l/100 km (city)]       577
[]                          516
                           ... 
[10.4 l/100 km (city)]        1
[10.5 kg/100 km (city)]       1
[6.8 kg/100 km (city)]        1
6.1 l/100 km (country)        1
[9 l/100 km (city)]           1
Name: consumption, Length: 121, dtype: int64

In [158]:
df.consumption.str[-2].str[0].value_counts(dropna = False)

NaN                     2422
5 l/100 km (city)        642
5.8 l/100 km (city)      597
4.5 l/100 km (city)      577
4                        532
                        ... 
6.5 kg/100 km (city)       1
7                          1
9 l/100 km (city)          1
4.4 kg/100 km (city)       1
64 l/100 km (city)         1
Name: consumption, Length: 99, dtype: int64

In [159]:
df.consumption.str[-2].str[0].unique()

array(['4.3 l/100 km (city)', '7.1 l/100 km (city)',
       '4.4 l/100 km (city)', '4.6 l/100 km (city)',
       '4.2 l/100 km (city)', nan, '6.2 l/100 km (city)',
       '5 l/100 km (city)', '5.5 l/100 km (city)', '5.4 l/100 km (city)',
       '3.8 l/100 km (city)', '5.2 l/100 km (city)', '3 l/100 km (city)',
       '4 l/100 km (city)', '3.9 l/100 km (city)', '4.1 l/100 km (city)',
       '3', '4.7 l/100 km (city)', '6.3 l/100 km (city)',
       '6.7 l/100 km (city)', '5.6 l/100 km (city)',
       '4.8 l/100 km (city)', '6.6 l/100 km (city)',
       '4.5 l/100 km (city)', '5.8 l/100 km (city)', '4',
       '7.4 l/100 km (city)', '6.1 l/100 km (city)',
       '3.7 l/100 km (city)', '5.1 l/100 km (city)',
       '19.9 l/100 km (city)', '6.4 l/100 km (city)',
       '8.2 l/100 km (city)', '6 l/100 km (city)', '6.5 l/100 km (city)',
       '5.9 l/100 km (city)', '8 l/100 km (city)', '62 l/100 km (city)',
       '6.8 l/100 km (city)', '7.5 l/100 km (city)',
       '3.5 l/100 km (city)', '5

In [160]:
df.consumption.str[-2].str[0].str.extract("(\d.\d|\d)")[0].value_counts(dropna = False)

NaN    2436
5       858
4       836
5.8     600
4.5     580
       ... 
8.8       2
3.6       2
9.5       2
9.7       1
9         1
Name: 0, Length: 70, dtype: int64

In [161]:
consumption_city = df.consumption.str[-2].str[0].str.extract("(\d.\d|\d)")

In [162]:
consumption_city = pd.to_numeric(consumption_city[0])

In [163]:
consumption_country = df.consumption.str[-1].str[0].str.extract("(\d.\d|\d)")

In [164]:
consumption_country[0].value_counts(dropna = False)

NaN    3212
4.2    1101
3.7    1059
4.4     829
4.5     789
3.8     775
3.9     759
4.1     569
4.7     562
4       560
3.5     525
4.3     496
3.6     486
3.1     481
3.3     456
4.6     446
4.9     410
3.4     353
4.8     289
5.3     227
5.1     215
5.7     191
5.4     190
3.2     180
3       146
5.6     131
5       106
5.2      84
6.3      49
6        35
1        34
5.5      30
5.8      30
7.7      20
6.6      19
2.9      16
6.4      15
2.8       9
0         9
6.5       4
7.3       4
7.1       3
7         2
5.9       2
6.9       2
7.8       2
6.7       2
7.6       1
2         1
8.6       1
8         1
6.1       1
Name: 0, dtype: int64

In [165]:
consumption_country = pd.to_numeric(consumption_country[0])

In [166]:
consumption_country

0        3.5
1        4.7
2        3.4
3        3.5
4        3.8
        ... 
15914    4.7
15915    NaN
15916    4.7
15917    4.7
15918    5.7
Name: 0, Length: 15919, dtype: float64

In [167]:
consumption_city

0        4.3
1        7.1
2        4.4
3        4.3
4        4.6
        ... 
15914    6.2
15915    6.0
15916    6.2
15917    6.2
15918    8.7
Name: 0, Length: 15919, dtype: float64

In [168]:
consumption2 = (consumption_country + consumption_city)/2

In [169]:
consumption2.value_counts(dropna = False)

NaN     3283
3.90     610
5.65     498
4.75     497
4.10     439
        ... 
7.70       1
7.25       1
7.45       1
7.20       1
7.35       1
Name: 0, Length: 139, dtype: int64

In [170]:
df.consumption_comb.fillna(consumption2).value_counts(dropna = False)

NaN     1925
5.4      770
3.9      733
4        717
5.1      657
        ... 
4.5        1
7.5        1
5.8        1
4.55       1
9.1        1
Name: consumption_comb, Length: 70, dtype: int64

In [171]:
df.consumption_comb.fillna(consumption2, inplace = True)

In [172]:
df.consumption_comb.value_counts(dropna = False)

NaN     1925
5.4      770
3.9      733
4        717
5.1      657
        ... 
4.5        1
7.5        1
5.8        1
4.55       1
9.1        1
Name: consumption_comb, Length: 70, dtype: int64

In [173]:
df.gearing_type.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Manual, \n]            8153
[\n, Automatic, \n]         7297
[\n, Semi-automatic, \n]     469
Name: gearing_type, dtype: int64

In [174]:
df.gearing_type.str[1].value_counts(dropna = False)

Manual            8153
Automatic         7297
Semi-automatic     469
Name: gearing_type, dtype: int64

In [175]:
df.gearing_type = df.gearing_type.str[1]

In [176]:
df.gearing_type.value_counts(dropna = False)

Manual            8153
Automatic         7297
Semi-automatic     469
Name: gearing_type, dtype: int64

In [177]:
df.drive_chain.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nfront\n]    8886
NaN            6858
[\n4WD\n]       171
[\nrear\n]        4
Name: drive_chain, dtype: int64

In [178]:
df.drive_chain.str[0].str.strip().str.strip('\n').value_counts(dropna = False)

front    8886
NaN      6858
4WD       171
rear        4
Name: drive_chain, dtype: int64

In [179]:
df.drive_chain = df.drive_chain.str[0].str.strip().str.strip('\n')

In [180]:
df.drive_chain.value_counts(dropna = False)

front    8886
NaN      6858
4WD       171
rear        4
Name: drive_chain, dtype: int64

In [181]:
df.model_code.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN               10941
[\n0035/BCB\n]      268
[\n0588/BNO\n]      245
[\n0588/BDB\n]      206
[\n0588/BHX\n]      188
                  ...  
[\n1844/AEN\n]        1
[\n0588/AVR\n]        1
[\n0035/AFF\n]        1
[\n1844/AFM\n]        1
[\n0035/AVN\n]        1
Name: model_code, Length: 233, dtype: int64

In [182]:
df.model_code.isnull().sum()/df.shape[0]

0.6872919153213142

In [183]:
df.drop(columns='model_code', inplace = True)

In [184]:
df.body_color.str.contains(' ').sum()

0

In [185]:
df.body_color.isnull().sum()/df.shape[0]

0.037502355675607765

In [186]:
df.body_color.fillna(method='bfill').value_counts(dropna = False)

Black     3888
Grey      3638
White     3540
Silver    1687
Blue      1524
Red        989
Brown      299
Green      163
Beige      108
Yellow      53
Violet      18
Bronze       7
Orange       3
Gold         2
Name: body_color, dtype: int64

In [187]:
df.body_color.fillna(method='bfill', inplace = True)

In [188]:
df.body_color_original.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                             3759
[\nOnyx Schwarz\n]               338
[\nBianco\n]                     282
[\nMythosschwarz Metallic\n]     238
[\nBrillantschwarz\n]            216
                                ... 
[\nBianca - Tetto Nero\n]          1
[\ndezir rot\n]                    1
[\nnero/tetto argento met\n]       1
[\nPython Yellow Metallic\n]       1
[\nkarbongrau\n]                   1
Name: body_color_original, Length: 1928, dtype: int64

In [189]:
df.body_color_original.isnull().sum()/df.shape[0]

0.23613292292229413

In [190]:
df.drop(columns='body_color_original', inplace = True)

In [191]:
df.co2_emission.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                            1808
[\n120 g CO2/km (comb)\n]       740
[[], [], []]                    628
[\n99 g CO2/km (comb)\n]        545
[\n97 g CO2/km (comb)\n]        537
                               ... 
[\n193 g CO2/km (comb)\n]         1
[\n171 g CO2/km (comb)\n]         1
[\n990 g CO2/km (comb)\n]         1
[\n1,060 g CO2/km (comb)\n]       1
[\n183 g CO2/km (comb)\n]         1
Name: co2_emission, Length: 124, dtype: int64

In [192]:
df.co2_emission.isnull().sum()/df.shape[0]

0.11357497330234312

In [193]:
pattern = '(\d\d\d\d\d|\d\d\d\d|\d\d\d|\d\d|\d)'

In [194]:
df['co2_emission_new'] = df.co2_emission.str[0].str.strip('\n').str.replace(',','').str.extract(pattern)

In [195]:
df.co2_emission_new.value_counts(dropna = False)

NaN    2436
120     740
99      545
97      537
104     501
       ... 
14        1
181       1
239       1
177       1
1         1
Name: co2_emission_new, Length: 123, dtype: int64

In [196]:
df.co2_emission_new.unique()

array(['99', '129', '109', '92', '98', '97', nan, '105', '112', '103',
       '102', '95', '104', '91', '94', '117', '123', '106', '108', '121',
       '107', '101', '113', '137', '100', '116', '114', '118', '331',
       '115', '119', '90', '136', '134', '110', '111', '120', '89', '142',
       '126', '122', '128', '127', '138', '130', '125', '85', '124',
       '152', '88', '189', '194', '149', '153', '188', '36', '1060', '96',
       '990', '146', '135', '158', '12087', '141', '172', '154', '150',
       '167', '174', '93', '133', '131', '145', '147', '156', '87', '5',
       '148', '139', '151', '144', '168', '160', '170', '80', '132',
       '155', '14', '159', '0', '143', '140', '82', '12324', '84', '165',
       '51', '157', '169', '166', '253', '164', '175', '190', '161',
       '239', '197', '184', '14457', '199', '13983', '187', '181', '186',
       '177', '180', '162', '45', '171', '183', '1', '191', '193'],
      dtype=object)

In [197]:
df.co2_emission_new.median()

116.0

In [198]:
df.co2_emission_new.fillna(df.co2_emission_new.median(), inplace = True)

In [199]:
df.co2_emission_new.value_counts(dropna = False)

116.0    2436
120       740
99        545
97        537
104       501
         ... 
180         1
14          1
1060        1
14457       1
184         1
Name: co2_emission_new, Length: 123, dtype: int64

In [200]:
df.displacement.isnull().sum()/df.shape[0]

0.0311577360386959

In [201]:
df.displacement.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n1,598 cc\n]     4761
[\n999 cc\n]       2438
[\n1,398 cc\n]     1314
[\n1,399 cc\n]      749
[\n1,229 cc\n]      677
                   ... 
[\n1,800 cc\n]        1
[\n140 cc\n]          1
[\n15,898 cc\n]       1
[\n1,686 cc\n]        1
[\n1,368 cc\n]        1
Name: displacement, Length: 78, dtype: int64

In [202]:
df['displacement_cc'] = df.displacement.str[0].str.strip('\n').str.replace('cc', '').str.replace(',','').str.strip()

In [203]:
df.displacement_cc = pd.to_numeric(df.displacement_cc)

In [205]:
df.displacement_cc

0        1422.0
1        1798.0
2        1598.0
3        1422.0
4        1422.0
          ...  
15914    1997.0
15915    1798.0
15916    1997.0
15917    1997.0
15918    1798.0
Name: displacement_cc, Length: 15919, dtype: float64

In [207]:
df.emission_class.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nEuro 6\n]          10139
NaN                    3021
[\nEuro 6d-TEMP\n]     1845
[[], [], []]            607
[\nEuro 6c\n]           127
[\nEuro 5\n]             78
[\nEuro 6d\n]            62
[\nEuro 4\n]             40
Name: emission_class, dtype: int64

In [208]:
df.emission_class.str[0].str.strip('\n').value_counts(dropna = False)

Euro 6          10139
NaN              3628
Euro 6d-TEMP     1845
Euro 6c           127
Euro 5             78
Euro 6d            62
Euro 4             40
Name: emission_class, dtype: int64

In [209]:
df['emission_class_new'] = df.emission_class.str[0].str.strip('\n')

In [210]:
df.replace({'emission_class_new': {'Euro 6d-TEMP': 'Euro 6', 'Euro 6c': 'Euro 6', 'Euro 6d': 'Euro 6'}}, regex = True, inplace = True)

In [211]:
df.emission_class_new.value_counts(dropna = False)

Euro 6    12173
NaN        3628
Euro 5       78
Euro 4       40
Name: emission_class_new, dtype: int64

In [29]:
df["first_registration"] = df["first_registration"].apply(model_extractor)

In [212]:
df.first_registration.isnull().sum()/df.shape[0]

0.10032037188265594

In [216]:
df.first_registration = pd.to_numeric(df.first_registration)

In [217]:
df.first_registration

0        2016.0
1        2017.0
2        2016.0
3        2016.0
4        2016.0
          ...  
15914       NaN
15915    2019.0
15916    2019.0
15917    2019.0
15918    2019.0
Name: first_registration, Length: 15919, dtype: float64

In [219]:
df['register_age'] = 2020 - df.first_registration

In [220]:
df.register_age.value_counts(dropna = False)

2.0    4522
4.0    3674
3.0    3273
1.0    2853
NaN    1597
Name: first_registration, dtype: int64

In [222]:
df.fuel.isnull().sum()/df.shape[0]

0.0

In [223]:
df.fuel.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Diesel (Particulate Filter), \n]                                                                                  4315
[\n, Super 95, \n]                                                                                                     3338
[\n, Gasoline, \n]                                                                                                     3175
[\n, Diesel, \n]                                                                                                       2984
[\n, Super 95 / Regular/Benzine 91, \n]                                                                                 424
                                                                                                                       ... 
[\n, Super Plus 98 / Super E10 95, \n]                                                                                    1
[\n, Regular/Benzine 91 / Super 95 / Regular/Benzine E10 91 / Super E10 95 / Super Plus 98 / Super Plus E10 98, \n]       1
[\n, Sup