In [1]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

In [2]:
df = pd.read_csv('carsnew.csv', low_memory=False)
df.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [3]:
for col in df.columns:
    print(col, df[col].nunique())

manufacturer_name 55
model_name 1118
transmission 2
color 12
odometer_value 6063
year_produced 64
engine_fuel 6
engine_has_gas 2
engine_type 3
engine_capacity 61
body_type 12
has_warranty 2
state 3
drivetrain 3
price_usd 2677
is_exchangeable 2
location_region 6
number_of_photos 61
up_counter 384
feature_0 2
feature_1 2
feature_2 2
feature_3 2
feature_4 2
feature_5 2
feature_6 2
feature_7 2
feature_8 2
feature_9 2
duration_listed 786


Of the features that deal with categorical data, there are two that have far more unique values than the rest are
manufacturer_name and model_name. The rest of the columns we can either one hot if there are more than two unique
values or simply replace the values with 0 or 1 if there are only 2 unique values.

In [4]:
#These columns all only have two unique values so we can replace their values with 0 and 1
binary_cols = ['engine_has_gas', 'transmission', 'has_warranty', 'is_exchangeable']
for i in range(10):
    binary_cols.append(f'feature_{i}')
print(binary_cols)

['engine_has_gas', 'transmission', 'has_warranty', 'is_exchangeable', 'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']


In [5]:
sparse_cols = ['color','engine_fuel','engine_type','body_type','state','drivetrain','location_region']

In [6]:
hasher = FeatureHasher(n_features=10, input_type='string')
model_t = hasher.transform(df['model_name'])
manu_t = hasher.transform(df['manufacturer_name'])
manu_t

<38531x10 sparse matrix of type '<class 'numpy.float64'>'
	with 163106 stored elements in Compressed Sparse Row format>

In [7]:
d = pd.get_dummies(df, columns=sparse_cols)
d.head()

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,...,state_owned,drivetrain_all,drivetrain_front,drivetrain_rear,location_region_Брестская обл.,location_region_Витебская обл.,location_region_Гомельская обл.,location_region_Гродненская обл.,location_region_Минская обл.,location_region_Могилевская обл.
0,Subaru,Outback,automatic,190000,2010,False,2.5,False,10900.0,False,...,1,1,0,0,0,0,0,0,1,0
1,Subaru,Outback,automatic,290000,2002,False,3.0,False,5000.0,True,...,1,1,0,0,0,0,0,0,1,0
2,Subaru,Forester,automatic,402000,2001,False,2.5,False,2800.0,True,...,1,1,0,0,0,0,0,0,1,0
3,Subaru,Impreza,mechanical,10000,1999,False,3.0,False,9999.0,True,...,1,1,0,0,0,0,0,0,1,0
4,Subaru,Legacy,automatic,280000,2001,False,2.5,False,2134.11,True,...,1,1,0,0,0,0,1,0,0,0


In [8]:
for b in binary_cols:
    unique = d[b].unique()
    d[b] = d[b].replace([unique[0],unique[1]], [0, 1])
len(d)

38531

In [9]:
hasher = FeatureHasher(n_features=10, input_type='string')
model_t = hasher.transform(d['model_name'])
model_df = pd.DataFrame(model_t.toarray(), columns=[f"md_{i}" for i in range(10)])
n_df = pd.concat([d,model_df], axis=1)
manu_t = hasher.transform(d['manufacturer_name'])
manu_df = pd.DataFrame(manu_t.toarray(), columns=[f"mn_{i}" for i in range(10)])
new_df = pd.concat([n_df,manu_df], axis=1)
new_df.head()

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,...,mn_0,mn_1,mn_2,mn_3,mn_4,mn_5,mn_6,mn_7,mn_8,mn_9
0,Subaru,Outback,0,190000,2010,0,2.5,0,10900.0,0,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
1,Subaru,Outback,0,290000,2002,0,3.0,0,5000.0,1,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
2,Subaru,Forester,0,402000,2001,0,2.5,0,2800.0,1,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
3,Subaru,Impreza,1,10000,1999,0,3.0,0,9999.0,1,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
4,Subaru,Legacy,0,280000,2001,0,2.5,0,2134.11,1,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0


In [10]:
new_df.drop('manufacturer_name', inplace=True,axis=1)
new_df.head()

Unnamed: 0,model_name,transmission,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,number_of_photos,...,mn_0,mn_1,mn_2,mn_3,mn_4,mn_5,mn_6,mn_7,mn_8,mn_9
0,Outback,0,190000,2010,0,2.5,0,10900.0,0,9,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
1,Outback,0,290000,2002,0,3.0,0,5000.0,1,12,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
2,Forester,0,402000,2001,0,2.5,0,2800.0,1,4,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
3,Impreza,1,10000,1999,0,3.0,0,9999.0,1,9,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
4,Legacy,0,280000,2001,0,2.5,0,2134.11,1,14,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0


In [11]:
new_df.drop('model_name', inplace=True,axis=1)
new_df.head()

Unnamed: 0,transmission,odometer_value,year_produced,engine_has_gas,engine_capacity,has_warranty,price_usd,is_exchangeable,number_of_photos,up_counter,...,mn_0,mn_1,mn_2,mn_3,mn_4,mn_5,mn_6,mn_7,mn_8,mn_9
0,0,190000,2010,0,2.5,0,10900.0,0,9,13,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
1,0,290000,2002,0,3.0,0,5000.0,1,12,54,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
2,0,402000,2001,0,2.5,0,2800.0,1,4,72,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
3,1,10000,1999,0,3.0,0,9999.0,1,9,42,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
4,0,280000,2001,0,2.5,0,2134.11,1,14,7,...,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0
