In [47]:
# import libraries
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [21]:
# get dataset files

cars_listing_1_dir = '../datasets/car-listing-1'
cars_listing_2_dir = '../datasets/car-listing-2'

def get_csvs_in_dir(dir_path):
    return [join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith('.csv')]


cars_1_files = get_csvs_in_dir(cars_listing_1_dir)
cars_2_files = get_csvs_in_dir(cars_listing_2_dir)


In [22]:
# peek at cars 1 datasets
df_audi = pd.read_csv(cars_1_files[0])
print(df_audi.head())

df_bmw = pd.read_csv(cars_1_files[1])
print(df_bmw.head())

df_ford = pd.read_csv(cars_1_files[2])
print(df_ford.head())

df_hyundi = pd.read_csv(cars_1_files[3])
print(df_hyundi.head())

df_merc = pd.read_csv(cars_1_files[4])
print(df_merc.head())

df_toyota = pd.read_csv(cars_1_files[5])
print(df_toyota.head())

       model  year  price transmission  mileage fuelType  tax   mpg  \
0   5 Series  2014  11200    Automatic    67068   Diesel  125  57.6   
1   6 Series  2018  27000    Automatic    14827   Petrol  145  42.8   
2   5 Series  2016  16000    Automatic    62794   Diesel  160  51.4   
3   1 Series  2017  12750    Automatic    26676   Diesel  145  72.4   
4   7 Series  2014  14500    Automatic    39554   Diesel  160  50.4   

   engineSize  
0         2.0  
1         2.0  
2         3.0  
3         1.5  
4         3.0  
     model  year  price transmission  mileage fuelType  tax   mpg  engineSize
0      I20  2017   7999       Manual    17307   Petrol  145  58.9         1.2
1   Tucson  2016  14499    Automatic    25233   Diesel  235  43.5         2.0
2   Tucson  2016  11399       Manual    37877   Diesel   30  61.7         1.7
3      I10  2016   6499       Manual    23789   Petrol   20  60.1         1.0
4     IX35  2015  10199       Manual    33177   Diesel  160  51.4         2.0
  model  

In [51]:
# add column for brand of car
df_audi['brand']   = 'audi'
df_bmw['brand']    = 'bmw'
df_ford['brand']   = 'ford'
df_hyundi['brand'] = 'hyundi'
df_merc['brand']   = 'merc'
df_toyota['brand'] = 'toyota'

# concatenate all dataframes together
df_cars_1 = pd.concat([df_audi, df_bmw, df_ford, df_hyundi, df_merc, df_toyota])

# change column order to something that allows us to split it easier later on
df_cars_1 = df_cars_1[['brand', 'model', 'transmission', 'fuelType','year', 'mileage', 'tax', 'mpg', 'engineSize', 'price']]
cars_1_y = df_cars_1.pop('price').to_numpy()
cars_1_X = df_cars_1.to_numpy()


In [58]:
# temporarily separate categorical cols from numerical
num_cols = cars_1_X[:,4:]
cat_cols = cars_1_X[:, :4]

# One-Hot Encode string values
enc = OneHotEncoder(sparse=False)
cat_cols_enc = enc.fit_transform(cat_cols)

cars_1_X_enc = np.hstack((cat_cols_enc, num_cols))
print(cars_1_X_enc.shape)
print(cars_1_X_enc[:10])


(64131, 154)
[[1.0 0.0 0.0 ... 125 57.6 2.0]
 [1.0 0.0 0.0 ... 145 42.8 2.0]
 [1.0 0.0 0.0 ... 160 51.4 3.0]
 ...
 [1.0 0.0 0.0 ... 145 52.3 1.5]
 [1.0 0.0 0.0 ... 30 62.8 2.0]
 [1.0 0.0 0.0 ... 20 68.9 2.0]]
