In [302]:
import os
import csv
import pyarrow.parquet as pq

import pandas as pd
import numpy as np

In [303]:

# Define path to data
path_data = "../data/Scraped data/May 2023"

# One entry per CSV file: the list of that file's data rows (header excluded)
data_list = []

# os.listdir returns names in arbitrary order. Sort them so the "keep first"
# de-duplication further down retains the same row on every run.
for file in sorted(os.listdir(path_data)):
    # Only load real CSV files; a substring test would also match e.g. "x.csv.bak"
    if not file.endswith(".csv"):
        continue
    with open(os.path.join(path_data, file), newline='') as f:
        # Append the file stem as an extra last column to every row (header included)
        reader_data = [row + [file.split(".")[0]] for row in csv.reader(f)]
    # First row is the header. The header of the last file read is the one kept,
    # so all files are assumed to share the same column layout.
    col_names = reader_data[0]
    data_list.append(reader_data[1:])

if not data_list:
    raise ValueError(f"No .csv files found in {path_data!r}")

# Concatenate the rows of all files into one dataframe with a fresh 0..n-1 index
data = pd.concat([pd.DataFrame(rows) for rows in data_list], ignore_index=True)

# Drop the first column (per the original note: the index saved as a column)
# and the last column (position 11), which is the file stem appended in the
# loop above. The remaining ten columns get their names from the header row.
data = data.drop(columns=[0, 11])
data.columns = col_names[1:11]

print(data.shape)

# Drop duplicates: keep the first row seen for each carId
data = data.drop_duplicates(subset="carId").reset_index(drop=True)

print(data.shape)

# Show the first five rows of data
data.head()


(24283, 10)
(14427, 10)


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
0,3 Series,2017,17950,Automatic,77800,Diesel,62.8,2.0,bmw,29844709
1,X3,2008,4250,Automatic,108000,Diesel,42.2,2.0,bmw,29790037
2,X1,2015,13699,Automatic,50409,Diesel,52.3,2.0,bmw,29991954
3,4 Series,2017,18890,Automatic,59897,Petrol,45.6,2.0,bmw,30024312
4,3 Series,2016,18699,Automatic,61742,Diesel,57.6,2.0,bmw,29991955


In [304]:
# Remove cars of fuel type equal to 'Petrol/Electric'
data = data[data["fuelType"] != "Petrol/Electric"].reset_index(drop=True)

# Boolean ev_mask: True for rows where any field contains the word "Electric".
# na=False makes non-string cells count as "no match" instead of producing NaN.
ev_mask = data.apply(lambda x: x.str.contains('Electric', case=False, na=False)).any(axis=1)

# For these rows the mileage value is taken from the "mpg" column.
# Rows whose mpg is already the numeric 0 written further down were fixed by an
# earlier run of this cell; copying again would overwrite their mileage with 0,
# so they are skipped. On a first run mpg is not yet numeric 0, so every
# ev_mask row is processed.
needs_fix = ev_mask & data["mpg"].ne(0)
ev_mileage = data.loc[needs_fix, "mpg"]
data.loc[needs_fix, "mileage"] = ev_mileage

# Clean data from electric cars. Because some fields such as mpg or engineSize
# don't really apply to them, the scraping tool didn't get the information right
data.loc[ev_mask, "transmission"] = "Automatic"
data.loc[ev_mask, "mpg"] = 0
data.loc[ev_mask, "fuelType"] = "Electric"
data.loc[ev_mask, "engineSize"] = 0

# Show up to 10 cleaned rows; sample() raises if asked for more rows than exist
ev_rows = data[ev_mask]
ev_rows.sample(min(10, len(ev_rows)))

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
4018,e-tron,2020,33990,Automatic,28064,Electric,0,0,audi,29806793
3732,e-tron,2021,54690,Automatic,14829,Electric,0,0,audi,29450554
12735,i3,2015,13950,Automatic,41213,Electric,0,0,bmw,29793197
8193,i3,2020,21500,Automatic,11062,Electric,0,0,bmw,29741034
10587,e-tron,2021,42990,Automatic,14319,Electric,0,0,audi,29900238
1644,i3,2020,21470,Automatic,14100,Electric,0,0,bmw,29965971
11599,e-tron,2021,41999,Automatic,14240,Electric,0,0,audi,28842902
1582,e-tron,2021,45500,Automatic,16140,Electric,0,0,audi,29710128
1703,X3,2021,43490,Automatic,19169,Electric,0,0,bmw,30012581
1430,RS e-tron GT,2022,122990,Automatic,117,Electric,0,0,audi,29506663


In [305]:
# Parse mileage as a number (unparseable values become NaN), then discard every
# row that has a missing value in any column and renumber the index from 0.
data = (
    data
    .assign(mileage=pd.to_numeric(data["mileage"], errors="coerce"))
    .dropna()
    .reset_index(drop=True)
)

In [306]:
# Every column except the (already numeric) mileage is searched as text
columns_to_apply = data.columns[data.columns != 'mileage']

# Rows where any of those fields contains the text "None" (case-insensitive).
# na=False: cells that are not strings count as "no match" rather than NaN.
none_mask = data[columns_to_apply].apply(
    lambda x: x.str.contains('None', case=False, na=False)
).any(axis=1)

# Rows whose mileage is a whole number. The vectorized modulo check works for
# both float and integer columns, whereas calling .is_integer() per element
# fails on integer values on Python versions where int has no such method.
int_mask = data['mileage'] % 1 == 0

# Keep rows with no "None" text and a whole-number mileage
data = data[(~none_mask) & (int_mask)].reset_index(drop=True)

In [307]:
# Columns to store as integers and as floats respectively
int_cols = ["year", "price", "mileage"]
float_cols = ["mpg", "engineSize"]

# Cast all five columns in one pass using a column -> dtype mapping
dtype_map = {**dict.fromkeys(int_cols, int), **dict.fromkeys(float_cols, float)}
data = data.astype(dtype_map)

# Engine size is kept to one decimal place
data["engineSize"] = data["engineSize"].round(1)

# Print the column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13658 entries, 0 to 13657
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         13658 non-null  object 
 1   year          13658 non-null  int64  
 2   price         13658 non-null  int64  
 3   transmission  13658 non-null  object 
 4   mileage       13658 non-null  int64  
 5   fuelType      13658 non-null  object 
 6   mpg           13658 non-null  float64
 7   engineSize    13658 non-null  float64
 8   brand         13658 non-null  object 
 9   carId         13658 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 1.0+ MB


In [308]:
# Display ten randomly chosen rows of the cleaned frame as a spot check
data.sample(n=10)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
4741,A3,2013,9995,Manual,72875,Diesel,74.3,1.6,audi,29448364
13212,Q5,2021,41490,Semi-Auto,4114,Petrol,32.5,2.0,audi,29976883
6341,5 Series,2017,22950,Automatic,54000,Diesel,53.3,3.0,bmw,30011881
1520,A3,2013,7290,Manual,95487,Diesel,74.3,1.6,audi,30029276
6506,X2,2018,24188,Automatic,32372,Diesel,58.9,2.0,bmw,29997599
2527,A4,2018,20000,Automatic,32978,Diesel,67.3,2.0,audi,29808546
8724,Q5,2013,15495,Automatic,68041,Diesel,47.1,2.0,audi,29936317
7001,A1,2017,9995,Manual,74522,Petrol,67.3,1.0,audi,29944221
10059,A1,2013,6490,Manual,94964,Diesel,74.3,1.6,audi,29906833
12933,A3,2017,16999,Manual,51574,Petrol,49.6,2.0,audi,29881982


In [309]:
# Number of listings per brand, most frequent first
data["brand"].value_counts(sort=True, ascending=False)

audi    7101
bmw     6557
Name: brand, dtype: int64