In [10]:
import os
import csv
import pyarrow.parquet as pq

import pandas as pd
import numpy as np

In [3]:
# UK postcodes written without the inner space; the trailing comment on each
# entry names the city the postcode belongs to.
postcode_all = [
    "E34JN",  # East London
    "B24QA",  # Birmingham
    "NR13JU",  # Norwich
    "SO140YG",  # Southampton
    "BS11JQ",  # Bristol
    "S14PF",  # Sheffield
    "LS28BH",  # Leeds
    "L34AD",  # Liverpool (L postcode area; previously mislabeled as Bristol)
    "NE77DN",  # Newcastle
    "EH12NG",  # Edinburgh
    "HU67RX",  # Hull
    "EX11SG",  # Exeter
    "CB13EW",  # Cambridge
    "CT12EH",  # Canterbury
    "SA11NU",  # Swansea
    "BT12HB",  # Belfast
]

# Preview a subset: the second and third entries plus the last two.
postcode_all[1:3] + postcode_all[-2:]

['B24QA', 'NR13JU', 'SA11NU', 'BT12HB']

In [12]:

# Directory that holds the scraped CSV exports.
path_data = "../data/Scraped data/May 2023"

# One entry per CSV file: the list of that file's data rows (header excluded).
data_list = []
# Header row of the CSV files. It is taken from the last file read, so all
# files are assumed to share the same header -- TODO confirm.
col_names = None

# os.listdir() returns entries in arbitrary order. The de-duplication below
# keeps the first occurrence of each carId, so the listing is sorted to make
# the result reproducible from run to run.
for file in sorted(os.listdir(path_data)):
    # Match on the extension only; a substring test would also pick up names
    # such as "cars.csv.bak".
    if not file.endswith(".csv"):
        continue
    with open(os.path.join(path_data, file), newline='') as f:
        # Blank lines are skipped; every cell is read as a string.
        file_rows = [row for row in csv.reader(f) if row]
    if not file_rows:
        # Completely empty file: neither a header nor data to contribute.
        continue
    col_names = file_rows[0]
    data_list.append(file_rows[1:])

if col_names is None:
    raise FileNotFoundError(f"No non-empty .csv files found in {path_data!r}")

# Build the frame once from all rows instead of concatenating one small frame
# per file.
data = pd.DataFrame([row for file_rows in data_list for row in file_rows])

# The first CSV column is the row index that was saved as a column; drop it
# and label the remaining columns with the matching header entries.
data = data.drop(columns=[0])
data.columns = col_names[1:]

print(data.shape)

# The same car can appear in several files; keep the first occurrence of each
# carId.
data = data.drop_duplicates(subset="carId").reset_index(drop=True)

print(data.shape)

# Show the first five rows
data.head()


(29330, 10)
(17385, 10)


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
0,4 Series,2020,27000,Semi-Auto,29698,Petrol,41.5,3,bmw,29933018
1,4 Series,2018,20350,Semi-Auto,31600,Petrol,48.7,2,bmw,29741571
2,2 Series,2015,15495,Semi-Auto,34192,Petrol,47.9,2,bmw,30026514
3,4 Series,2020,26100,Semi-Auto,37431,Petrol,41.5,3,bmw,29843106
4,3 Series,2015,20900,Semi-Auto,42091,Diesel,49.6,3,bmw,29817398


In [13]:
# Remove cars whose fuel type is exactly 'Petrol/Electric'.
data = data[data["fuelType"] != "Petrol/Electric"].reset_index(drop=True)

# Boolean mask of rows in which any text column mentions "Electric"
# (case-insensitive). Numeric columns are skipped so the cell can be re-run
# after later cells have converted columns to numbers, and na=False makes
# non-string cells count as "no match" instead of NaN.
text_columns = data.select_dtypes(exclude="number")
ev_mask = text_columns.apply(
    lambda column: column.str.contains('Electric', case=False, na=False)
).any(axis=1)

# For the flagged rows the value found in "mpg" is used as the mileage.
# Rows whose mpg is already the 0 sentinel written below are left alone, so
# re-running this cell cannot overwrite a recovered mileage with 0. On the
# first run every mpg cell is still a string, so all flagged rows are moved.
needs_mileage_fix = ev_mask & data["mpg"].ne(0)
ev_mileage = data.loc[needs_mileage_fix, "mpg"]
if needs_mileage_fix.any():
    data.loc[needs_mileage_fix, "mileage"] = ev_mileage

# Clean data from electric cars. Fields such as mpg or engineSize do not
# really apply to them, so the scraping tool did not get that information
# right; overwrite them with fixed values.
data.loc[ev_mask, "transmission"] = "Automatic"
data.loc[ev_mask, "mpg"] = 0
data.loc[ev_mask, "fuelType"] = "Electric"
data.loc[ev_mask, "engineSize"] = 0

# Preview up to 10 cleaned rows; the seed keeps the preview reproducible and
# the cap avoids an error when fewer than 10 rows are flagged.
data[ev_mask].sample(n=min(10, int(ev_mask.sum())), random_state=42)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
5439,i4,2022,48990,Automatic,8196.0,Electric,0,0,bmw,29632602
5766,iX,2022,65000,Automatic,5200.0,Electric,0,0,bmw,29857298
5458,5 Series,2000cc,35650,Automatic,,Electric,0,0,bmw,29891493
6870,X3,2022,51990,Automatic,5659.0,Electric,0,0,bmw,29994474
6509,e-tron,2020,33990,Automatic,28064.0,Electric,0,0,audi,29806793
13828,e-tron,2021,41999,Automatic,14240.0,Electric,0,0,audi,28842902
3334,X3,2021,46290,Automatic,16429.0,Electric,0,0,bmw,30005909
8424,i3,2021,20000,Automatic,7924.0,Electric,0,0,bmw,30017933
1376,C4,2023,32786,Automatic,1000.0,Electric,0,0,citroen,29774774
627,i3,2019,19995,Automatic,20903.0,Electric,0,0,bmw,29899774


In [14]:
# Parse the mileage column as numbers; entries that are not numeric become NaN.
mileage_numeric = pd.to_numeric(data['mileage'], errors='coerce')
# Swap in the parsed column, discard every row that has a missing value in
# any column, and renumber the rows from 0.
data = data.assign(mileage=mileage_numeric).dropna().reset_index(drop=True)

In [15]:
columns_to_apply = data.columns[data.columns != 'mileage']

# Rows in which any text column (mileage excluded) contains "None",
# case-insensitive. Numeric columns are skipped because they cannot hold that
# text and the .str accessor would raise on them; na=False makes non-string
# cells count as "no match".
text_columns = data[columns_to_apply].select_dtypes(exclude="number")
none_mask = text_columns.apply(
    lambda x: x.str.contains('None', case=False, na=False)
).any(axis=1)

# Rows whose mileage is a whole number. The vectorized modulo check gives the
# same answer as float.is_integer() and also works when the column has an
# integer dtype (int.is_integer() only exists from Python 3.12 on).
int_mask = (data['mileage'] % 1).eq(0)

# Keep rows without any "None" text and with a whole-number mileage.
data = data[(~none_mask) & (int_mask)].reset_index(drop=True)

In [16]:
# Columns cast to integers and to floats, respectively.
int_cols = ["year", "price", "mileage"]
float_cols = ["mpg", "engineSize"]

# Build one column -> dtype mapping and cast everything in a single call.
target_dtypes = {**dict.fromkeys(int_cols, int), **dict.fromkeys(float_cols, float)}
data = data.astype(target_dtypes)
# Engine sizes are kept to one decimal place.
data["engineSize"] = data["engineSize"].round(1)
# Print the column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16439 entries, 0 to 16438
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         16439 non-null  object 
 1   year          16439 non-null  int64  
 2   price         16439 non-null  int64  
 3   transmission  16439 non-null  object 
 4   mileage       16439 non-null  int64  
 5   fuelType      16439 non-null  object 
 6   mpg           16439 non-null  float64
 7   engineSize    16439 non-null  float64
 8   brand         16439 non-null  object 
 9   carId         16439 non-null  object 
dtypes: float64(2), int64(3), object(5)
memory usage: 1.3+ MB


In [17]:
# Preview up to 10 random rows of the cleaned frame. The fixed seed makes the
# preview reproducible on re-run, and the cap avoids an error when fewer than
# 10 rows remain.
data.sample(n=min(10, len(data)), random_state=42)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
13180,X6,2009,10995,Automatic,75073,Diesel,34.0,3.0,bmw,29778469
12,X1,2018,19995,Semi-Auto,35226,Diesel,60.1,2.0,bmw,29958222
4224,X2,2021,32995,Automatic,8326,Petrol,36.2,2.0,bmw,29188073
14979,X3,2007,4480,Manual,100000,Diesel,39.2,2.0,bmw,29720180
2356,X5,2001,6995,Automatic,45000,Petrol,20.3,4.4,bmw,30004092
2334,5 Series,2018,21990,Automatic,39000,Diesel,49.6,2.0,bmw,29944274
4911,1 Series,2015,8745,Manual,81000,Diesel,83.1,1.5,bmw,28409804
14425,X3,2016,19450,Automatic,67586,Diesel,54.3,2.0,bmw,29573609
12291,A6,2019,25698,Automatic,25997,Diesel,47.9,2.0,audi,29855819
13638,5 Series,2018,25700,Automatic,44717,Diesel,53.3,3.0,bmw,29775210


In [18]:
# Number of rows per brand, most frequent first. In the output shown below,
# bmw and audi make up the vast majority of rows.
data["brand"].value_counts()

bmw              8985
audi             6718
vauxhall          111
ford               90
peugeot            68
seat               54
volkswagen         51
toyota             47
mercedes-benz      36
nissan             31
citroen            27
skoda              26
honda              23
fiat               21
land rover         19
renault            19
hyundai            18
kia                18
mini               12
volvo              11
mazda               7
suzuki              5
smart               5
mitsubishi          5
jaguar              4
mg                  3
cupra               3
chevrolet           3
jeep                2
alfa romeo          2
lexus               2
abarth              2
ds                  2
dacia               2
other               1
saab                1
infiniti            1
bentley             1
ssangyong           1
porsche             1
dodge               1
Name: brand, dtype: int64