In [1]:
import os
import csv
import re

import pandas as pd

In [2]:
previous_files = [x for x in os.listdir("../data/Scraped data/") if ".parquet" in x]

if previous_files:
    sorted_files = sorted(
        previous_files, key=lambda x: int(re.search(r"\d+", x).group())
    )
    latest_file = sorted_files[-1]

    latest_version = int(
        max([file_name.split("v")[1].split(".")[0] for file_name in previous_files])
    )
    new_version = latest_version + 1
else:
    new_version = 1

In [3]:
# Define path to data
path_data = "../data/Scraped data/June 2023"

# empty list to store data
data_list = []

# loop through files in directory
for file in os.listdir(path_data):
    # check if file is in list of files to load
    if ".csv" in file:
        # open file and read data
        with open(os.path.join(path_data, file), newline='') as f:
            # create csv reader object
            reader = csv.reader(f)
            # iterate over rows in the csv file and add filename to each row
            reader_data = [row + [file.split(".")[0]] for row in reader]
            # store column names and data in data_list
            col_names = reader_data[0]
            data_list.append(reader_data[1:])

# concatenate data from all files into one dataframe
data = pd.concat([pd.DataFrame(data) for data in data_list]).reset_index(drop=True)

# Drop the first (index as a column) and last column (postcode - the above code must
# have added it)
data.drop(columns=[0, 11], inplace=True)
data.columns = col_names[1:11]

# Drop duplicates
data = data.drop_duplicates(subset="carId").reset_index(drop=True)

# print first five rows of data
data.head()


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
0,T-Cross,2021,19990,Manual,8323,Petrol,49.6,1.0,volkswagen,29898913
1,Golf,2022,21991,Manual,8811,Petrol,52.3,1.0,volkswagen,29856808
2,T-Roc,2021,22691,Manual,9001,Petrol,46.3,1.0,volkswagen,29799830
3,T-Roc,2019,23992,Manual,9196,Diesel,50.4,1.6,volkswagen,29676122
4,Golf,2020,30994,Semi-Auto,10882,Hybrid,176.6,1.4,volkswagen,29482261


In [4]:
# Remove cars of fuel type equal to 'Petrol/Electric'
data = data[data["fuelType"] != "Petrol/Electric"]
data.reset_index(drop=True, inplace=True)

# Create a boolean ev_mask for elements containing the word "Electric"
ev_mask = data.apply(lambda x: x.str.contains('Electric', case=False)).any(axis=1)

ev_mileage = data.loc[ev_mask, "mpg"]
data = data.copy()
data.loc[ev_mask, "mileage"] = ev_mileage

# Clean data from electric cars. Because some fields such as mpg or engineSize doesn't
# really apply to them, the scraping tool didn't get the information right
data.loc[ev_mask, "transmission"] = "Automatic"
data.loc[ev_mask, "mpg"] = 0
data.loc[ev_mask, "fuelType"] = "Electric"
data.loc[ev_mask, "engineSize"] = 0

data[ev_mask].sample(10)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
11701,X3,2022,57995,Automatic,3176.0,Electric,0,0,bmw,30051917
7665,3 Series,2993cc,34167,Automatic,,Electric,0,0,bmw,29619668
11706,i3,2015,12950,Automatic,41213.0,Electric,0,0,bmw,29793197
3672,ID3,2022,27990,Automatic,4201.0,Electric,0,0,volkswagen,29282419
3798,ID4,2022,33999,Automatic,4000.0,Electric,0,0,volkswagen,29670190
11702,i3,2015,12289,Automatic,25929.0,Electric,0,0,bmw,29472253
3788,ID5,2022,46990,Automatic,1875.0,Electric,0,0,volkswagen,29295772
11892,X3,2021,49000,Automatic,10146.0,Electric,0,0,bmw,30046948
3845,Golf,2019,15600,Automatic,30136.0,Electric,0,0,volkswagen,30053630
11623,i3,2019,16900,Automatic,33454.0,Electric,0,0,bmw,29273745


In [5]:
# Turn the mileage column into numeric type and remove NaN values
data["mileage"] = pd.to_numeric(data['mileage'], errors='coerce')
data = data.dropna()
data.reset_index(drop=True, inplace=True)

# Create a mask to identify where a value in any columns is equal to "None"
columns_to_apply = data.columns[data.columns != "mileage"]
none_mask = (
    data[columns_to_apply]
    .apply(lambda x: x.str.contains("None", case=False))
    .any(axis=1)
)
# Create a mask to identify where the mileage column is an integer - float values need
# be removed
int_mask = data["mileage"].apply(lambda x: x.is_integer())

# Apply the masks to remove float values in mileage and any values equal to "None"
data = data[(~none_mask) & (int_mask)]
data.reset_index(drop=True, inplace=True)

In [6]:
# Change column type of the columns that are meant to be numeric to int and float types
int_cols = ["year", "price", "mileage"]
float_cols = ["mpg", "engineSize"]
data[int_cols] = data[int_cols].astype(int)
data[float_cols] = data[float_cols].astype(float)

# Round engine size to 1 decimal figure
data["engineSize"] = round(data["engineSize"],1)

# Make sure that categorical columns only contain the right categories
# Fuel type
data = data.loc[
    (data["fuelType"] == "Petrol")
    | (data["fuelType"] == "Diesel")
    | (data["fuelType"] == "Hybrid")
    | (data["fuelType"] == "Electric")
]
data.reset_index(drop=True, inplace=True)
# Transmission
data = data.loc[
    (data["transmission"] == "Automatic")
    | (data["transmission"] == "Manual")
    | (data["transmission"] == "Semi-Auto")
]
data.reset_index(drop=True, inplace=True)

# Replace the brand "volkswagen" with "vw" as this used in the other dataset
data.loc[data["brand"] == "volkswagen", "brand"] = "vw"

data_save = data.copy()
data_save.drop(columns=["carId"], inplace=True)
data_save.to_csv(f"../data/Scraped data/all_scraped_cars-v{new_version}.csv")
data_save.to_parquet(f"../data/Scraped data/all_scraped_cars-v{new_version}.parquet")



In [7]:
data.sample(10)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand,carId
19037,A6,2016,16750,Automatic,65600,Diesel,61.4,2.0,audi,29865501
16870,TT,2020,31350,Semi-Auto,24843,Petrol,34.9,2.0,audi,29982822
18859,A1,2019,20000,Automatic,45000,Petrol,40.4,2.0,audi,29915267
18103,Polo,2022,22995,Automatic,5000,Petrol,50.4,1.0,vw,30047461
7589,1 Series,2018,20600,Automatic,75000,Petrol,39.8,3.0,bmw,29871043
20124,M2,2018,29995,Automatic,44263,Petrol,35.8,3.0,bmw,29984425
7891,A3,2019,20700,Automatic,35389,Petrol,41.5,1.5,audi,30034131
12361,T-Roc,2022,31325,Semi-Auto,10384,Petrol,46.3,1.5,vw,29957192
5958,A1,2019,16790,Semi-Auto,49584,Petrol,46.3,1.0,audi,29883221
23891,Golf,2020,17791,Manual,48942,Diesel,62.8,2.0,vw,29649254


In [15]:
set(["a", "b", "c"])

{'a', 'b', 'c'}

In [14]:
set(data["brand"])

{'audi', 'bmw', 'vw'}

In [8]:
data["brand"].value_counts()

bmw     9023
audi    8469
vw      8083
Name: brand, dtype: int64

In [9]:
data["fuelType"].value_counts()

Diesel      12544
Petrol      11627
Hybrid        781
Electric      623
Name: fuelType, dtype: int64

In [10]:
data["transmission"].value_counts()

Automatic    10351
Manual       10337
Semi-Auto     4887
Name: transmission, dtype: int64