In [1]:
import csv
import os
import re
import warnings

import pandas as pd
import numpy as np

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    PolynomialFeatures,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
)

from joblib import dump, load

In [2]:
# Load data
path_data = "../data/UK used cars/"

# list of files to load
files_to_load = ["audi.csv", "bmw.csv", "vw.csv"]

# empty list to store data
data_list = []

# loop through files in directory
for file in os.listdir(path_data):
    # check if file is in list of files to load
    if file in files_to_load:
        # open file and read data
        with open(os.path.join(path_data, file), newline="") as f:
            # create csv reader object
            reader = csv.reader(f)
            # iterate over rows in the csv file and add filename to each row
            reader_data = [row + [file.split(".")[0]] for row in reader]
            # store column names and data in data_list
            col_names = reader_data[0]
            data_list.append(reader_data[1:])

# concatenate data from all files into one dataframe
data_kaggle = pd.concat(
    [pd.DataFrame(data_list[0]), pd.DataFrame(data_list[1]), pd.DataFrame(data_list[2])]
).reset_index(drop=True)

# rename last column to "brand"
col_names[-1] = "brand"
data_kaggle.columns = col_names

# Drop tax column as it isn't used and it isn't part of the scraped dataset
data_kaggle.drop(columns=["tax"], inplace=True)
data_kaggle.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,brand
0,5 Series,2014,11200,Automatic,67068,Diesel,57.6,2.0,bmw
1,6 Series,2018,27000,Automatic,14827,Petrol,42.8,2.0,bmw
2,5 Series,2016,16000,Automatic,62794,Diesel,51.4,3.0,bmw
3,1 Series,2017,12750,Automatic,26676,Diesel,72.4,1.5,bmw
4,7 Series,2014,14500,Automatic,39554,Diesel,50.4,3.0,bmw


In [3]:
# previous_files = [x for x in os.listdir("../data/Scraped data/") if ".parquet" in x]

# if previous_files:
#     sorted_files = sorted(
#         previous_files, key=lambda x: int(re.search(r"\d+", x).group())
#     )
#     latest_file = sorted_files[-1]

#     data_scraped = pd.read_parquet(f"../data/Scraped data/{latest_file}")

# data = pd.concat([data_kaggle, data_scraped])
data = data_kaggle.copy()

In [4]:

# Transform data types after importing data
int_cols = ["year", "price", "mileage"]
float_cols = ["mpg", "engineSize"]

data[int_cols] = data[int_cols].astype(int)
data[float_cols] = data[float_cols].astype(float)

# Drop duplicates
data.drop_duplicates(ignore_index=True, inplace=True)


# Remove outliers based on "year" feature
data = data[data["year"] > 2005].reset_index(drop=True)

# Remove outliers based on "price" feature
mask_audi = (data["brand"] == "audi") & (data["price"] < 75000)
mask_vw = (data["brand"] == "vw") & (data["price"] < 60000)
mask_bmw = (data["brand"] == "bmw") & (data["price"] < 100000)
mask_brand = mask_audi | mask_vw | mask_bmw
data = data[mask_brand].reset_index(drop=True)

# Remove outliers based on "mileage" feature
data = data[data["mileage"] < 150000].reset_index(drop=True)

# Remove outliers based on "MPG" feature
data = data[(data["mpg"] > 18) & (data["mpg"] < 200)].reset_index(drop=True)

# Remove outliers based on "engineSize" feature
data = data[(data["engineSize"] > 1) & (data["engineSize"] < 5.2)].reset_index(
    drop=True
)

mask = (data["fuelType"] == "Other") | (data["fuelType"] == "Electric")
data = data[~mask].reset_index(drop=True)

# Drop irrelevant columns
data.drop(columns=["model"], inplace=True)


In [5]:
# Load sample data
sample_data = pd.read_parquet("sample_data-v1.parquet")

# Define features to be one-hot-encoded, log transformed and non-transformed
ohe_cols = ["transmission", "fuelType", "brand"]
log_cols = ["price", "mileage"]
log_cols_transformed = [column + "_log" for column in log_cols]
non_transformed_cols = [
    column
    for column in data.columns.tolist()
    if (column not in ohe_cols) & (column not in log_cols)
]

# Create data transformer. Note that these 
log_transformer = FunctionTransformer(func=np.log, inverse_func=np.exp, validate=True)
transformer = make_column_transformer(
    (log_transformer, log_cols),
    (OneHotEncoder(drop="first"), ohe_cols),
    remainder="passthrough",
)

# Transform data
transformer.fit(sample_data)
transformed = transformer.transform(data)

# Define column names of new columns created after one-hot-encoding transformation
ohe_cols_transformed = (
    transformer.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
)
# Define the name of all the new columns
all_transformed_cols = log_cols_transformed + ohe_cols_transformed + non_transformed_cols

# Define index of column containing fuelType_Hybrid data. It will be used in
# train_test_split as the stratified variable
hybrid_idx = (log_cols + ohe_cols_transformed).index("fuelType_Hybrid")



In [21]:
X_data = transformed[:, 1:]
y_data = transformed[:, 0]

X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=X_data[:, hybrid_idx - 1],
)

num_splits = 5
sk_fold = StratifiedKFold(n_splits=num_splits, random_state=42, shuffle=True)
folds = sk_fold.split(X_train, X_train[:, hybrid_idx])

# Define pipeline steps
steps = [
    ("scaler", MinMaxScaler()),
    ("pol_features", PolynomialFeatures()),
    ("model", LinearRegression()),
]

# Create pipeline object
pipeline = Pipeline(steps=steps)

param_grid = {"pol_features__degree": range(1, 5)}

# Define metric for Grid Search process
# The rmse of the log transformed prices isn't the same as the rmse of the actual prices
# Therefore, I will create a metric that computes the rmse of the actual prices and
# predictions to select the best performing model
def rmse(y_true, y_pred):
    # Compute the rmse of log prices
    rmse_log = np.sqrt(mean_squared_error(y_true, y_pred))
    rmse_value = rmse_log
    #  Sometimes, if the degree is too high for an accurate model, the rmse will yield
    # cray values exceeded 10000 (for reference, for the log prices the average rmse is 
    # 0.2)
    # If the value is less than 1000, I will compute the rmse of the actual prices.
    # Otherwise, the output of the function is the rmse of log prices
    if rmse_log < 1000:
        rmse_exp = np.sqrt(mean_squared_error(np.exp(y_true), np.exp(y_pred)))
        rmse_value = rmse_exp
    return rmse_value
rmse_scorer = make_scorer(rmse, greater_is_better=False)




regression_model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=folds,
    n_jobs=-1,
    verbose=3,
)

regression_model.fit(X_train, y_train)

# regression_model = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid,
#     scoring={'rmse': "neg_root_mean_squared_error", 'rmse_def': rmse_scorer},
#     refit="rmse",
#     cv=folds,
#     n_jobs=-1,
#     verbose=3,
# )


# dump(regression_model.best_estimator_, os.getcwd() + "/car-price-v1.joblib")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 2/5] END ...........pol_features__degree=1;, score=-0.172 total time=   0.0s
[CV 1/5] END ...........pol_features__degree=1;, score=-0.171 total time=   0.0s
[CV 3/5] END ...........pol_features__degree=1;, score=-0.174 total time=   0.0s
[CV 4/5] END ...........pol_features__degree=1;, score=-0.173 total time=   0.0s
[CV 5/5] END ...........pol_features__degree=1;, score=-0.171 total time=   0.0s
[CV 1/5] END ...........pol_features__degree=2;, score=-0.147 total time=   0.1s[CV 3/5] END ...........pol_features__degree=2;, score=-0.148 total time=   0.1s

[CV 2/5] END ...........pol_features__degree=2;, score=-0.146 total time=   0.1s
[CV 4/5] END ...........pol_features__degree=2;, score=-0.150 total time=   0.2s
[CV 5/5] END ...........pol_features__degree=2;, score=-0.145 total time=   0.2s
[CV 4/5] END ...........pol_features__degree=3;, score=-0.146 total time=   0.8s
[CV 1/5] END ...........pol_features__degree=3;, 

In [22]:
regression_model.best_params_

{'pol_features__degree': 3}

In [24]:
regression_model.best_estimator_.named_steps[
        "pol_features"
    ].degree

3

In [None]:
predictions = regression_model.predict(X_test)

In [None]:
r2_score(np.exp(y_test), np.exp(predictions))

In [None]:
model = load(os.getcwd() + "/car-price-v1.joblib")
r2_test = r2_score(y_test, model.predict(X_test))
r2_all = r2_score(y_data, model.predict(X_data))

# print(f"R2 score of predictions vs true values (test data) for one-hot-encoded brands is {r2_test_brands:.3f}")
print(f"R2 score of predictions vs true values (test data) {r2_test:.3f}")
print(f"R2 score of predictions vs true values (training and test data) {r2_all:.3f}")


In [None]:
np.exp(y_test)-np.round(np.exp(model.predict(X_test)))

In [None]:
np.round(np.exp(model.predict(X_test)))

In [None]:
clean_data = pd.read_parquet("clean_data.parquet")

In [None]:
clean_data["fuelType"].value_counts()

In [None]:
hybrid_data = clean_data[clean_data["fuelType"] == "Hybrid"].reset_index(drop=True)

In [None]:
hybrid_data[hybrid_data["brand"] == "audi"]