In [1]:
import numpy as np
import polars as pl
from sklearn.cluster import HDBSCAN
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
features = pl.read_csv("CleanData/features.csv")
listings = pl.read_csv("CleanData/listings.csv")
mapping = pl.read_csv("CleanData/id_mapping.csv")

In [3]:
features = features.drop(
    [
        "id_make",
        "niceName_make",
        "niceName_model",
        "id_engine",
        "id_transmission",
        "name_transmission",
        "numberOfSpeeds_transmission",
        "vehicleType_categories",
        "vehicleStyle_categories",
        "automaticType_transmission",
        "errorType",
        "message",
    ]
)

In [4]:
new_data = (
    listings.select(["vin", "modelId", "priceUnformatted"])
    .join(features, on="vin", how="inner")
    .drop("vin")
)

In [5]:
num_cols = new_data.select(pl.col([pl.Float64, pl.Int64])).drop("modelId").columns
str_cols = new_data.select(pl.col(pl.String)).columns
cls_data = new_data

for col in num_cols:
    mean = cls_data[col].mean()
    std = cls_data[col].std()

    cls_data = cls_data.with_columns(((cls_data[col] - mean) / std).alias(col))

cls_data = cls_data.to_dummies(str_cols)

In [6]:
hdb = HDBSCAN(
    min_cluster_size=40,
    metric="euclidean",
    algorithm="auto",
    n_jobs=4,
    max_cluster_size=100,
    cluster_selection_method="leaf",
    leaf_size=50,
)
hdb.fit(cls_data)

In [7]:
cls_data = cls_data.with_columns(segment=hdb.labels_)
cls_data = cls_data.join(mapping, on="modelId", how="inner")
cls_data.write_csv("ClusteringData/data.csv")

In [8]:
make_model = "Honda S2000"
segments = (
    cls_data.filter(pl.col("make_model") == make_model)["segment"].unique().to_list()
)
general_repr = []

for col in cls_data.filter(pl.col("make_model") == make_model).columns:
    if col not in ["modelId", "make_model", "segment"]:
        general_repr.append(
            cls_data.filter(pl.col("make_model") == make_model)[col].mean()
        )

general_repr = np.array(general_repr).reshape(1, -1)

In [9]:
scores = []
filtered_dt = cls_data.filter(pl.col("segment").is_in(segments)).filter(
    pl.col("make_model") != make_model
)
for i in range(filtered_dt.shape[0]):
    pair = filtered_dt.drop(["modelId", "make_model", "segment"]).to_numpy()[i]
    pair = np.nan_to_num(pair, nan=0).reshape(1, -1)
    scores.append(cosine_similarity(general_repr, pair).item())

In [10]:
dummy = pl.concat(
    [filtered_dt.select("make_model"), pl.DataFrame(scores, schema=["scores"])],
    how="horizontal",
).sort("scores", descending=True)
recs = []
for i in range(len(dummy["make_model"].to_list())):
    if dummy["make_model"].to_list()[i] not in recs:
        recs.append(dummy["make_model"].to_list()[i])
    if len(recs) == 5:
        break

In [11]:
recs

['BMW Z4',
 'Toyota GR86',
 'Toyota 86',
 'Mazda MX-5 Miata RF',
 'Porsche Boxster']