In [1]:
import numpy as np
import polars as pl
from sklearn.cluster import HDBSCAN
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
features = pl.read_csv("CleanData/features.csv")
listings = pl.read_csv("CleanData/listings.csv")
mapping = pl.read_csv("CleanData/id_mapping.csv")

In [3]:
features = features.drop(
    [
        "id_make",
        "niceName_make",
        "niceName_model",
        "id_engine",
        "id_transmission",
        "name_transmission",
        "numberOfSpeeds_transmission",
        "vehicleType_categories",
        "vehicleStyle_categories",
        "automaticType_transmission",
        "errorType",
        "message",
    ]
)

In [4]:
new_data = (
    listings.select(["vin", "modelId", "priceUnformatted"])
    .join(features, on="vin", how="inner")
    .drop("vin")
)

In [5]:
num_cols = new_data.select(pl.col([pl.Float64, pl.Int64])).drop("modelId").columns
str_cols = new_data.select(pl.col(pl.String)).columns
cls_data = new_data

for col in num_cols:
    mean = cls_data[col].mean()
    std = cls_data[col].std()

    cls_data = cls_data.with_columns(((cls_data[col] - mean) / std).alias(col))

cls_data = cls_data.to_dummies(str_cols)

In [6]:
hdb = HDBSCAN(
    min_cluster_size=40,
    metric="euclidean",
    algorithm="auto",
    n_jobs=4,
    max_cluster_size=100,
    cluster_selection_method="leaf",
    leaf_size=50,
)
hdb.fit(cls_data)

In [7]:
cls_data = cls_data.with_columns(segment=hdb.labels_)
cls_data = cls_data.join(mapping, on="modelId", how="inner").drop(["make", "model"])
# cls_data.write_csv('ClusteringData/data.csv')

In [9]:
cls_data.head()

modelId,priceUnformatted,compressionRatio_engine,cylinder_engine,size_engine,configuration_engine_V,configuration_engine_W,configuration_engine_flat,configuration_engine_inline,configuration_engine_null,horsepower_engine,torque_engine,type_engine_diesel,type_engine_electric,type_engine_electric (fuel cell),type_engine_flex-fuel (FFV),type_engine_gas,type_engine_hybrid,type_engine_mild hybrid,type_engine_natural gas (CNG),type_engine_null,type_engine_plug-in hybrid,compressorType_engine_naturally aspired,compressorType_engine_null,compressorType_engine_supercharger,compressorType_engine_turbocharger,compressorType_engine_twin turbocharger,compressorType_engine_twincharger,transmissionType_transmission_AUTOMATED_MANUAL,transmissionType_transmission_AUTOMATIC,transmissionType_transmission_DIRECT_DRIVE,transmissionType_transmission_MANUAL,transmissionType_transmission_null,drivenWheels_all wheel drive,drivenWheels_four wheel drive,drivenWheels_front wheel drive,drivenWheels_null,…,"market_categories_Luxury,High-Performance,Crossover,Exotic","market_categories_Luxury,High-Performance,Crossover,Hatchback","market_categories_Luxury,High-Performance,Exotic","market_categories_Luxury,High-Performance,Flex Fuel,Crossover","market_categories_Luxury,High-Performance,Hatchback","market_categories_Luxury,Hybrid","market_categories_Luxury,Hybrid,Crossover","market_categories_Luxury,Hybrid,Crossover,Performance","market_categories_Luxury,Hybrid,Hatchback","market_categories_Luxury,Performance","market_categories_Luxury,Performance,Hatchback",market_categories_Performance,"market_categories_Performance,Hatchback",market_categories_null,epaClass_categories_Cargo Vans,epaClass_categories_Compact Cars,epaClass_categories_Large Cars,epaClass_categories_Midsize Cars,epaClass_categories_Midsize Station Wagons,epaClass_categories_Mini Compact Cars,epaClass_categories_Minivan,epaClass_categories_Passenger Vans,epaClass_categories_SMALL_PICKUP_TRUCKS,epaClass_categories_SPORT_UTILITY_VEHICLES,epaClass_categories_Small Pickup Trucks,epaClass_categories_Small Sport Utility Vehicles,epaClass_categories_Small Station Wagons,epaClass_categories_Sport Utility Vehicles,epaClass_categories_Standard Pickup Trucks,epaClass_categories_Standard Sport Utility Vehicles,epaClass_categories_Subcompact Cars,epaClass_categories_Two Seaters,epaClass_categories_null,highway_mpg,city_mpg,segment,make_model
f64,f64,f64,f64,f64,u8,u8,u8,u8,u8,f64,f64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,f64,f64,i32,str
1342.0,-0.723844,0.350143,-0.428888,-0.282711,0,0,0,1,0,-0.534867,-0.736781,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.542473,0.055143,83,"""Acura ILX"""
1342.0,-0.566745,0.350143,-0.428888,-0.282711,0,0,0,1,0,-0.534867,-0.736781,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.542473,0.055143,83,"""Acura ILX"""
1349.0,-0.065878,-0.697073,-0.428888,-1.037598,0,0,0,1,0,-0.546461,-0.620871,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.963756,0.954803,82,"""Acura Integra"""
1349.0,0.209444,-0.697073,-0.428888,-1.037598,0,0,0,1,0,-0.546461,-0.620871,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.823329,0.80486,82,"""Acura Integra"""
1349.0,0.209444,-0.697073,-0.428888,-1.037598,0,0,0,1,0,-0.546461,-0.620871,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.823329,0.80486,82,"""Acura Integra"""


In [20]:
cls_data.select(["make_model", "segment"]).unique()["make_model"].value_counts()

make_model,count
str,u32
"""Honda Accord H…",2
"""Chevrolet Silv…",1
"""Jeep Grand Che…",1
"""Honda Fit""",1
"""BMW X3 M""",2
"""Chevrolet Expr…",2
"""Lexus RX 500h""",1
"""Hyundai Velost…",1
"""Lexus RC 350""",1
"""Chevrolet Spar…",1


In [17]:
segments

[82]

## Deployment test

In [15]:
make_model = "Porsche 911"
segments = (
    cls_data.filter(pl.col("make_model") == make_model)["segment"].unique().to_list()
)

general_repr = (
    cls_data.filter(pl.col("make_model") == make_model)
    .drop(["modelId", "make_model", "segment"])
    .mean()
    .to_numpy()
)
general_repr = general_repr.reshape(1, -1)
general_repr = np.nan_to_num(general_repr, nan=0)

In [17]:
scores = []
filtered_dt = cls_data.filter(pl.col("segment").is_in(segments)).filter(
    pl.col("make_model") != make_model
)
for i in range(filtered_dt.shape[0]):
    pair = filtered_dt.drop(["modelId", "make_model", "segment"]).to_numpy()[i]
    pair = np.nan_to_num(pair, nan=0).reshape(1, -1)
    scores.append(cosine_similarity(general_repr, pair).item())

In [18]:
dummy = pl.concat(
    [filtered_dt.select("make_model"), pl.DataFrame(scores, schema=["scores"])],
    how="horizontal",
).sort("scores", descending=True)
recs = []
for i in range(len(dummy["make_model"].to_list())):
    if dummy["make_model"].to_list()[i] not in recs:
        recs.append(dummy["make_model"].to_list()[i])
    if len(recs) == 5:
        break

In [19]:
recs

['Porsche Panamera',
 'Ford F-250 Super Duty',
 'GMC Sierra 2500HD',
 'GMC Sierra 3500HD',
 'Ram 2500']