In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("../data/autovit_data.csv")

In [3]:
columns_to_keep = [
    "price",
    "make",
    "model",
    "year",
    "mileage",
    "fuel_type",
    "engine_power",
    "engine_capacity",
    "transmission",
    "gearbox",
    "body_type",
]
df = df[columns_to_keep]
df.columns

Index(['price', 'make', 'model', 'year', 'mileage', 'fuel_type',
       'engine_power', 'engine_capacity', 'transmission', 'gearbox',
       'body_type'],
      dtype='object')

In [4]:
df = df.dropna()
df = df.drop_duplicates(keep='first').reset_index()
cars = df.copy()

In [5]:
# convert price to integer
df["price"] = df["price"].astype(int)

In [6]:
df.head(2)

Unnamed: 0,index,price,make,model,year,mileage,fuel_type,engine_power,engine_capacity,transmission,gearbox,body_type
0,0,10500,Ford,Kuga,2015,214 547 km,Diesel,140 CP,1 997 cm3,4x4 (manual),Automata,SUV
1,1,28600,Audi,Q3,2021,36 000 km,Benzina,190 CP,1 984 cm3,4x4 (automat),Automata,SUV


In [7]:
# convert mileage to integer
df["mileage"] = df["mileage"].str.replace(" km", "").str.replace(" ", "").astype(int)

# convert engine_power to integer
df["engine_power"] = df["engine_power"].str.replace(" CP", "").astype(int)

# convert engine_capacity to integer
df["engine_capacity"] = df["engine_capacity"].str.replace(" cm3", "").str.replace(" ", "").astype(int)

In [8]:
df.head(2)

Unnamed: 0,index,price,make,model,year,mileage,fuel_type,engine_power,engine_capacity,transmission,gearbox,body_type
0,0,10500,Ford,Kuga,2015,214547,Diesel,140,1997,4x4 (manual),Automata,SUV
1,1,28600,Audi,Q3,2021,36000,Benzina,190,1984,4x4 (automat),Automata,SUV


In [9]:
# convert columns to categorical
df["fuel_type"] = df["fuel_type"].astype("category")
df["transmission"] = df["transmission"].astype("category")
df["gearbox"] = df["gearbox"].astype("category")
df["body_type"] = df["body_type"].astype("category")

# perform simple one-hot encoding
df_encoded = pd.get_dummies(df, columns=["fuel_type", "transmission", "gearbox", "body_type"])

In [10]:
df_encoded.head(2)

Unnamed: 0,index,price,make,model,year,mileage,engine_power,engine_capacity,fuel_type_Benzina,fuel_type_Benzina + CNG,...,gearbox_Manuala,body_type_Cabrio,body_type_Combi,body_type_Compacta,body_type_Coupe,body_type_Masina de oras,body_type_Masina mica,body_type_Monovolum,body_type_SUV,body_type_Sedan
0,0,10500,Ford,Kuga,2015,214547,140,1997,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,28600,Audi,Q3,2021,36000,190,1984,True,False,...,False,False,False,False,False,False,False,False,True,False


In [11]:
df_encoded = df_encoded.replace({False: 0, True: 1})
df_encoded.head(2)

Unnamed: 0,index,price,make,model,year,mileage,engine_power,engine_capacity,fuel_type_Benzina,fuel_type_Benzina + CNG,...,gearbox_Manuala,body_type_Cabrio,body_type_Combi,body_type_Compacta,body_type_Coupe,body_type_Masina de oras,body_type_Masina mica,body_type_Monovolum,body_type_SUV,body_type_Sedan
0,0,10500,Ford,Kuga,2015,214547,140,1997,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,28600,Audi,Q3,2021,36000,190,1984,1,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# encode make and model with LabelEncoder
label_encoder = LabelEncoder()
df_encoded["make"] = label_encoder.fit_transform(df_encoded["make"])
df_encoded["model"] = label_encoder.fit_transform(df_encoded["model"])

In [13]:
df_encoded.head(2)

Unnamed: 0,index,price,make,model,year,mileage,engine_power,engine_capacity,fuel_type_Benzina,fuel_type_Benzina + CNG,...,gearbox_Manuala,body_type_Cabrio,body_type_Combi,body_type_Compacta,body_type_Coupe,body_type_Masina de oras,body_type_Masina mica,body_type_Monovolum,body_type_SUV,body_type_Sedan
0,0,10500,8,203,2015,214547,140,1997,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,28600,1,266,2021,36000,190,1984,1,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
df_encoded.to_pickle("../data/encoded_autovit_data.pkl")
cars.to_pickle("../data/car_names_autovit.pkl")

In [15]:
def get_top_similar_rows(df, target_row_index, k=5):
    target_row = df.iloc[[target_row_index]]
    df = df.drop(df.index[target_row_index])
    similarities = cosine_similarity(target_row, df)

    sim_df = pd.DataFrame(similarities[0], columns=["Similarity"])
    sim_df["Index"] = df.index

    top_similar_indices = sim_df.nlargest(k, "Similarity")["Index"]
    return sim_df.iloc[top_similar_indices].sort_values(by="Similarity", ascending=False)

In [16]:
sim_df = get_top_similar_rows(df_encoded, 0, 10)
sim_df

Unnamed: 0,Similarity,Index
91,0.999875,92
98,0.999759,99
11,0.999624,12
83,0.999611,84
87,0.999361,88
23,0.999261,24
147,0.998119,148
27,0.99358,28
6,0.986514,7
18,0.97343,19


In [17]:
indices_to_retrieve = sim_df["Index"].tolist()
cars.loc[indices_to_retrieve]

Unnamed: 0,index,price,make,model,year,mileage,fuel_type,engine_power,engine_capacity,transmission,gearbox,body_type
92,1029,12490.0,Ford,Ranger,2014,196 000 km,Diesel,150 CP,2 198 cm3,4x4 (automat),Manuala,SUV
99,1055,5990.0,BMW,Seria 3,2009,218 000 km,Diesel,143 CP,1 995 cm3,Spate,Automata,Combi
12,17,4321.0,Kia,Ceed,2012,200 285 km,Benzina + GPL,100 CP,1 396 cm3,Fata,Manuala,Combi
84,1021,4690.0,Citroën,C4,2011,218 000 km,Diesel,110 CP,1 560 cm3,Fata,Manuala,Compacta
88,1025,2990.0,Volkswagen,Polo,2007,220 000 km,Benzina,80 CP,1 390 cm3,Fata,Manuala,Masina de oras
24,174,4700.0,Dacia,Lodgy,2016,420 000 km,Diesel,90 CP,1 461 cm3,Fata,Manuala,Monovolum
148,1582,6950.0,Renault,Megane,2012,68 921 km,Benzina,110 CP,1 598 cm3,Fata,Manuala,Compacta
28,192,27989.0,Mercedes-Benz,C,2020,171 000 km,Hibrid,194 CP,1 950 cm3,Spate,Automata,Sedan
7,10,31000.0,BMW,X5,2015,143 400 km,Diesel,313 CP,2 993 cm3,4x4 (automat),Automata,SUV
19,45,54900.0,BMW,Seria 7,2020,191 000 km,Diesel,265 CP,2 993 cm3,4x4 (automat),Automata,Sedan
