In [43]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import mysql.connector
import pymysql.cursors
from scipy.sparse import save_npz
import joblib

In [2]:
import pymysql.cursors

conn = None
try:
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='',
        database='autonexus',
        cursorclass=pymysql.cursors.DictCursor 
    )

    print("PyMySQL connection successful!")

    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM cars")
        cars = cursor.fetchall()

except pymysql.Error as e:
    print(f"An error occurred: {e}")

finally:
    if conn:
        conn.close()

PyMySQL connection successful!


In [5]:
pd.set_option('display.max_columns', None)

In [6]:
df = pd.DataFrame(data=cars)

In [7]:
df

Unnamed: 0,index,Model_Year,Brand_Name,Model_Name,Image_List,Stock_Type,Mileage,Price,Exterior_Color,Interior_Color,Drivetrain,Km_per_l,Fuel_Type,Accidents_Or_Damage,Clean_Title,One_Owner_Vehicle,Personal_Use_Only,Level2_Charging,Dc_Fast_Charging,Battery_Capacity,Expected_Range,Seller_Name,Seller_Site,Gear_Spec,Engine_Size,Cylinder_Config,Valves,Km_L_e_City,Km_L_e_Hwy,Street_Address,ZIP,City,STATE,ST,lat,LONG
0,1,2022,Ford,F-150 Lightning Platinum,"[""https://platform.cstatic-images.com/xlarge/i...",Used,27565.0,48500.0,gray,gray,AWD,0.0,Electric,0,1,one,no,19.0,41.0,131.0,466.6,dennis sneed ford,http://www.sneedford.com/?utm_source=cars.com&...,0,0.0,0,0,31.0,26.0,1046 SW US Highway 169,64454,gower,missouri,mo,39.6020,-94.5965
1,2,2025,Ford,Maverick XLT,"[""https://platform.cstatic-images.com/xlarge/i...",New,3.0,35285.0,gray,gray,AWD,11.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,0.0,bening ford of perryville,http://beningford.com/?utm_source=cars.com&utm...,8,2.0,I4,16,0.0,0.0,909 S Perryville Blvd,63775,perryville,missouri,mo,37.7174,-89.8737
2,3,2014,Ford,F-150 STX,"[""https://platform.cstatic-images.com/xlarge/i...",Used,117487.0,16998.0,black,black,4WD,5.0,Flex Fuel,1,1,more,yes,0.0,0.0,0.0,0.0,patriot motors,http://www.patriotmotorsrt5.com?utm_source=car...,6,5.0,V8,32,0.0,0.0,3306 State Rte 5,44410,cortland,ohio,oh,41.3251,-80.7327
3,4,2023,Chevrolet,Tahoe 4WD Z71,"[""https://platform.cstatic-images.com/xlarge/i...",Used,63052.0,54495.0,black,black,4WD,7.0,Gasoline,0,1,one,yes,0.0,0.0,0.0,0.0,granbury nissan,http://www.granburynissan.com/?utm_source=cars...,10,6.2,V8,16,0.0,0.0,4601 E Hwy 377,76049,granbury,texas,tx,32.4488,-97.7285
4,5,2024,Audi,Q5 45 S line quattro Premium,"[""https://platform.cstatic-images.com/xlarge/i...",Certified,9531.0,40204.0,gray,black,AWD,11.0,Gasoline,0,1,more,no,0.0,0.0,0.0,0.0,harper audi,https://www.audiknoxville.com?utm_source=cars....,7,2.0,I4,16,0.0,0.0,9735 Kingston Pike,37922,knoxville,tennessee,tn,35.8580,-84.1194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98752,98753,2026,Kia,Carnival SX,"[""https://platform.cstatic-images.com/xlarge/i...",New,9.0,50135.0,silver,,FWD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,0.0,wagner kia of shrewsbury,http://www.wagnerkiaofshrewsbury.com?utm_sourc...,8,3.5,V6,24,0.0,0.0,730 Boston Turnpike,1545,shrewsbury,massachusetts,ma,42.2848,-71.7205
98753,98754,2015,Jeep,Wrangler Unlimited Sahara,"[""https://platform.cstatic-images.com/xlarge/i...",Used,91549.0,18500.0,black,black,4WD,8.0,Gasoline,1,1,more,yes,0.0,0.0,0.0,0.0,family kia,https://www.familykia.com/?utm_source=cars.com...,5,3.6,V6,24,0.0,0.0,2665 US Highway 1 South,32086,saint augustine,florida,fl,29.8285,-81.3237
98754,98755,2020,GMC,Sierra 1500 Elevation,"[""https://platform.cstatic-images.com/xlarge/i...",Used,40020.0,38606.0,red,gray,4WD,8.0,Gasoline,0,1,more,yes,0.0,0.0,0.0,0.0,everett buick gmc,http://www.everettbgmc.com/?utm_source=cars.co...,10,5.3,V8,16,0.0,0.0,21115 Interstate 30,72022,bryant,arkansas,ar,34.6068,-92.4920
98755,98756,2026,Honda,CR-V EX-L,"[""https://platform.cstatic-images.com/xlarge/i...",New,2.0,36468.0,white,gray,FWD,13.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,0.0,coggin honda of orlando,http://cogginhondaorlando.com?utm_source=cars....,8,1.5,I4,16,0.0,0.0,11051 Orange Blossom Trail,32837,orlando,florida,fl,28.3949,-81.4179


In [None]:
cols_not_useful_in_recommendation = ['Image_List', 'Level2_Charging', 'Seller_Site',
                         'Street_Address', 'City', 'ZIP', 'lat', 'LONG', 'STATE', 'Exterior_Color', 'Interior_Color', 'index']


df.drop(columns=cols_not_useful_in_recommendation, inplace=True)

In [22]:
ohe_cols = ['Brand_Name', 'Stock_Type', 'Drivetrain', 'Fuel_Type', 'One_Owner_Vehicle',
            'Personal_Use_Only', 'Gear_Spec', 'Cylinder_Config', 'Valves', 'ST']

tfidf_cols = ['Model_Name', 'Seller_Name']

min_max_cols = ['Model_Year', 'Mileage', 'Price','Km_per_l', 'Dc_Fast_Charging', 'Battery_Capacity',
                'Expected_Range', 'Engine_Size', 'Km_L_e_City', 'Km_L_e_Hwy']

In [23]:
transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(), ohe_cols),
        ('tfidf_model', TfidfVectorizer(stop_words='english'), tfidf_cols[0]),
        ('tfidf_seller', TfidfVectorizer(stop_words='english'), tfidf_cols[1]),
        ('norm', MinMaxScaler(), min_max_cols)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False
)

In [25]:
transformed_df = transformer.fit_transform(df)



In [27]:
input_vec = df.sample()
input_vec

Unnamed: 0,Model_Year,Brand_Name,Model_Name,Stock_Type,Mileage,Price,Drivetrain,Km_per_l,Fuel_Type,Accidents_Or_Damage,Clean_Title,One_Owner_Vehicle,Personal_Use_Only,Dc_Fast_Charging,Battery_Capacity,Expected_Range,Seller_Name,Gear_Spec,Engine_Size,Cylinder_Config,Valves,Km_L_e_City,Km_L_e_Hwy,ST
81772,2025,Toyota,Tacoma Limited,New,2.0,42875.0,4WD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,clearwater toyota,8,2.4,I4,16,0.0,0.0,fl


In [29]:
similarity_score = cosine_similarity(transformer.transform(input_vec), transformed_df)

In [38]:
idx = np.argsort(similarity_score.ravel())[-6:][::-1][1:]

In [40]:
df.iloc[idx,:]

Unnamed: 0,Model_Year,Brand_Name,Model_Name,Stock_Type,Mileage,Price,Drivetrain,Km_per_l,Fuel_Type,Accidents_Or_Damage,Clean_Title,One_Owner_Vehicle,Personal_Use_Only,Dc_Fast_Charging,Battery_Capacity,Expected_Range,Seller_Name,Gear_Spec,Engine_Size,Cylinder_Config,Valves,Km_L_e_City,Km_L_e_Hwy,ST
5207,2025,Toyota,Tacoma SR5,New,0.0,40997.0,4WD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,toyota of orlando,8,2.4,I4,16,0.0,0.0,fl
2056,2025,Toyota,Tacoma SR5,New,0.0,39644.0,4WD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,toyota of orlando,8,2.4,I4,16,0.0,0.0,fl
3939,2025,Toyota,Tacoma SR5,New,10.0,42913.0,4WD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,sun toyota,8,2.4,I4,16,0.0,0.0,fl
71762,2025,Toyota,Tacoma SR5,New,0.0,42519.0,4WD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,toyota of clermont,8,2.4,I4,16,0.0,0.0,fl
83389,2025,Toyota,Tacoma SR5,New,10.0,44238.0,4WD,9.0,Gasoline,0,1,not_owned_yet,not_in_use_yet,0.0,0.0,0.0,autonation toyota fort myers,8,2.4,I4,16,0.0,0.0,fl


In [42]:
save_npz('../data/recommendation/transformed_df.npz',transformed_df)

In [None]:
joblib.dump(transformer, '../models/recommendation_transformer.joblib')

['../models/recommendation_transformer.joblib']

In [3]:
joblib.load('../models/recommendation_transformer.joblib')

0,1,2
,transformers,"[('ohe', ...), ('tfidf_model', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False
