In [31]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import numpy as np

warnings.filterwarnings("ignore")


In [30]:
df = pd.read_csv('./data/table_details.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,cd_id,cd_type_id,cd_mileage_miles,cd_car_price,cd_year,cd_make,cd_model,cd_body_style,cd_doors,cd_mpg,cd_engine,cd_transmission,cd_drive_type,cd_fuel,cd_tank_size,cd_bed_style,cd_cab_style,cd_path_
0,0,822,,12193.0,39589.0,2021,audi,q5,SUV,4 Doors,/,4 Cyl,Automatic,AWD,Gasoline,,,,audi_q5_2021.jpg
1,1,2092,,82955.0,28589.0,2018,lexus,rx,SUV,4 Doors,/,6 Cyl,Automatic,AWD,Gasoline,,,,lexus_rx_2018.jpg
2,2,862,,43572.0,40589.0,2021,jeep,wrangler,SUV,4 Doors,/,6 Cyl,Automatic,4WD,Gasoline,,,,jeep_wrangler_2021.jpg
3,3,0,,14172.0,32589.0,2022,subaru,wrx,Sedan,4 Doors,19 City / 26 Hwy,4 Cyl,Manual,AWD,Gasoline,,,,subaru_wrx_2022.jpg
4,4,1,,4955.0,29999.0,2022,subaru,wrx,Sedan,4 Doors,19 City / 26 Hwy,4 Cyl,Manual,AWD,Gasoline,,,,subaru_wrx_2022.jpg


Functions to aid pre-processing

In [16]:
def get_mpg(m):
    '''do a regex search and extract miles per gallon'''
    pattern = r'\d+' 
    match = re.findall(pattern, m)
    if match:
        return match
    else:
        return [0,0]

def get_tanksize(tank):
    '''Find tank size'''
    try:
        return float(tank.split(' ')[0])
    except:
        return 0

Appropriate type conversions

In [17]:
df = df[~df.cd_mileage_miles.isna()]

df['cd_mileage_miles'] = df.cd_mileage_miles.astype(float)
df['year'] = df.cd_year.astype(int)

df['cd_doors'] = df.cd_doors.map(lambda x: int(x.split(' ')[0]))

df['mpg_city'] = df['cd_mpg'].apply(lambda x: float(get_mpg(x)[0]))
df['mpg_hwy'] = df['cd_mpg'].apply(lambda x: float(get_mpg(x)[1]))


Tank size Imputation

In [18]:
# Before Impute
df.cd_tank_size.isna().sum()

1966

In [19]:
df_ts = df[['cd_make','cd_model','cd_tank_size']].sort_values(by='cd_tank_size').drop_duplicates(['cd_make','cd_model'])

df = pd.merge(df,df_ts,on=['cd_make','cd_model'])

df['cd_tanksize'] = df[['cd_tank_size_x','cd_tank_size_y']].apply(lambda x: x[0] if pd.notna(x[0]) else x[1],axis=1)



In [20]:
df['cd_tanksize'] = df.cd_tanksize.map(lambda x: get_tanksize(x))

In [21]:
#After Impute
df.cd_tanksize.isna().sum()

0

In [22]:
df = df[['cd_id','cd_mileage_miles','cd_year','cd_car_price','cd_make', 'cd_model', 'cd_body_style', 'cd_doors',
       'cd_engine', 'cd_transmission', 'cd_drive_type', 'cd_fuel',  'mpg_city', 'mpg_hwy', 'cd_tanksize']]

Label Encoding our Categorical Features

In [23]:


encoder_make = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_make'] = encoder_make.fit_transform(df['cd_make'])



encoder_model = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_model'] = encoder_model.fit_transform(df['cd_model'])
encoder_engine = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_engine'] = encoder_engine.fit_transform(df['cd_engine'])
encoder_transmission = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_transmission'] = encoder_transmission.fit_transform(df['cd_transmission'])
encoder_drive_type = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_drive_type'] = encoder_drive_type.fit_transform(df['cd_drive_type'])
encoder_body_style = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_body_style'] = encoder_body_style.fit_transform(df['cd_body_style'])

encoder_fuel = LabelEncoder()  # Set sparse=False to get a dense matrix

# Fit and transform the data
df['enc_make'] = encoder_fuel.fit_transform(df['cd_fuel'])

In [24]:
all_feats = df[['cd_id','cd_mileage_miles','cd_year',
                'cd_car_price','mpg_city', 'mpg_hwy',
                'cd_tanksize','enc_make', 'enc_model',
                'enc_engine', 'enc_transmission',
                'enc_drive_type', 'enc_body_style']]

In [25]:
al = all_feats.copy()

# Similarity Matrix
- Here we find the 10 most similar cars for any given car.
- The similarity metric using here is the cosine between the vectors.
- we end up with a n x 11 Matrix, where 
    - n : Number of cars in the inventory(rows of our dataset)
    - 11 columns : Car_id of a car and its 10 most similar cars to recommend.

In [26]:
similiarity = {} # hashmap to store similar cars
for v1 in tqdm(al.values):
    sim_ = []
    v1_type = v1[-1] #type of car 
    idx1 = v1[0] #id of car
    vx = v1[1:] # feature vector of car
    idx_sim = []
    for v2 in al[al.enc_body_style == v1_type].values: #all possible cars in inventory with same body style
        idx_sim.append(v2[0])
        vy = v2[1:]
        sim = cosine_similarity(v1.reshape(1,-1),v2.reshape(1,-1)) #cos-sim
        sim = sim[0][0]
        sim_.append(sim)
    top_10_indices = np.argsort(sim_)[::-1][1:11] # Top 10 sims
    top_10_indices = [idx_sim[i] for i in top_10_indices] #Save car id
    similiarity[idx1] = top_10_indices

100%|██████████| 2896/2896 [08:00<00:00,  6.03it/s]


In [27]:
sims = pd.DataFrame(similiarity).T # Transforming dict to useable dict
sims.columns = ['si_' + str(i) for i in sims.columns] # Renaming columns for db

sims['cd_id'] = sims.index
sims.reset_index(drop=True,inplace=True)

sims = sims[[sims.columns[-1]] + list(sims.columns[:-1])]

### Our Similarity Matrix finally looks like : 

In [28]:
sims

Unnamed: 0,cd_id,si_0,si_1,si_2,si_3,si_4,si_5,si_6,si_7,si_8,si_9
0,822.0,768.0,766.0,749.0,946.0,1529.0,1516.0,1481.0,1429.0,856.0,1156.0
1,823.0,774.0,1504.0,1428.0,911.0,1103.0,810.0,732.0,1818.0,1180.0,1460.0
2,824.0,1083.0,778.0,1102.0,746.0,1423.0,827.0,1394.0,1500.0,724.0,1403.0
3,1065.0,1030.0,834.0,1417.0,1057.0,1563.0,1536.0,945.0,758.0,1602.0,1298.0
4,1066.0,1322.0,1190.0,1035.0,1187.0,773.0,743.0,887.0,1318.0,958.0,783.0
...,...,...,...,...,...,...,...,...,...,...,...
2891,2882.0,2227.0,2367.0,2874.0,2356.0,2226.0,2324.0,2229.0,2231.0,2276.0,2888.0
2892,2883.0,2893.0,2310.0,2879.0,2878.0,2885.0,202.0,2256.0,2880.0,2288.0,2286.0
2893,2896.0,2830.0,2890.0,2263.0,2881.0,2872.0,2871.0,2875.0,2889.0,2357.0,2339.0
2894,2889.0,2871.0,2872.0,2357.0,2830.0,2896.0,2890.0,2263.0,2881.0,2875.0,2339.0


## Saving it to push the precomputed similarities to the database.

In [29]:
sims.to_csv('./data_out/similarity_matrix2.csv')