In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/content/drive')

vehicle_data_path = '/content/drive/My Drive/vehicle_dataset/final.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Now that the data has been cleaned, the recommender system can be engaged. This will be a contenet-based recommender system where recommendations are based on the similarity of the vehicle features and not on explicit (user preference or profile) user metadata.


In [6]:
# data load
car_data = pd.read_csv(vehicle_data_path)
car_data.head(5)

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,24.0,29.0,0.0,4.0,2.0,284000.0,0.0,0.0,2.0,2.0,0.0,9.0,4.0,4000
1,21.0,6.0,2.0,6.0,2.0,284000.0,0.0,0.0,1.0,1.194993,5.0,1.0,40.0,2400
2,21.0,8.0,1.291587,5.891416,0.0,284000.0,0.0,0.0,0.726873,1.0,0.0,10.0,38.0,8900
3,10.0,8.0,2.0,5.891416,2.0,284000.0,0.0,0.0,0.726873,1.194993,5.385383,5.612094,14.0,7988
4,10.0,8.0,1.291587,5.891416,2.0,284000.0,0.0,0.0,0.726873,1.194993,5.385383,5.612094,48.0,7988


In [7]:
# calculateing popularity by manufacturer
def calc_popularity(column_name, data):
    # Calculate popularity score
    popularity_score = data[column_name].value_counts(normalize=True).reset_index()
    popularity_score.columns = ['manufacturer', 'popularity_ranking']
    popularity_score['popularity_ranking'] = popularity_score.index + 1  # Add 1 to start ranking from 1
    return popularity_score

In [8]:
# Calculate popularity score for 'manufacturer' column
popularity = calc_popularity('manufacturer', car_data)
popularity.head()

Unnamed: 0,manufacturer,popularity_ranking
0,8.0,1
1,5.0,2
2,29.0,3
3,10.0,4
4,22.0,5


In [9]:
# merge the original dataframe with the new popularity score data frame, to get new column as popularity score
car_data=pd.merge(car_data,popularity,on='manufacturer',how='inner')
car_data.head()

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price,popularity_ranking
0,24.0,29.0,0.0,4.0,2.0,284000.0,0.0,0.0,2.0,2.0,0.0,9.0,4.0,4000,3
1,30.0,29.0,2.0,4.0,2.0,284000.0,0.0,0.0,0.726873,1.194993,5.385383,5.612094,3.0,1500,3
2,17.0,29.0,2.0,6.0,3.0,284000.0,0.0,0.0,0.726873,1.194993,0.0,9.0,17.0,4500,3
3,16.0,29.0,0.0,4.0,2.0,283000.0,0.0,0.0,1.0,1.0,4.0,1.0,15.0,3900,3
4,24.0,29.0,2.0,4.0,2.0,283000.0,0.0,0.0,1.0,2.0,8.0,3.0,4.0,750,3


In [10]:
# calculateing popularity by manufacturer
def recommend_cars(price):
    # Filter dataset for cars within the given price range
    filtered_car_data = car_data[car_data['price'] <= price]

    # Group by manufacturer and calculate average popularity score
    popularity_scores_filtered = filtered_car_data.groupby('manufacturer').size() / len(filtered_car_data)

    # Sort by popularity score and get top 3 manufacturers
    top_manufacturers = popularity_scores_filtered.sort_values(ascending=False).head(3).index.tolist()

    # Get top car for each manufacturer
    recommendations = []
    for manufacturer in top_manufacturers:
        top_car = filtered_car_data[filtered_car_data['manufacturer'] == manufacturer].head(1)
        recommendations.append(top_car[['manufacturer', 'price', 'popularity_ranking']].to_dict('records')[0])

    return recommendations


In [11]:
# Example usage
user_price = 10000
recommended_cars = recommend_cars(user_price)
print("Top 3 recommended cars within $", user_price, ":\n", recommended_cars)

Top 3 recommended cars within $ 10000 :
 [{'manufacturer': 8.0, 'price': 8900, 'popularity_ranking': 1}, {'manufacturer': 5.0, 'price': 4800, 'popularity_ranking': 2}, {'manufacturer': 29.0, 'price': 4000, 'popularity_ranking': 3}]


In [29]:
from sklearn.metrics.pairwise import cosine_similarity #cosine_similarity function is used to check the similarity in the

# define function which will return recomended cars
def recommend(manufacturer, paint_color, car_type, price_range):
    '''
    data set: car_data
    parameters: manufacturer, paint_color, car_type, price_range
    return: dataframe containing the top similar cars
    '''

    # filter data
    data = car_data.loc[
        (car_data['paint_color'] == paint_color) &
        (car_data['type'] == car_type) &
        ((car_data['price'] >= price_range[0]) & (car_data['price'] <= price_range[1]))
    ]
    data.reset_index(level=0, inplace=True)

    # Convert the index into a series
    indices = pd.Series(data.index, index=data['manufacturer'])

    # calculating the similarity measure using cosine_similarity
    sg = cosine_similarity(data[['year', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
                                  'transmission', 'drive', 'size', 'popularity_ranking']])

    # Get the index corresponding to the original manufacturer
    idx = indices[manufacturer]

    # Get the pairwise similarity scores
    sig = list(enumerate(sg[idx]))

    # Sort the cars
    sig = sorted(sig, reverse=True)

    # Scores of the 6 most similar cars
    sig = sig[0:6]

    # Car indices
    car_indices = [i[0] for i in sig]

    # Top 6 car recommendations
    recommendations = data[['price', 'manufacturer', 'type', 'year', 'condition', 'fuel', 'title_status',
                            'transmission', 'paint_color', 'state']].iloc[car_indices]
    return recommendations


In [30]:
recommend(manufacturer=29.0,paint_color=9.000000,car_type=0.000000,price_range=(5000,10000))

Unnamed: 0,price,manufacturer,type,year,condition,fuel,title_status,transmission,paint_color,state
233,8495,29.0,0.0,15.0,3.0,2.0,0.0,0.0,9.0,21.0
232,8495,29.0,0.0,15.0,0.0,2.0,0.0,0.0,9.0,39.0
231,8995,29.0,0.0,15.0,0.0,2.0,0.0,0.0,9.0,5.0
230,7950,29.0,0.0,15.0,2.0,2.0,0.0,0.0,9.0,27.0
229,7800,29.0,0.0,13.0,2.0,2.0,0.0,0.0,9.0,10.0
228,7200,29.0,0.0,23.0,3.0,2.0,0.0,0.0,9.0,11.0
