In [2]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [3]:
# Location of the car-review dataset, relative to the notebook.
path = "../data/final_car_reviews.csv"

# Read the CSV and discard rows with no review text in a single
# non-mutating chain, so re-running this cell always yields the same frame.
cars = pd.read_csv(path).dropna(subset=["Review"])

# Preview the first two rows.
cars.head(2)

Unnamed: 0,Vehicle_Title,Review_Title,Review,Rating
0,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",beetle convertible 45 year andhave overall hap...,4.5
1,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,bought car new 2007 generally satisfied mechan...,4.5


In [5]:
# Text corpus that will be embedded and ranked: the review titles.
# Only the "Review" column was null-filtered when the frame was loaded, so
# titles can still be missing; a NaN (float) entry makes the tokenizer raise.
# Drop missing titles and force every remaining entry to `str`.
reviews_list = cars["Review_Title"].dropna().astype(str).tolist()

In [6]:
# Checkpoint identifier shared by the tokenizer and the encoder so the
# vocabulary and the weights always come from the same pretrained model.
model_name = "bert-base-uncased"

# Tokenizer first, then the encoder (same load order as before).
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def _embed_cls(texts, batch_size):
    """Return the [CLS] hidden state of each text as a 2-D NumPy array (one row per text)."""
    chunks = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            add_special_tokens=True,
            padding=True,      # pad only to the longest text of this batch
            truncation=True,   # clip texts longer than the model's maximum length
            return_tensors="pt",
        )
        with torch.no_grad():
            # Unpack the whole encoding so `attention_mask` reaches the model;
            # passing `input_ids` alone lets [PAD] tokens be attended to and
            # distorts the embeddings of every text shorter than the longest.
            hidden_states = model(**encoded)[0]
        # Position 0 of each sequence is the [CLS] token.
        chunks.append(hidden_states[:, 0, :])
    return torch.cat(chunks, dim=0).numpy()


def get_recommendations_with_bert(input_text, car_reviews, top_k=None, batch_size=32):
    """Print the entries of `car_reviews` ranked by similarity to `input_text`.

    Each text is embedded with the notebook-level `tokenizer` / `model`
    (the hidden state of the [CLS] token is used as the text embedding) and
    candidates are ordered by descending cosine similarity to the query.

    Parameters
    ----------
    input_text : str
        The free-text search query.
    car_reviews : sequence of str
        Candidate texts to rank.
    top_k : int or None, optional
        Print only the `top_k` best matches. `None` (default) prints every
        candidate, which is the original behaviour.
    batch_size : int, optional
        Number of candidate texts encoded per forward pass. Encoding in
        batches keeps memory bounded instead of pushing the whole corpus
        through the model at once.

    Returns
    -------
    None
        The ranking is written to standard output.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1 or `top_k` is negative.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be None or a non-negative integer")

    # Materialise once so slicing and positional indexing behave the same for
    # lists, tuples and pandas Series alike.
    car_reviews = list(car_reviews)

    if not car_reviews:
        # Nothing to rank: emit the header only instead of crashing in the tokenizer.
        print("Top recommendations based on input search:")
        return

    query_embedding = _embed_cls([input_text], batch_size)
    car_embeddings = _embed_cls(car_reviews, batch_size)

    # Shape (1, n_candidates): similarity of the single query to each candidate.
    similarity_scores = cosine_similarity(query_embedding, car_embeddings)

    # Ascending argsort reversed -> best match first; `[:None]` keeps everything.
    ranked_indices = similarity_scores[0].argsort()[::-1][:top_k]

    print("Top recommendations based on input search:")
    for idx, review_index in enumerate(ranked_indices, start=1):
        print(f"{idx}. {car_reviews[review_index]}")

In [None]:
# Free-text query describing the kind of car the user is after.
input_text = "I want a big family car"

# Rank the candidate texts against the query and print the ordering.
get_recommendations_with_bert(
    input_text=input_text,
    car_reviews=reviews_list,
)