In [1]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import torch
import math

In [2]:
# Read the car-review dataset and keep only the rows that have review text.
cars = pd.read_csv("../data/final_car_reviews.csv").dropna(subset=["Review"])
cars.head(2)

Unnamed: 0,Vehicle_Title,Review_Title,Review,Rating
0,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",beetle convertible 45 year andhave overall hap...,4.5
1,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,bought car new 2007 generally satisfied mechan...,4.5


In [3]:
# The titles are what gets embedded below. Only the "Review" column was
# null-filtered when the data was loaded, so missing titles are dropped here
# and every entry is coerced to str: a NaN (float) is not valid tokenizer input.
reviews_list = cars["Review_Title"].dropna().astype(str).tolist()

In [10]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that token ids and the embedding table agree.
model_name = "bert-base-uncased"
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

In [5]:
# if torch.cuda.is_available():
#     print("GPU is available.")
#     device = torch.device("cuda")  # Use GPU
#     print("GPU device:", torch.cuda.get_device_name(0))  # Print GPU name
# else:
#     print("GPU is not available. Switching to CPU.")
#     device = torch.device("cpu")  # Use CPU

GPU is available.
GPU device: Tesla T4


In [None]:
# model.to(device)

In [11]:
def tokenize_in_batches(texts, batch_size, max_length=None):
    """Tokenize ``texts`` in consecutive chunks of at most ``batch_size`` items.

    Each chunk is padded to the longest sequence within that chunk, so tensors
    belonging to different chunks may have different widths. Sequences are
    truncated so an over-long text cannot exceed the model's input limit.

    Relies on the module-level ``tokenizer``.

    Args:
        texts: Sequence of strings to tokenize.
        batch_size: Maximum number of texts per chunk; must be positive.
        max_length: Optional cap on the number of tokens per sequence. ``None``
            leaves the limit to the tokenizer's own configured maximum.

    Returns:
        A list holding one ``input_ids`` tensor per chunk, in input order.
        The list is empty when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    tokenized_inputs = []
    # Stepping by batch_size yields every chunk start; slicing clamps the final
    # (possibly shorter) chunk automatically.
    for start_idx in range(0, len(texts), batch_size):
        chunk = list(texts[start_idx:start_idx + batch_size])
        encoded = tokenizer(
            chunk,
            add_special_tokens=True,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        tokenized_inputs.append(encoded["input_ids"])

    return tokenized_inputs

# Pre-tokenize every review title, eight titles per batch.
batched_inputs = tokenize_in_batches(texts=reviews_list, batch_size=8)

In [None]:
def generate_embeddings(batched_inputs, model):
    """Compute one first-token embedding per sequence, batch by batch.

    For every batch the model is run without gradient tracking and the hidden
    state at sequence position 0 is kept for each row.

    Args:
        batched_inputs: Iterable of ``input_ids`` tensors, one per batch
            (assumed to be shaped ``(batch, seq_len)`` and padded per batch --
            confirm against the tokenization step).
        model: Callable whose result, indexed with ``[0]``, is a 3-D tensor of
            hidden states. If it exposes ``config.pad_token_id``, an attention
            mask is derived from it so padded positions are ignored.

    Returns:
        A list with one 2-D NumPy array per batch, in input order. Empty when
        ``batched_inputs`` is empty.
    """
    # Only input ids are available here, so the attention mask is rebuilt from
    # the pad token id; without a mask the encoder would attend to padding.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    all_embeddings = []
    with torch.no_grad():
        for batch_inputs in batched_inputs:
            if pad_token_id is None:
                outputs = model(batch_inputs)
            else:
                attention_mask = (batch_inputs != pad_token_id).long()
                outputs = model(batch_inputs, attention_mask=attention_mask)
            # Collect inside the loop so every batch is kept, not just the last.
            all_embeddings.append(outputs[0][:, 0, :].cpu().numpy())

    return all_embeddings

# Run the encoder over the pre-tokenized batches; the result holds one
# embedding array per batch.
embeddings_per_batch = generate_embeddings(batched_inputs=batched_inputs, model=model)

In [None]:
def get_recommendations_with_bert(input_text, car_reviews, top_k=None):
    """Print ``car_reviews`` ranked by cosine similarity to ``input_text``.

    The query is embedded with the module-level ``tokenizer`` and ``model`` and
    compared against the module-level ``embeddings_per_batch``.

    Args:
        input_text: Free-text search query.
        car_reviews: Sequence of texts; entry ``i`` must correspond to row ``i``
            of the stacked corpus embeddings.
        top_k: Optional number of results to print. ``None`` (the default)
            prints every entry, most similar first.

    Raises:
        ValueError: If no corpus embeddings are available, or if their row
            count differs from ``len(car_reviews)``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    if len(embeddings_per_batch) == 0:
        raise ValueError("embeddings_per_batch is empty; generate the embeddings first")

    # cosine_similarity needs a single 2-D matrix, not a list of per-batch arrays.
    corpus_embeddings = np.vstack(embeddings_per_batch)
    if corpus_embeddings.shape[0] != len(car_reviews):
        raise ValueError(
            f"Got {corpus_embeddings.shape[0]} corpus embeddings for "
            f"{len(car_reviews)} reviews; they must line up one-to-one"
        )

    query_tokens = tokenizer.encode(
        input_text,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=True,
    )
    with torch.no_grad():
        query_embedding = model(query_tokens)[0][:, 0, :].numpy()

    similarity_scores = cosine_similarity(query_embedding, corpus_embeddings)

    # Row 0 is the only query; reverse the ascending argsort for best-first order.
    ranked_indices = similarity_scores[0].argsort()[::-1]
    if top_k is not None:
        ranked_indices = ranked_indices[:top_k]
    top_recommendations = [car_reviews[i] for i in ranked_indices]

    print("Top recommendations based on input search:")
    for idx, recommendation in enumerate(top_recommendations, start=1):
        print(f"{idx}. {recommendation}")

In [None]:
# Example query: rank the review titles against a free-text request.
input_text = "I want a big family car"
get_recommendations_with_bert(input_text=input_text, car_reviews=reviews_list)