In [17]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [65]:
# Encode input text and get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
    return embeddings

In [66]:
# Calculate cosine similarity between two vectors
def cosine_similarity_vec(vec1, vec2):
    return cosine_similarity(vec1.detach().numpy(), vec2.detach().numpy())


In [67]:
# Example usage
input_text = "I like to eat mac and cheese"
input_embedding = get_bert_embedding(input_text)


In [68]:
dataset = ["I don't like to eat mac and cheese", "What is it?",
           "I like to eat cheese", "I like to eat mac and cheese"]

In [69]:
# Calculate cosine similarity with other items
similarities = {}
for other_item in dataset:
    other_embedding = get_bert_embedding(other_item)
    similarity_score = cosine_similarity_vec(input_embedding, other_embedding)
    similarities[other_item] = similarity_score


In [70]:
# Get top N recommendations based on cosine similarity
top_recommendations = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:4]
print("Top recommendations:", top_recommendations)

Top recommendations: [('I like to eat mac and cheese', array([[1.]], dtype=float32)), ("I don't like to eat mac and cheese", array([[0.9144159]], dtype=float32)), ('I like to eat cheese', array([[0.8895547]], dtype=float32)), ('What is it?', array([[0.46155483]], dtype=float32))]


In [83]:
# Define weights for criteria
weights = {
    'gender': 0.6,
    'actor': 0.4,
    # Add more criteria and adjust weights accordingly
}

In [95]:
# Example criteria values for the input
input_criteria = {
    'gender': 'Male',
    'actor': 'Tom Hanks',
    # Add more criteria values as needed
}

In [96]:
dataset = [{'gender': 'Female', 'actor': 'Tom Hanks', 'description':'I like mac and cheese'},
           {'gender': 'Male', 'actor': 'Tom Hanks', 'description':'I like cheese'},
           {'gender': 'Male', 'actor': 'Tom Hanks', 'description':'What is is?'}]

In [97]:
adjusted_similarities = {}
for item in dataset:
    text = item['description']  # Extract text from the dictionary
    other_embedding = get_bert_embedding(text)
    similarity_score = cosine_similarity_vec(input_embedding, other_embedding)

    # Apply adjustments based on criteria
    adjusted_score = similarity_score
    for criterion, weight in weights.items():
        if criterion in input_criteria and input_criteria[criterion] == item.get(criterion):
            adjusted_score += weight * 0.1  # Adjust by a factor based on criterion importance

    adjusted_similarities[text] = adjusted_score  # Store similarity scores by text


In [98]:
# Get top N recommendations based on adjusted similarity scores
top_recommendations = sorted(adjusted_similarities.items(), key=lambda x: x[1], reverse=True)[:3]
print("Top recommendations:", top_recommendations)

Top recommendations: [('I like mac and cheese', array([[0.97068244]], dtype=float32)), ('I like cheese', array([[0.8526221]], dtype=float32)), ('What is is?', array([[0.5788031]], dtype=float32))]
