In [7]:
import pandas as pd
import numpy as np
import torch
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Sentence encoder used for both whole-answer and keyword-phrase embeddings.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
# Alternative checkpoint (disabled): SentenceTransformer('model/multiple_negatives')
rake = Rake()

data = pd.read_csv("data_science_questions.csv")
data = data.dropna(subset=["Topic"])

question = "What is backpropagation?"
# Select the matching rows once instead of re-filtering the frame per column.
question_rows = data[data["Question"] == question]
if question_rows.empty:
    # Fail with a descriptive message rather than an IndexError from `.values[0]`.
    raise ValueError(f"No row with a non-null Topic matches question: {question!r}")

# Drop missing answers: a NaN entry is a float and would break the
# `answer.replace(...)` call made during preprocessing.
reference_answers = question_rows["Answer"].dropna().str.lower().tolist()
# Topic is a comma-separated list. Strip padding around each term ("a, b") and
# discard empty terms so the later substring removal matches the actual words.
topic = [term.strip() for term in question_rows["Topic"].values[0].lower().split(",")]
topic = [term for term in topic if term]
user_answer = "Backpropagation is a way of determining the structure of a neural network by deciding the number of layers and nodes. It is unrelated to optimizing weights or reducing error after the model is trained."
user_answer = user_answer.lower()

def extract_keywords(text, min_length=2, max_length=6):
    """Return RAKE's ranked phrases for ``text`` whose word count is within bounds.

    Args:
        text: The text to mine for key phrases.
        min_length: Fewest whitespace-separated words a phrase may contain (inclusive).
        max_length: Most whitespace-separated words a phrase may contain (inclusive).

    Returns:
        list[str]: The qualifying phrases, in the order produced by
        ``rake.get_ranked_phrases()``.
    """
    # The module-level `rake` instance keeps the state of the last extraction.
    rake.extract_keywords_from_text(text)
    selected = []
    for phrase in rake.get_ranked_phrases():
        word_count = len(phrase.split())
        if min_length <= word_count <= max_length:
            selected.append(phrase)
    return selected

def get_keyword_embeddings(keywords):
    """Encode keyword phrases with the module-level sentence model.

    Args:
        keywords: List of phrases to embed.

    Returns:
        The tensor returned by ``model.encode(..., convert_to_tensor=True)``,
        or a single all-zero row of shape ``(1, embedding_dim)`` when
        ``keywords`` is empty.
    """
    if keywords:
        return model.encode(keywords, convert_to_tensor=True)
    # No phrases to encode: hand back one zero vector as a placeholder.
    embedding_dim = model.get_sentence_embedding_dimension()
    return torch.zeros((1, embedding_dim))

def aggregate_embeddings(embeddings):
    """Collapse a stack of embeddings into one vector by averaging over dim 0.

    Args:
        embeddings: Tensor whose first dimension indexes the individual embeddings.

    Returns:
        The mean over dimension 0, or a zero vector of the model's embedding
        size when the first dimension is empty.
    """
    has_rows = embeddings.size(0) != 0
    if has_rows:
        return embeddings.mean(dim=0)
    # Nothing to average: fall back to a zero vector of the model's width.
    return torch.zeros(model.get_sentence_embedding_dimension())

def preprocess_reference_answers(reference_answers):
    """Remove every term of the module-level ``topic`` list from each reference answer.

    Args:
        reference_answers: List of answer strings.

    Returns:
        The answers with all occurrences of each topic term replaced by "".
    """
    cleaned = reference_answers
    for term in topic:
        # Each pass removes one topic term from every answer.
        cleaned = [text.replace(term, "") for text in cleaned]
    return cleaned

def preprocess_user_answer(user_answer):
    """Remove every term of the module-level ``topic`` list from the user's answer.

    Args:
        user_answer: The answer string to clean.

    Returns:
        The answer with all occurrences of each topic term replaced by "".
    """
    cleaned = user_answer
    for term in topic:
        cleaned = cleaned.replace(term, "")
    return cleaned

def calculate_keyword_similarity(user_answer, reference_answers):
    """Score the user's answer against each reference via keyword-phrase embeddings.

    For every text, key phrases are extracted, embedded, and mean-pooled into
    one vector; the user's vector is then compared to each reference vector
    with cosine similarity.

    Args:
        user_answer: The user's answer text.
        reference_answers: List of reference answer texts.

    Returns:
        list: One cosine-similarity score per reference answer, in input order.
    """
    def pooled_keyword_vector(text):
        # text -> key phrases -> phrase embeddings -> single averaged vector
        return aggregate_embeddings(get_keyword_embeddings(extract_keywords(text)))

    # sklearn expects 2-D NumPy input, so add a batch axis and move to CPU.
    user_vec = pooled_keyword_vector(user_answer).unsqueeze(0).cpu().numpy()

    scores = []
    for answer in reference_answers:
        ref_vec = pooled_keyword_vector(answer).unsqueeze(0).cpu().numpy()
        scores.append(cosine_similarity(user_vec, ref_vec)[0][0])
    return scores

def calculate_similarity(user_answer, reference_answers):
    """Compare the user's answer with each reference using whole-text embeddings.

    Args:
        user_answer: The user's answer text.
        reference_answers: List of reference answer texts.

    Returns:
        The first (only) row of the cosine-similarity matrix: one score per
        reference answer, in input order.
    """
    user_vec = model.encode(user_answer)
    reference_matrix = model.encode(reference_answers)
    # Wrap the single user vector in a list so it is treated as a 1-row matrix.
    return cosine_similarity([user_vec], reference_matrix)[0]

# Remove the topic terms from both sides before scoring
# (presumably so shared topic words do not dominate the similarity; confirm intent).
reference_answers = preprocess_reference_answers(reference_answers)
user_answer = preprocess_user_answer(user_answer)

# Show the key phrases extracted from the cleaned user answer.
user_keywords = extract_keywords(user_answer)
print(user_keywords)

# Best whole-answer match across all reference answers.
sentence_scores = calculate_similarity(user_answer, reference_answers)
similarity = max(sentence_scores)

# Best keyword-level match across all reference answers.
keyword_scores = calculate_keyword_similarity(user_answer, reference_answers)
keyword_similarity = max(keyword_scores)

print(similarity)
print(keyword_similarity)
# Final score: equal-weight blend of the two similarities.
print(similarity * 0.5 + keyword_similarity * 0.5)

['reducing error', 'optimizing weights', 'neural network']
0.6887084
0.7833654
0.7360369265079498


In [2]:
import nltk

In [2]:
import pandas as pd
# Reload the question bank and overwrite the Topic of one specific question.
data = pd.read_csv("data_science_questions.csv")
question = "What is the exploration-exploitation dilemma in reinforcement learning?"
# Boolean mask selecting every row whose Question equals the target text.
is_target_row = data['Question'] == question
data.loc[is_target_row, 'Topic'] = "Reinforcement Learning,exploration-exploitation dilemma"

In [None]:
# Write the current `data` frame back over the source CSV; index=False leaves
# the DataFrame's row index out of the file.
data.to_csv("data_science_questions.csv", index=False)

In [None]:
import sentence_transformers
# Print the installed sentence-transformers version.
print(sentence_transformers.__version__)

2.2.2
