In [None]:
#Word2Vec

import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')


df = pd.read_csv('')  # Update with your actual file path

# Build the training corpus from every definition column that is compared
# further down. calculate_similarity() discards tokens that are missing from
# the model vocabulary, so training on only the first two columns silently
# strips most of the third/fourth definitions and yields degenerate scores
# (exactly 0.0 or 1.0).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
definition_columns = ['first_definition', 'second_definition', 'third_definition', 'fourth_definition']
all_definitions = pd.concat([df[column] for column in definition_columns], ignore_index=True)

# One lower-cased token list per definition; this is the "sentence" format
# handed to Word2Vec below.
tokenized_definitions = [word_tokenize(definition.lower()) for definition in all_definitions]

# min_count=1 keeps every token seen in the corpus in the vocabulary.
model = Word2Vec(sentences=tokenized_definitions, vector_size=300, window=2, min_count=1, workers=4)


def calculate_similarity(definition1, definition2):
    """Cosine similarity between the averaged Word2Vec vectors of two texts.

    Each text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not in the module-level ``model`` vocabulary are dropped before the
    remaining word vectors are averaged.

    Args:
        definition1: First text.
        definition2: Second text.

    Returns:
        The cosine similarity of the two averaged vectors, or ``0.0`` when
        either text has no in-vocabulary token left.
    """
    vocabulary = model.wv.key_to_index

    def _mean_vector(text):
        # Average the vectors of the known tokens; None marks "nothing usable".
        known_tokens = [tok for tok in word_tokenize(text.lower()) if tok in vocabulary]
        if not known_tokens:
            return None
        return model.wv[known_tokens].mean(axis=0)

    vector_a = _mean_vector(definition1)
    vector_b = _mean_vector(definition2)

    if vector_a is None or vector_b is None:
        return 0.0

    # cosine_similarity works on 2-D inputs and returns a matrix; [0][0] is
    # the single pairwise score.
    return cosine_similarity([vector_a], [vector_b])[0][0]


# For every row, score the three consecutive definition pairs
# (first/second, second/third, third/fourth) as one tuple.
similarities = [
    (
        calculate_similarity(row['first_definition'], row['second_definition']),
        calculate_similarity(row['second_definition'], row['third_definition']),
        calculate_similarity(row['third_definition'], row['fourth_definition']),
    )
    for _, row in df.iterrows()
]

# Transpose the per-row tuples into three per-column sequences.
df['similarity_1_2'], df['similarity_2_3'], df['similarity_3_4'] = zip(*similarities)

print(df[['word', 'similarity_1_2', 'similarity_2_3', 'similarity_3_4']])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


          word  similarity_1_2  similarity_2_3  similarity_3_4
0         Nice        0.365742        0.588511        1.000000
1         Girl        0.258573        0.336131        0.021396
2          Gay        0.021771        0.017718        0.000000
3        Awful       -0.035649       -0.042638        0.595060
4  Manufacture        0.723256        0.233119        0.423240
5      Villain        0.333270        0.262052        0.641448
6         Meat        0.279764        0.692848        0.615773
7        Silly        0.682956        0.583736       -0.067588
8       Cursor        0.629800        0.599075        1.000000
9          Guy        0.295790        0.055928       -0.027981


  tokenized_definitions = [word_tokenize(definition.lower()) for definition in df['first_definition'].append(df['second_definition'])]


In [None]:
#BERT

import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


# Single checkpoint name shared by the tokenizer and the encoder so the two
# are always loaded from the same pretrained weights.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Input table; the loop below reads the 'word' column and the four
# '*_definition' columns from it.
DATA_PATH = ''  # Update with your actual file path
df = pd.read_csv(DATA_PATH)


def _embed_definition(definition):
    """Return the mean-pooled BERT embedding of one definition.

    Args:
        definition: Text passed to the module-level ``tokenizer``.

    Returns:
        A NumPy array obtained by averaging ``last_hidden_state`` over the
        token dimension (``dim=1``).
    """
    # Local import: this cell imports transformers but never torch itself.
    import torch

    # truncation=True clips inputs that exceed the tokenizer's maximum length
    # instead of handing an over-long sequence to the model.
    encoded = tokenizer(definition, return_tensors="pt", truncation=True)

    # Inference only: skip autograd bookkeeping, so no .detach() is required
    # before converting to NumPy.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return hidden_states.mean(dim=1).numpy()


def calculate_similarity(definition1, definition2):
    """Cosine similarity between the mean-pooled BERT embeddings of two texts.

    Args:
        definition1: First text.
        definition2: Second text.

    Returns:
        The single pairwise cosine similarity score.
    """
    embedding_definition1 = _embed_definition(definition1)
    embedding_definition2 = _embed_definition(definition2)

    # cosine_similarity returns a matrix; [0][0] is the only entry for one
    # embedding on each side.
    similarity = cosine_similarity(embedding_definition1, embedding_definition2)[0][0]
    return similarity


def _score_adjacent_definitions(row):
    """Return the (1-2, 2-3, 3-4) similarity tuple for one DataFrame row."""
    first = row['first_definition']
    second = row['second_definition']
    third = row['third_definition']
    fourth = row['fourth_definition']
    return (
        calculate_similarity(first, second),
        calculate_similarity(second, third),
        calculate_similarity(third, fourth),
    )


# One tuple of three scores per row, in row order.
similarities = [_score_adjacent_definitions(row) for _, row in df.iterrows()]

# Transpose the row tuples into the three result columns.
df['similarity_1_2'], df['similarity_2_3'], df['similarity_3_4'] = zip(*similarities)

print(df[['word', 'similarity_1_2', 'similarity_2_3', 'similarity_3_4']])


In [None]:
#GloVe
import pandas as pd
import spacy


# NOTE(review): this cell is labelled "GloVe", but the vectors are whatever the
# installed 'en_core_web_md' package ships — confirm they are GloVe-based.
SPACY_MODEL = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL)

# Input table; the loop below reads the 'word' column and the four
# '*_definition' columns from it.
DATA_PATH = ''  # Update with your actual file path
df = pd.read_csv(DATA_PATH)


def calculate_similarity(definition1, definition2):
    """Cosine similarity between the spaCy document vectors of two texts.

    Args:
        definition1: First text, processed with the module-level ``nlp``.
        definition2: Second text, processed with the module-level ``nlp``.

    Returns:
        The single pairwise cosine similarity score.
    """
    # Imported here because this cell's import section does not bring in
    # cosine_similarity; without it the function only works when an earlier
    # cell happened to import the name into the kernel.
    from sklearn.metrics.pairwise import cosine_similarity

    embedding_definition1 = nlp(definition1).vector
    embedding_definition2 = nlp(definition2).vector

    # Wrap each 1-D vector in a list to form the 2-D input cosine_similarity
    # works on; [0][0] extracts the only entry of the result matrix.
    similarity = cosine_similarity([embedding_definition1], [embedding_definition2])[0][0]
    return similarity


# Consecutive definition columns to compare: first/second, second/third,
# third/fourth.
column_pairs = [
    ('first_definition', 'second_definition'),
    ('second_definition', 'third_definition'),
    ('third_definition', 'fourth_definition'),
]

similarities = []
for _, record in df.iterrows():
    # One tuple of three scores per row, in the order of column_pairs.
    row_scores = tuple(
        calculate_similarity(record[left], record[right])
        for left, right in column_pairs
    )
    similarities.append(row_scores)

# Transpose the row tuples into the three result columns.
df['similarity_1_2'], df['similarity_2_3'], df['similarity_3_4'] = zip(*similarities)

print(df[['word', 'similarity_1_2', 'similarity_2_3', 'similarity_3_4']])
