In [1]:
import heapq
from itertools import combinations

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Read the papers table; rows with no abstract get an empty string so that
# later string operations on the column never encounter NaN.
df = pd.read_csv('nlp_papers.csv')
df = df.assign(abstract=df['abstract'].fillna(''))

# Shared NLP resources used by the preprocessing step: a lemmatizer instance
# and the English stop-word list as a set for constant-time membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize a raw text into a space-separated string of content lemmas.

    Steps: lower-case and tokenize, keep only purely alphabetic tokens,
    remove stop words, lemmatize, then remove stop words once more.

    Stop words are filtered *before* lemmatization as well as after it.
    ``lemmatizer.lemmatize`` is called without a part-of-speech argument, and
    in that mode it can rewrite a stop word into a form that is no longer in
    ``stop_words`` (e.g. "was" -> "wa", "has" -> "ha"); filtering only after
    lemmatization lets such artifacts through into the training corpus.

    Args:
        text: The string to process.

    Returns:
        The surviving lemmas joined by single spaces (``''`` if none remain).
    """
    tokens = word_tokenize(text.lower())
    # Drop punctuation/numbers and stop words while tokens are still in their
    # surface form, so the stop-word lookup sees the words it was built for.
    content_tokens = [
        token for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    lemmas = [lemmatizer.lemmatize(token) for token in content_tokens]
    # Second pass: a lemma may itself be a stop word even if the original
    # surface form was not.
    lemmas = [lemma for lemma in lemmas if lemma not in stop_words]
    return ' '.join(lemmas)


# Store the cleaned abstract next to the original, then split each cleaned
# string back into a token list: one list of tokens per row.
df['processed_text'] = df['abstract'].map(preprocess_text)
sentences = [processed.split() for processed in df['processed_text']]

# Step 1: Train Word2Vec models with different vector sizes

In [3]:
vector_sizes = [50, 100, 150, 200, 250, 300]
window_size = 5

# Arguments that stay fixed across runs; only the embedding size varies.
w2v_kwargs = dict(sentences=sentences, window=window_size, min_count=5, workers=4)

# Train one model per embedding size, keep it in memory keyed by that size,
# and also write it to disk.
models = {}
for vector_size in vector_sizes:
    model = Word2Vec(vector_size=vector_size, **w2v_kwargs)
    models[vector_size] = model
    model.save(f'word2vec_vector_size_{vector_size}.model')

# Step 2: Define a function to calculate cosine similarity

In [4]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (any input accepted by ``np.dot`` and
            ``np.linalg.norm``).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector
        has zero norm the cosine is undefined; ``0.0`` is returned in that
        case instead of dividing by zero, which would produce NaN together
        with a RuntimeWarning.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Step 3: For each model, find the top 20 word pairs with the highest similarity

In [None]:
# Maps each vector size to its 20 most similar word pairs, stored as
# (word1, word2, similarity) tuples in descending order of similarity.
top_pairs_results = {}

for vector_size, model in models.items():
    vocabulary = model.wv.index_to_key

    # Score every unordered pair of vocabulary words lazily. The number of
    # pairs grows quadratically with the vocabulary, so neither the pairs nor
    # their scores are collected into a list.
    scored_pairs = (
        (word1, word2, cosine_similarity(model.wv[word1], model.wv[word2]))
        for word1, word2 in combinations(vocabulary, 2)
    )

    # heapq.nlargest returns the same result as
    # sorted(..., key=..., reverse=True)[:20] while holding only 20
    # candidates in memory at any time.
    top_pairs = heapq.nlargest(20, scored_pairs, key=lambda x: x[2])
    top_pairs_results[vector_size] = top_pairs

# Step 4: Display the results

In [None]:
# Print one section per trained model: a heading line naming the vector and
# window sizes, followed by one "word - word: score" line per stored pair.
for vector_size, top_pairs in top_pairs_results.items():
    heading = f"\nTop 20 word pairs with highest similarity for vector size {vector_size} and window size {window_size}:"
    print(heading)
    for entry in top_pairs:
        word1, word2, similarity = entry
        print(f"{word1} - {word2}: {similarity:.4f}")