In [5]:
import json
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import os

# Location of the raw text corpus; every line of the file is treated as one
# document.
data_file_path = "/content/AllCombined.txt"

# Read the whole corpus into memory as a list of lines (line terminators are
# kept, exactly as they appear in the file).
with open(data_file_path, mode="r", encoding="utf-8") as f:
    data = list(f)

# Wrap the lines in a single-column DataFrame so each one can be tokenized
# with an element-wise pandas operation.
df = pd.DataFrame({"reviewText": data})

# gensim's simple_preprocess lowercases a line and splits it into a list of
# tokens; the result is a Series holding one token list per input line.
reviewText = df["reviewText"].apply(gensim.utils.simple_preprocess)

# Word2Vec hyper-parameters, gathered in one place so they are easy to review
# and tweak.
w2v_params = {
    "vector_size": 100,      # dimensionality of each word vector
    "window": 10,            # context window size around the target word
    "min_count": 2,          # ignore words that occur fewer than 2 times
    "workers": 4,            # number of training threads
    "sorted_vocab": False,   # do not sort the vocabulary by frequency
}

# Build the vocabulary from the tokenized lines and train the embeddings.
model = Word2Vec(sentences=reviewText, **w2v_params)

# Directory in which the trained model is stored (presumably a Google Drive
# mount inside Colab -- confirm the drive is mounted before running).
save_dir = "/content/drive/My Drive/"
# exist_ok=True creates the directory only when it is missing, without the
# race-prone "check, then create" sequence.
os.makedirs(save_dir, exist_ok=True)

# Persist the full model so it can be reloaded later with Word2Vec.load().
# NOTE: despite the ".bin" suffix, model.save() writes gensim's own native
# format, not the original word2vec C binary format.
model_path = os.path.join(save_dir, "word2vec_model.bin")
model.save(model_path)

# Reload the model from disk to confirm that the saved file is usable.
model_saved = Word2Vec.load(model_path)

# Nearest neighbours of a single word in the embedding space.
similar_words = model_saved.wv.most_similar("sports")
print("Similar words to 'sports':", similar_words)

# Similarity for a few word pairs. Each entry carries the two words to
# compare and the label printed in front of the resulting score.
word_pairs = (
    ("human", "computer", "Similarity score between 'human' and 'computer':"),
    ("machine", "computer", "Similarity score between 'machine' and 'computer':"),
)
for first_word, second_word, label in word_pairs:
    similarity_score = model_saved.wv.similarity(w1=first_word, w2=second_word)
    print(label, similarity_score)



Similar words to 'sports': [('sport', 0.8117619156837463), ('sporting', 0.7387698292732239), ('competitions', 0.7132552266120911), ('cycling', 0.6727641224861145), ('athletic', 0.6577788591384888), ('golf', 0.6553894281387329), ('espn', 0.6523968577384949), ('soccer', 0.6372227072715759), ('clubs', 0.6343850493431091), ('tournaments', 0.62979656457901)]
Similarity score between 'human' and 'computer': 0.26603863
Similarity score between 'machine' and 'computer': 0.643731
