In [19]:
import json
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import os

# Fetch the Punkt tokenizer data needed by word_tokenize
nltk.download('punkt')

# Path of the plain-text corpus; every line is handled as one document
data_file_path = "/content/AllCombined.txt"

# Read the whole corpus into a list of lines (trailing newlines are kept)
with open(data_file_path, mode="r", encoding="utf-8") as corpus_file:
    data = list(corpus_file)

# Wrap the raw lines in a single-column DataFrame
df = pd.DataFrame(data=data, columns=["reviewText"])

# Turn each line into a list of word tokens
reviewText = df["reviewText"].apply(word_tokenize)

# Hyperparameters for the embedding model. No corpus is handed to the
# constructor, so the vocabulary scan and the training are done explicitly
# in the two steps that follow.
w2v_params = {
    "vector_size": 100,
    "window": 10,
    "min_count": 2,
    "workers": 4,
    "sorted_vocab": False,
}
model = Word2Vec(**w2v_params)

# Scan the tokenized corpus to collect the vocabulary
model.build_vocab(reviewText)

# Train for 10 epochs over the same tokenized corpus
n_documents = len(reviewText)
model.train(reviewText, total_examples=n_documents, epochs=10)

# Directory in which the trained model is stored.
# NOTE(review): this looks like a Colab Google Drive path; presumably Drive
# has to be mounted before this runs -- confirm.
save_dir = "/content/drive/My Drive/"

# exist_ok=True creates the directory only when it is missing, replacing the
# separate "check, then create" sequence, which could race with another
# process creating the same directory in between.
os.makedirs(save_dir, exist_ok=True)

# Persist the trained model. The file is written by Word2Vec.save(), so it
# must be read back with Word2Vec.load() as done below.
model_path = os.path.join(save_dir, "word2vec_model.bin")
model.save(model_path)

# Reload the model from disk to confirm the saved file is usable
model_saved = Word2Vec.load(model_path)

# All lookups below go through the keyed vectors of the reloaded model
word_vectors = model_saved.wv

# Nearest neighbours of "sports" in the embedding space (top 10)
similar_words = word_vectors.most_similar("sports", topn=10)
print("Similar words to 'sports':", similar_words)

# Pairwise similarity between two vocabulary words
similarity_score = word_vectors.similarity("human", "computer")
print("Similarity score between 'human' and 'computer':", similarity_score)

similarity_score = word_vectors.similarity("machine", "computer")
print("Similarity score between 'machine' and 'computer':", similarity_score)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Similar words to 'sports': [('sport', 0.778705358505249), ('athletics', 0.7702298760414124), ('volleyball', 0.7701297402381897), ('competitions', 0.7684786915779114), ('cycling', 0.7619398832321167), ('clubs', 0.7479984164237976), ('soccer', 0.7381991744041443), ('stadiums', 0.7299372553825378), ('basketball', 0.7129060626029968), ('cricket', 0.7100526094436646)]
Similarity score between 'human' and 'computer': 0.3290643
Similarity score between 'machine' and 'computer': 0.71950233
