In [17]:
import pandas as pd
import numpy as np
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import gensim.downloader as api

nltk.download('stopwords')
nltk.download('wordnet')

# Initialize WordNet lemmatizer and English stopwords
stemmer = WordNetLemmatizer()
en_stop = set(stopwords.words('english'))

# Read the Training Data
tips_df = pd.read_json('/content/drive/MyDrive/yelp/yelp_academic_dataset_tip.json', lines=True)
text_data = tips_df['text']

# Select a Subset of Words from the Training Data
subset_data = text_data.sample(n=1000)

# Preprocessing step
def preprocess_text(document):
    document = re.sub(r'\s+', ' ', document, flags=re.I) # Remove extra whitespaces
    document = re.sub(r'\W', ' ', str(document)) # Remove non-alphanumeric characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document) # Remove single characters
    document = document.lower()
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]  # Filter out short words
    return tokens

# Preprocess the Text Data
corpus = [preprocess_text(text) for text in subset_data]

# function to Trains a FastText model on the given corpus
def train_fasttext_model(corpus, vector_size=100, window=5, min_count=1, workers=4):
    model = FastText(sentences=corpus, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

# Train FastText Model
model = train_fasttext_model(corpus)

# Choose 20 Random Words from the Training Data
random_words = np.random.choice(np.concatenate(corpus), size=20, replace=False)

# Get Embeddings for Random Words
embeddings = [model.wv[word] for word in random_words if word in model.wv]

# function to Finds the nearest and farthest words to the embeddings in the trained FastText model
def get_nearest_and_farthest_words(model, embeddings, n=10):
    nearest_words = []
    farthest_words = []

    for embedding in embeddings:
        similarities = cosine_similarity([embedding], model.wv.vectors)
        nearest_indices = np.argsort(similarities[0])[-(n+1):-1]  # Exclude the word itself
        farthest_indices = np.argsort(similarities[0])[:n]

        nearest_words.append([model.wv.index_to_key[idx] for idx in nearest_indices])
        farthest_words.append([model.wv.index_to_key[idx] for idx in farthest_indices])

    return nearest_words, farthest_words

# Find Nearest and Farthest Words
nearest_words, farthest_words = get_nearest_and_farthest_words(model, embeddings)

# Load Pretrained FastText Model
pretrained_model = api.load('fasttext-wiki-news-subwords-300')

# function to Finds the most similar and dissimilar words to the random words in the pretrained FastText model.
def get_pretrained_nearest_and_farthest_words(pretrained_model, random_words, n=10):
    pretrained_nearest_words = []
    pretrained_farthest_words = []

    for word in random_words:
        if word in pretrained_model:  # Check if word is in vocabulary
            nearest = pretrained_model.most_similar(word, topn=n)
            farthest = pretrained_model.most_similar(negative=[word], topn=n)
            pretrained_nearest_words.append([w[0] for w in nearest])
            pretrained_farthest_words.append([w[0] for w in farthest])
        else:
            pretrained_nearest_words.append([])
            pretrained_farthest_words.append([])

    return pretrained_nearest_words, pretrained_farthest_words

# Get most similar and dissimilar words for each random word
pretrained_nearest_words, pretrained_farthest_words = get_pretrained_nearest_and_farthest_words(pretrained_model, random_words)

# Print the Results
for idx, word in enumerate(random_words):
    print(f"Random Word: {word}")
    print("Trained Model - Nearest Words:", nearest_words[idx])
    print("Trained Model - Farthest Words:", farthest_words[idx])
    print("Pretrained Model - Most Similar Words:", pretrained_nearest_words[idx])
    print("Pretrained Model - Most Dissimilar Words:", pretrained_farthest_words[idx])
    print("\n")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Random Word: scratch
Trained Model - Nearest Words: ['tapping', 'staying', 'hopping', 'redding', 'waiting', 'passing', 'free', 'pudding', 'disgusting', 'blocking']
Trained Model - Farthest Words: ['idaho', 'issue', 'zodiac', 'olga', 'http', 'ravs', '5llmmaa', '45pm', 'angus', 'wifi']
Pretrained Model - Most Similar Words: ['scratching', 'scratched', 'scratcher', 'scratch.', 'scratches', 'scratchin', 'from-scratch', 'scratchiness', 'scratchers', 'scratch-free']
Pretrained Model - Most Dissimilar Words: ['ಹೊಸ', 'قوی', 'تصل', 'சர', '၌', 'قرب', 'रष', 'हू', 'യത', 'கன']


Random Word: said
Trained Model - Nearest Words: ['anyone', 'bothered', 'skating', 'situation', 'eater', 'leftover', 'hairdresser', 'young', 'manager', 'salmon']
Trained Model - Farthest Words: ['2013', 'iozzo', 'kabob', '2017', 'http', 'nudie', 'asset', 'text', 'html', '2018']
Pretrained Model - Most Similar Words: ['says', 'told', '-said', 'said--and', 'said.One', 'remarked', 'noted', 'argued', 'said.But', 'wrote']
Pretra