In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from gensim.models.fasttext import FastText

# Download the WordNet corpora used by nltk.WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# The dataset lives on Google Drive, so the drive has to be mounted before the
# read below can succeed on a fresh runtime. Mounting an already-mounted drive
# only prints a notice, so running this more than once is harmless.
from google.colab import drive
drive.mount('/content/drive')

# Extracting text data from the Yelp dataset (lines=True: one JSON record per line)
yelp_datafile = pd.read_json("/content/drive/MyDrive/yelp_academic_dataset_tip.json", lines=True)
print('List of all columns')
print(list(yelp_datafile))

# Subset data for the gensim fastText model
all_sentences = list(yelp_datafile['text'])  # select "text" column only
part_of_sentences = all_sentences[0:1000]  # first 1000 entries (fewer if the file is shorter)

# Lemmatizer object shared by the text-cleaning function
lemmatizer = nltk.WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


List of all columns
['user_id', 'business_id', 'text', 'date', 'compliment_count']


In [None]:
def process_text(review):
    """Clean one review and return its lemmatized tokens.

    Steps, in order: coerce to ``str``, collapse whitespace, replace
    non-word characters with spaces, drop isolated single letters, strip
    anything that is not a letter or whitespace, lowercase, split on
    whitespace, lemmatize each token with the module-level ``lemmatizer``,
    and finally keep only lemmas longer than 3 characters.

    Args:
        review: The raw review text. Non-string values are converted with
            ``str()`` before any regex is applied.

    Returns:
        list[str]: The cleaned, lemmatized tokens (possibly empty).
    """
    # Coerce first so that every regex below receives a string
    review = str(review)
    review = re.sub(r'\s+', ' ', review)  # Collapse runs of whitespace
    review = re.sub(r'\W', ' ', review)  # Replace special characters with spaces
    review = re.sub(r'\s+[a-zA-Z]\s+', ' ', review)  # Remove isolated single letters
    review = re.sub(r'[^a-zA-Z\s]', '', review)  # Remove anything that isn't alphabetical
    review = review.lower()  # Converting to lowercase
    # Word tokenization
    tokens = review.split()
    # Applying lemmatization
    lemma_txt = [lemmatizer.lemmatize(word) for word in tokens]
    # Keep only words longer than 3 characters (this also drops 3-letter
    # words such as "the"); filter the lemmas so lemmatization takes effect
    return [word for word in lemma_txt if len(word) > 3]

# Number of sentences shown in the before/after previews
PREVIEW_COUNT = 10

# Print the first sentences before processing; slicing (rather than indexing
# 0..9) keeps this working when fewer than PREVIEW_COUNT sentences exist
print("Sentences Before Processing:")
for position, raw_sentence in enumerate(part_of_sentences[:PREVIEW_COUNT], start=1):
    print(f"Sentence {position}: {raw_sentence}")

# Process the sentences
cleaned_reviews = [process_text(review) for review in part_of_sentences]

# Print the same sentences after processing
print("\nSentences After Processing:")
for position, cleaned_tokens in enumerate(cleaned_reviews[:PREVIEW_COUNT], start=1):
    print(f"Sentence {position}: {' '.join(cleaned_tokens)}")


Sentences Before Processing:
Sentence 1: Avengers time with the ladies.
Sentence 2: They have lots of good deserts and tasty cuban sandwiches
Sentence 3: It's open even when you think it isn't
Sentence 4: Very decent fried chicken
Sentence 5: Appetizers.. platter special for lunch
Sentence 6: Chili Cup + Single Cheeseburger with onion, pickle, and relish + Vanilla Coca-Cola...so far.
Sentence 7: Saturday, Dec 7th 2013, ride Patco's Silver Sleigh w/ Santa & his elves on a decorated train into Center City. Trains leave from Lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. Great for kids!
Sentence 8: This is probably the best place in the cool Springs area to watch a game and eat
Sentence 9: Tacos
Sentence 10: Starbucks substitute in boring downtown Tampa. Ugh. Never again!

Sentences After Processing:
Sentence 1: avengers time with ladies
Sentence 2: they have lots good deserts tasty cuban sandwiches
Sentence 3: open even when think
Sentence 4: very decent fried chicken
Senten

In [None]:
# Function to train FastText model with custom parameters
def train_Fasttext(sentences, embedding_size, window_size, min_word, down_sampling,
                   Save_model_filename, workers=4, sg=1, epochs=100):
    """Train a gensim FastText model, save it to disk and return it.

    Args:
        sentences: Training corpus, passed to ``FastText`` as its first argument.
        embedding_size: Passed to ``FastText`` as ``vector_size``.
        window_size: Passed to ``FastText`` as ``window``.
        min_word: Passed to ``FastText`` as ``min_count``.
        down_sampling: Passed to ``FastText`` as ``sample``.
        Save_model_filename: Path handed to the model's ``save`` method.
        workers: Passed to ``FastText`` as ``workers`` (default 4).
        sg: Passed to ``FastText`` as ``sg`` (default 1).
        epochs: Passed to ``FastText`` as ``epochs`` (default 100).

    Returns:
        The trained model, so callers can use it without reloading from disk.
    """
    fast_Text_model = FastText(
        sentences,
        vector_size=embedding_size,
        window=window_size,
        min_count=min_word,
        sample=down_sampling,
        workers=workers,
        sg=sg,
        epochs=epochs,
    )

    fast_Text_model.save(Save_model_filename)
    return fast_Text_model

In [None]:
# Mount Google Drive at /content/drive. The drive must be mounted before any
# /content/drive path is read (the data-loading cell reads its JSON from there).
# Calling mount again when the drive is already mounted only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hyperparameters chosen for the custom FastText run
embedding_size = 300   # forwarded as vector_size
window_size = 5        # forwarded as window
min_word = 5           # forwarded as min_count
down_sampling = 1e-2   # forwarded as sample

# Train on the cleaned reviews and write the model to "Custom_FastText"
train_Fasttext(
    sentences=cleaned_reviews,
    embedding_size=embedding_size,
    window_size=window_size,
    min_word=min_word,
    down_sampling=down_sampling,
    Save_model_filename="Custom_FastText",
)

# Read the freshly saved gensim fastText model back from disk
fast_Text_model = FastText.load("Custom_FastText")

In [None]:
from gensim.models import Word2Vec

# Load the saved gensim FastText model. Use the FastText class that wrote the
# file, and the same relative path it was saved under, so the load does not
# depend on the working directory being /content.
fast_Text_model = FastText.load("Custom_FastText")

In [None]:
from gensim.models.fasttext import load_facebook_model
from gensim.models.fasttext import load_facebook_vectors

import gensim.downloader

# Load the pre-trained 'fasttext-wiki-news-subwords-300' embeddings through the
# gensim downloader (fetched over the network if not already cached — TODO confirm).
# NOTE(review): the comparison cell calls pretrained_model.most_similar(...)
# directly, without `.wv`, so the returned object is treated as a word-vector
# lookup rather than a trainable FastText model — confirm against the gensim docs.
pretrained_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
from tabulate import tabulate

TOP_N = 10        # neighbours listed in each table
WORD_STRIDE = 10  # analyse every 10th vocabulary word; adjust for more or less output
NOT_FOUND_ROWS = [("Word not found in vocabulary", 0.0)]


def _neighbours(vectors, word, topn=TOP_N):
    """Return ``(similar, opposite)`` lists of ``(word, score)`` pairs for ``word``.

    ``similar`` comes from ``vectors.most_similar(word)`` and ``opposite`` from
    ``vectors.most_similar(negative=[word])``. If either lookup raises
    ``KeyError``, both results fall back to a single placeholder row.
    """
    try:
        similar = vectors.most_similar(word, topn=topn)
        opposite = vectors.most_similar(negative=[word], topn=topn)
    except KeyError:
        return list(NOT_FOUND_ROWS), list(NOT_FOUND_ROWS)
    return similar, opposite


def _as_table(rows, word_header):
    """Render ``(word, score)`` rows as a GitHub-style text table."""
    return tabulate(rows, headers=[word_header, 'Similarity'], tablefmt='github')


words = list(fast_Text_model.wv.key_to_index)  # Collect words from the model's vocabulary

for word in words[::WORD_STRIDE]:
    print(f"Analyzing word: {word}\n")

    # Top similar and dissimilar words for the custom and the pretrained model
    similar_words_custom, opposite_words_custom = _neighbours(fast_Text_model.wv, word)
    similar_words_pretrained, opposite_words_pretrained = _neighbours(pretrained_model, word)

    # Printing the tables
    print(f"Top {TOP_N} similar words (custom model):")
    print(_as_table(similar_words_custom, 'Similar Word'))
    print(f"\nTop {TOP_N} opposite words (custom model):")
    print(_as_table(opposite_words_custom, 'Opposite Word'))
    print(f"\nTop {TOP_N} similar words (pre-trained model):")
    print(_as_table(similar_words_pretrained, 'Similar Word'))
    print(f"\nTop {TOP_N} opposite words (pre-trained model):")
    print(_as_table(opposite_words_pretrained, 'Opposite Word'))
    print("\n" + "-"*40 + "\n")  # Separator for readability

Analyzing word: great

Top 10 similar words (custom model):
| Similar Word   |   Similarity |
|----------------|--------------|
| highly         |     0.460687 |
| large          |     0.460267 |
| local          |     0.460056 |
| start          |     0.434377 |
| choice         |     0.424562 |
| server         |     0.410815 |
| fantastic      |     0.410757 |
| prices         |     0.407296 |
| pickles        |     0.406203 |
| cuban          |     0.404121 |

Top 10 opposite words (custom model):
| Opposite Word   |   Similarity |
|-----------------|--------------|
| check           |   0.0904044  |
| store           |   0.022808   |
| business        |   0.00651356 |
| should          |  -0.00163129 |
| work            |  -0.0112569  |
| would           |  -0.0160653  |
| just            |  -0.0202616  |
| yelp            |  -0.0249961  |
| looking         |  -0.0292841  |
| friday          |  -0.03289    |

Top 10 similar words (pre-trained model):
| Similar Word   |   Similarit