In [8]:
! pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.2.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.2.0


In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from gensim.models.fasttext import FastText 
from gensim.models import Word2Vec 
from gensim.models.fasttext import load_facebook_model
from tabulate import tabulate
import random
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

In [10]:
# load the pre-trained model 
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
! gunzip "cc.en.300.bin.gz"

--2024-04-19 16:52:12--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.128, 18.164.78.121, 18.164.78.72, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: 'cc.en.300.bin.gz'


2024-04-19 16:52:29 (259 MB/s) - 'cc.en.300.bin.gz' saved [4503593528/4503593528]



In [11]:
nltk.download('wordnet', "/kaggle/working/nltk_data/")
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora

nltk.data.path.append("/kaggle/working/nltk_data/")

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data/...
Archive:  /kaggle/working/nltk_data/corpora/wordnet.zip
   creating: /kaggle/working/nltk_data/corpora/wordnet/
  inflating: /kaggle/working/nltk_data/corpora/wordnet/lexnames  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.adv  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.adj  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.adj  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/nltk_data/c

In [12]:
en_stop = set(stopwords.words('english'))

In [57]:
# Load the Yelp dataset 
yelp_datafile = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json", lines=True)

print('List of all columns')
print(list(yelp_datafile))

all_sentences = list(yelp_datafile['text']) 
part_of_sentences = all_sentences[0:700] 

print("\nSample of Sentences:")
for sentence in part_of_sentences[:10]:
    print(sentence)


List of all columns
['user_id', 'business_id', 'text', 'date', 'compliment_count']

Sample of Sentences:
Avengers time with the ladies.
They have lots of good deserts and tasty cuban sandwiches
It's open even when you think it isn't
Very decent fried chicken
Appetizers.. platter special for lunch
Chili Cup + Single Cheeseburger with onion, pickle, and relish + Vanilla Coca-Cola...so far.
Saturday, Dec 7th 2013, ride Patco's Silver Sleigh w/ Santa & his elves on a decorated train into Center City. Trains leave from Lindenwold at 10am, 11:15am, & 12:30pm, and make all stops. Great for kids!
This is probably the best place in the cool Springs area to watch a game and eat
Tacos
Starbucks substitute in boring downtown Tampa. Ugh. Never again!


In [58]:
def process_text(review):
    review = re.sub(r'\s+', ' ', review, flags=re.I) 
    review = re.sub(r'\W', ' ', str(review)) 
    cleaned_doc = re.sub(r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', '', review)
    review = re.sub(r'\s+[a-zA-Z]\s+', ' ', review) 
    review = re.sub(r'[^a-zA-Z\s]', '', review) 
    review = review.lower() 
    tokens = review.split()
   
    lemmatizer = WordNetLemmatizer()
    lemma_txt = [lemmatizer.lemmatize(word) for word in tokens]
    

    lemma_no_stop_txt = [word for word in lemma_txt if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]
    
    uniqes_words = list(set(tokens))
    uniqes_words.sort()

    return uniqes_words

In [59]:
cleaned_reviews = [ process_text(review) for review in part_of_sentences]

In [60]:
print(cleaned_reviews[:10])

[['avengers', 'ladies', 'time', 'with'], ['cuban', 'deserts', 'good', 'have', 'lots', 'sandwiches', 'tasty', 'they'], ['even', 'open', 'think', 'when'], ['chicken', 'decent', 'fried', 'very'], ['appetizers', 'lunch', 'platter', 'special'], ['cheeseburger', 'chili', 'coca', 'cola', 'onion', 'pickle', 'relish', 'single', 'vanilla', 'with'], ['center', 'city', 'decorated', 'elves', 'from', 'great', 'into', 'kids', 'leave', 'lindenwold', 'make', 'patco', 'ride', 'santa', 'saturday', 'silver', 'sleigh', 'stops', 'train', 'trains'], ['area', 'best', 'cool', 'game', 'place', 'probably', 'springs', 'this', 'watch'], ['tacos'], ['again', 'boring', 'downtown', 'never', 'starbucks', 'substitute', 'tampa']]


In [61]:
fast_Text_model = FastText(sentences=cleaned_reviews,
vector_size=300, 
window=3,
min_count=1, 
sample=1e-2, 
workers = 4, 
sg=1, 
epochs=300) 
fast_Text_model.save("Custom_FastText")

In [62]:
# Load gensim model
fast_Text_model = Word2Vec.load("/kaggle/working/Custom_FastText") 

In [74]:
def find_top_n(word, words, model):
    if word not in model.wv:
        print(f"The word '{word}' is not in the model vocabulary.")

    current_word_embedding = model.wv[word]
    similarity_results = []
 
    for other_word in words:
        if other_word != word and other_word in model.wv:
            other_word_embedding = model.wv[other_word]
            similarity = cosine_similarity(current_word_embedding, other_word_embedding)
            similarity_results.append((other_word, similarity))

    similarity_results.sort(key=lambda x: x[1], reverse=True)

    return similarity_results [:10],similarity_results [-10:]

In [83]:
words = list(fast_Text_model.wv.key_to_index)  
print(words)



In [93]:
pretrained_fastText_en = load_facebook_model('/kaggle/working/cc.en.300.bin')

In [100]:
for i in range (3):
    current_word=random.choice(words)
    words.remove(current_word)
    top_words_custom,down_words_custom=find_top_n(current_word, words, fast_Text_model)
    top_words_pretrain,down_words_pretrain=find_top_n(current_word, words, pretrained_fastText_en)
    print(f"The current word is: {current_word}")
    print(f"\nTop 10 similar words (custom model):")
    for w, sim in top_words_custom:
        print(f"{w}: {sim:.2f}")
    print("\nTop 10 dissimilar words (custom model):")
    for w, sim in down_words_custom:
        print(f"{w}: {sim:.2f}")
    print(f"\nTop 10 similar words (pretrained model):")
    for w, sim in top_words_pretrain:
        print(f"{w}: {sim:.2f}")
    print("\nTop 10 dissimilar words ({pretrained model}):")
    for w, sim in down_words_pretrain:
        print(f"{w}: {sim:.2f}")   
    print("\n" + "-" * 40 + "\n")

The current word is: flavor

Top 10 similar words (custom model):
flavorful: 0.90
flavors: 0.90
favor: 0.81
behind: 0.78
cute: 0.76
garden: 0.75
district: 0.74
deff: 0.72
pineapple: 0.72
alternative: 0.72

Top 10 dissimilar words (custom model):
small: 0.17
such: 0.17
what: 0.16
unless: 0.16
waiter: 0.16
vodka: 0.16
wine: 0.13
sample: 0.13
your: 0.13
with: 0.11

Top 10 similar words (pretrained model):
flavors: 0.81
flavorful: 0.66
taste: 0.64
spice: 0.57
texture: 0.55
deliciousness: 0.54
tastes: 0.50
tastier: 0.45
tasty: 0.45
salty: 0.45

Top 10 dissimilar words ({pretrained model}):
sarah: -0.05
sent: -0.05
moved: -0.05
amazingingly: -0.05
lindenwold: -0.06
australia: -0.06
juniors: -0.06
closed: -0.06
carl: -0.07
totalled: -0.11

----------------------------------------

The current word is: domestic

Top 10 similar words (custom model):
actual: 0.77
docks: 0.76
nail: 0.76
bottle: 0.75
favorite: 0.73
birthday: 0.73
professionalism: 0.73
sunset: 0.73
cozy: 0.70
matcha: 0.70

Top 10 d

In [101]:
import random
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

def save_to_pdf(data, filename):
    doc = SimpleDocTemplate(filename, pagesize=letter)
    Story = []
    styles = getSampleStyleSheet()
    
    for line in data:
        p = Paragraph(line, styles["Normal"])
        Story.append(p)
        Story.append(Spacer(1, 12))
        
    doc.build(Story)
    print(f"Output saved to {filename}")

custom_model_output = []
pretrained_model_output = []

for i in range(3):
    current_word = random.choice(words)
    words.remove(current_word)

    top_words_custom, down_words_custom = find_top_n(current_word, words, fast_Text_model)
    top_words_pretrain, down_words_pretrain = find_top_n(current_word, words, pretrained_fastText_en)

    custom_model_output.append(f"The current word is: {current_word}\n")
    custom_model_output.append("Top 10 similar words (custom model):")
    for w, sim in top_words_custom:
        custom_model_output.append(f"{w}: {sim:.2f}")
    custom_model_output.append("\nTop 10 dissimilar words (custom model):")
    for w, sim in down_words_custom:
        custom_model_output.append(f"{w}: {sim:.2f}")

    pretrained_model_output.append(f"The current word is: {current_word}\n")
    pretrained_model_output.append("Top 10 similar words (pretrained model):")
    for w, sim in top_words_pretrain:
        pretrained_model_output.append(f"{w}: {sim:.2f}")
    pretrained_model_output.append("\nTop 10 dissimilar words (pretrained model):")
    for w, sim in down_words_pretrain:
        pretrained_model_output.append(f"{w}: {sim:.2f}")

    custom_model_output.append("\n" + "-" * 40 + "\n")
    pretrained_model_output.append("\n" + "-" * 40 + "\n")

save_to_pdf(custom_model_output, "custom_model_results.pdf")
save_to_pdf(pretrained_model_output, "pretrained_model_results.pdf")


Output saved to custom_model_results.pdf
Output saved to pretrained_model_results.pdf
