<a href="https://colab.research.google.com/github/Ziadnasser1/Deep-Learning-Training/blob/main/fasttext_yelpdatset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Yelp Dataset.

In [None]:
import pandas as pd
import json

# Yelp "tip" file: the loop below parses one JSON object per line.
path = "/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json"

# The encoding is stated explicitly so the read does not depend on the
# platform's default locale; blank lines are skipped instead of crashing
# json.loads().
with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

# DataFrame.info() prints its own summary and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908915 entries, 0 to 908914
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           908915 non-null  object
 1   business_id       908915 non-null  object
 2   text              908915 non-null  object
 3   date              908915 non-null  object
 4   compliment_count  908915 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 34.7+ MB
None


Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


# Extracting the text column to work on it.

In [None]:
# Keep a handle on the raw tip text column and preview its first rows.
dataset = df["text"]
dataset.head()

0                       Avengers time with the ladies.
1    They have lots of good deserts and tasty cuban...
2               It's open even when you think it isn't
3                            Very decent fried chicken
4               Appetizers.. platter special for lunch
Name: text, dtype: object

# Preprocessing the text to work with.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources requested by this notebook.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
# Kept as the final expression so the cell still displays the return value.
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [None]:
# Extract the WordNet corpus next to its archive.
# -n: never overwrite files that are already extracted, so a re-run does not
#     stop at an interactive "replace?" prompt that a notebook cannot answer.
# -q: quiet, to keep the per-file listing out of the notebook output.
!unzip -n -q /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [None]:
# Built once when this cell runs. Previously the stop-word list was re-read
# for every single token and a new lemmatizer was created for every document,
# which dominates the runtime when the function is applied to a large corpus.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def process_text(document):
    """Turn a raw text string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, collapse whitespace, replace non-word
    characters with spaces, delete digit runs, delete stand-alone single
    ASCII letters, tokenize with NLTK, drop English stop words, lemmatize.

    Args:
        document: The text to clean. A value that is not a ``str``
            (for example a missing value) yields an empty list.

    Returns:
        list[str]: The remaining tokens, lemmatized, in original order.
    """
    if not isinstance(document, str):
        return []

    document = document.lower()  # Convert text to lowercase
    document = re.sub(r'\s+', ' ', document)  # Collapse runs of whitespace
    document = re.sub(r'\W', ' ', document)  # Replace special characters with spaces
    document = re.sub(r'\d+', '', document)  # Remove numbers
    document = re.sub(r'\b[a-zA-Z]\b', '', document)  # Remove single characters

    tokens = word_tokenize(document)

    # Stop words are filtered on the surface form first, then the survivors
    # are lemmatized (same order as before).
    return [_LEMMATIZER.lemmatize(word) for word in tokens if word not in _STOP_WORDS]

In [None]:
# Clean every tip and store the resulting token lists next to the raw text.
df['cleaned_text'] = df['text'].map(process_text)
df.loc[:, ['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,Avengers time with the ladies.,"[avenger, time, lady]"
1,They have lots of good deserts and tasty cuban...,"[lot, good, desert, tasty, cuban, sandwich]"
2,It's open even when you think it isn't,"[open, even, think]"
3,Very decent fried chicken,"[decent, fried, chicken]"
4,Appetizers.. platter special for lunch,"[appetizer, platter, special, lunch]"


# Loading FastText model and training it

In [None]:
!pip install gensim
import gensim
from gensim.models import FastText



In [None]:
# Token lists produced by process_text(); used below as the training corpus.
tokenized_texts = df['cleaned_text']

In [None]:
# Preview the first five token lists.
tokenized_texts.head()

0                          [avenger, time, lady]
1    [lot, good, desert, tasty, cuban, sandwich]
2                            [open, even, think]
3                       [decent, fried, chicken]
4           [appetizer, platter, special, lunch]
Name: cleaned_text, dtype: object

In [None]:
!pip install tqdm
from tqdm import tqdm




In [None]:
from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec
from tqdm.notebook import tqdm  # For progress bar in Kaggle notebooks

# Number of passes over the corpus
epochs = 5


class _EpochProgress(CallbackAny2Vec):
    """Advance a tqdm bar by one step each time gensim finishes an epoch."""

    def __init__(self, progress_bar):
        self.progress_bar = progress_bar

    def on_epoch_end(self, model):
        self.progress_bar.update(1)
        self.progress_bar.set_postfix(epoch=self.progress_bar.n)


# Initialize the FastText model with the given parameters
fasttext_model = FastText(
    vector_size=300,
    window=5,
    sample=1e-2,
    min_count=5,
    workers=4,
    sg=1,
    epochs=epochs
)

# Build the vocabulary once
fasttext_model.build_vocab(corpus_iterable=tokenized_texts)

# Train in ONE call covering all epochs. Calling train(epochs=1) repeatedly
# restarts the learning-rate decay from its initial value on every call, so
# the rate never anneals across the run as a whole. Progress is reported
# through a callback instead of an outer Python loop.
epoch_progress = tqdm(total=epochs, desc="Epochs Progress", position=0, leave=True)
fasttext_model.train(
    corpus_iterable=tokenized_texts,  # This is the training corpus
    total_examples=len(tokenized_texts),
    epochs=epochs,
    callbacks=[_EpochProgress(epoch_progress)]
)
epoch_progress.close()

# Save the trained model
fasttext_model.save("/kaggle/working/fasttext_model")


Epochs Progress:   0%|          | 0/5 [00:00<?, ?it/s]

# Testing the model

In [None]:
test_word = "happy"

# Ten nearest neighbours of the test word according to the trained model
similar_words = fasttext_model.wv.similar_by_word(test_word, topn=10)

print(f"10 Similar Words to '{test_word}':")
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")

10 Similar Words to 'happy':
happyhour: 0.7068834900856018
yappy: 0.6964982748031616
happ: 0.6833152174949646
appy: 0.6797153353691101
tappy: 0.6184215545654297
nappy: 0.6050929427146912
happier: 0.597939133644104
unhappy: 0.5932546854019165
hap: 0.5879569053649902
snappy: 0.5799431800842285


In [None]:
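# Get the 10 "opposite" words (workaround)
# There is no direct antonym lookup, so the function below ranks every vocabulary word
# by its cosine similarity to the test word and prints the lowest-scoring ones.
# Note that a low similarity means "unrelated" rather than truly opposite in meaning.
# An earlier attempt using most_similar() with a negative example is kept below for reference: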
# opposite_words = fasttext_model.wv.most_similar(positive=[test_word], topn=10, negative=["bad"])
# print(f"\n10 Opposite Words to '{test_word}':")
# for word, similarity in opposite_words:
#     print(f"{word}: {similarity}")

def print_opposite_words(test_word, model=None, topn=10):
    """Print the vocabulary words least similar to ``test_word``.

    Every vocabulary word is scored by cosine similarity against
    ``test_word`` and the ``topn`` lowest scores are printed, lowest first.
    ``test_word`` itself is never listed.

    Args:
        test_word: The query word.
        model: Object exposing the word vectors as ``.wv``. Defaults to the
            notebook-level ``fasttext_model``.
        topn: How many words to print (default 10).
    """
    wv = (fasttext_model if model is None else model).wv

    # One vectorised pass over the whole embedding matrix replaces a Python
    # loop that called similarity() once per vocabulary word.
    scores = wv.cosine_similarities(wv[test_word], wv.vectors)

    print(f"\nTop {topn} opposite words:")
    shown = 0
    # A stable sort keeps vocabulary order among equal scores, as sorted() did.
    for idx in scores.argsort(kind="stable"):
        if shown >= topn:
            break
        word = wv.index_to_key[idx]
        if word == test_word:
            continue
        print(f"{word}: {scores[idx]:.4f}")
        shown += 1
# Run the helper for the current `test_word`.
print_opposite_words(test_word)


Top 10 opposite words:
amp: -0.0139
ode: 0.0155
link: 0.0219
proceed: 0.0243
channel: 0.0293
paper: 0.0307
utter: 0.0324
ink: 0.0333
kiosk: 0.0343
gh: 0.0348


In [None]:
from scipy.spatial.distance import cosine

# Analogy probe: the vector king + woman - man is compared with "queen".
embeddings = fasttext_model.wv
result = embeddings['king'] + embeddings['woman'] - embeddings['man']

# scipy's cosine() is a distance, so 1 - distance gives the similarity.
similarity = 1 - cosine(result, embeddings['queen'])
print(f"Similarity('King + Woman - Man', queen): {similarity:.4f}")

Similarity('King + Woman - Man', queen): 0.4360


# Testing the Pretrained Model.

In [None]:
# Download the pretrained English fastText archive.
# Skipped when the extracted model file is already present, so re-running the
# notebook does not fetch the multi-gigabyte archive again.
!test -f cc.en.300.bin || wget -O cc.en.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz


--2025-04-02 01:58:22--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.219.10, 13.227.219.59, 13.227.219.70, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.219.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-04-02 01:59:04 (103 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [None]:
# -f overwrites an existing cc.en.300.bin instead of asking for confirmation.
# A notebook cell cannot answer that prompt, so without -f the cell stalls
# until it is interrupted.
!gunzip -f cc.en.300.bin.gz


gzip: cc.en.300.bin already exists; do you wish to overwrite (y or n)? ^C


In [None]:
from gensim.models.fasttext import load_facebook_model

# Load the pretrained FastText model in Facebook's format.
# The path is relative: the file is expected in the current working directory.
pretrained_fasttext_path = "cc.en.300.bin"
pretrained_model = load_facebook_model(pretrained_fasttext_path)

print("Pretrained FastText model loaded successfully!")


Pretrained FastText model loaded successfully!


In [None]:
test_word = "happy"

# Ten nearest neighbours of the test word according to the pretrained model
similar_words = pretrained_model.wv.most_similar(test_word, topn=10)

print(f"\n10 Similar Words to '{test_word}':")
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")


10 Similar Words to 'happy':
happpy: 0.7490981817245483
hapy: 0.7035642266273499
happier: 0.6981650590896606
pleased: 0.697956919670105
glad: 0.6735867261886597
satisfied: 0.6564764380455017
super-happy: 0.6552473902702332
unhappy: 0.6515626311302185
thrilled: 0.650614857673645
overjoyed: 0.6238875985145569


In [None]:
def print_opposite_words(test_word, model=None, topn=10):
    """Print the vocabulary words least similar to ``test_word``.

    Every vocabulary word is scored by cosine similarity against
    ``test_word`` and the ``topn`` lowest scores are printed, lowest first.
    ``test_word`` itself is never listed.

    Args:
        test_word: The query word.
        model: Object exposing the word vectors as ``.wv``. Defaults to the
            notebook-level ``pretrained_model``.
        topn: How many words to print (default 10).
    """
    wv = (pretrained_model if model is None else model).wv

    # One vectorised pass over the whole embedding matrix replaces a Python
    # loop that called similarity() once per vocabulary word.
    scores = wv.cosine_similarities(wv[test_word], wv.vectors)

    print(f"\n{topn} Opposite Words to '{test_word}':")
    shown = 0
    # A stable sort keeps vocabulary order among equal scores, as sorted() did.
    for idx in scores.argsort(kind="stable"):
        if shown >= topn:
            break
        word = wv.index_to_key[idx]
        if word == test_word:
            continue
        print(f"{word}: {scores[idx]:.4f}")
        shown += 1

# Run the helper for the current `test_word`.
print_opposite_words(test_word)



10 Opposite Words to 'happy':
M.e.: -0.2306
alexanderrados: -0.2287
Lithologic: -0.2222
area-of-interest: -0.2164
RFG29PHDWP: -0.2154
CyberWar: -0.2146
DefenseC: -0.2064
Classifications: -0.2059
Sheathing: -0.2055
MoreUse: -0.2038


In [None]:
from scipy.spatial.distance import cosine

# Analogy probe: the vector king + woman - man is compared with "queen".
embeddings = pretrained_model.wv
result = embeddings['king'] + embeddings['woman'] - embeddings['man']

# scipy's cosine() is a distance, so 1 - distance gives the similarity.
similarity = 1 - cosine(result, embeddings['queen'])
print(f"Similarity('King + Woman - Man', queen): {similarity:.4f}")

Similarity('King + Woman - Man', queen): 0.6543


# Updating Pretrained model with Yelp Dataset.

In [None]:
# Extend the pretrained model's vocabulary with the tokens from the tips.
pretrained_model.build_vocab(corpus_iterable=tokenized_texts, update=True)
print("Vocabulary updated with Yelp_tip data!")

Vocabulary updated with Yelp_tip data!


In [None]:
from gensim.models.callbacks import CallbackAny2Vec
from tqdm.notebook import tqdm

# Define the number of epochs
epochs = 5


class _EpochProgress(CallbackAny2Vec):
    """Advance a tqdm bar by one step each time gensim finishes an epoch."""

    def __init__(self, progress_bar):
        self.progress_bar = progress_bar

    def on_epoch_end(self, model):
        self.progress_bar.update(1)


# Fine-tune in ONE call covering all epochs. Calling train(epochs=1) in a loop
# restarts the learning-rate decay from its initial value on every call, so
# the rate never anneals across the run as a whole. Progress is reported
# through a callback instead of an outer Python loop.
epoch_progress = tqdm(total=epochs, desc="Fine-tuning Progress", position=0, leave=True)
pretrained_model.train(
    tokenized_texts,
    total_examples=len(tokenized_texts),
    epochs=epochs,
    callbacks=[_EpochProgress(epoch_progress)]
)
epoch_progress.close()

Fine-tuning Progress:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
test_word = "happy"

# Ten nearest neighbours of the test word after fine-tuning
similar_words = pretrained_model.wv.most_similar(test_word, topn=10)

print(f"\n10 Similar Words to '{test_word}':")
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")



10 Similar Words to 'happy':
happy-happy: 0.9950555562973022
happy.Also: 0.9911419749259949
happys: 0.990406334400177
happy.A: 0.9901679158210754
happy.At: 0.9900670647621155
happy--: 0.9899594187736511
happy-ish: 0.9899420142173767
happy.So: 0.9893172979354858
happy.Now: 0.9892703890800476
happy.As: 0.9891560673713684


In [None]:

def print_opposite_words(test_word, model=None, topn=10):
    """Print the vocabulary words least similar to ``test_word``.

    Every vocabulary word is scored by cosine similarity against
    ``test_word`` and the ``topn`` lowest scores are printed, lowest first.
    ``test_word`` itself is never listed.

    Args:
        test_word: The query word.
        model: Object exposing the word vectors as ``.wv``. Defaults to the
            notebook-level ``pretrained_model``.
        topn: How many words to print (default 10).
    """
    wv = (pretrained_model if model is None else model).wv

    # One vectorised pass over the whole embedding matrix replaces a Python
    # loop that called similarity() once per vocabulary word.
    scores = wv.cosine_similarities(wv[test_word], wv.vectors)

    print(f"\n{topn} Opposite Words to '{test_word}':")
    shown = 0
    # A stable sort keeps vocabulary order among equal scores, as sorted() did.
    for idx in scores.argsort(kind="stable"):
        if shown >= topn:
            break
        word = wv.index_to_key[idx]
        if word == test_word:
            continue
        print(f"{word}: {scores[idx]:.4f}")
        shown += 1

# Run the helper for the current `test_word`.
print_opposite_words(test_word)


10 Opposite Words to 'happy':
EHAC: -0.1376
.......................................................................................................................................: -0.1248
.4.4: -0.1244
TitleTop: -0.1242
.........................................................................................................................................: -0.1241
..............................................................................................................................................: -0.1238
..........................................................................................................................................: -0.1234
...........................................................................................................................................: -0.1230
.....................................................................................................................................: -0.1226
.......................................