In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import gensim
from gensim.models import FastText
from gensim.models import Word2Vec
import json
import random

In [2]:
# Fetch the NLTK resources the preprocessing step relies on: the punkt
# tokenizer models, the stop-word lists and the WordNet database.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:


# Load the Yelp tips dump, which is stored as JSON Lines (one JSON object per line).
# The encoding is passed explicitly: without it open() falls back to the platform
# default (often cp1252 on Windows), which can fail on or garble non-ASCII tip text.
with open('C:\\Users\\ali\\OneDrive\\Desktop\\yelp_academic_dataset_tip.json', 'r', encoding='utf-8') as f:
    # Whitespace-only lines are skipped because json.loads() raises
    # JSONDecodeError on them.
    tips = [json.loads(line) for line in f if line.strip()]

# Convert the list of dictionaries to a pandas DataFrame (one row per tip)
df = pd.DataFrame(tips)
# Save the DataFrame as a CSV file so later cells can reload it without
# re-parsing the JSON; index=False keeps the row index out of the file.
df.to_csv('yelp_tips.csv', index=False)

In [3]:
# Reload the tips from the CSV file written by the JSON conversion cell.
tips_df = pd.read_csv('yelp_tips.csv')

# Only the free-text 'text' column is needed to build the corpus.
corpus = tips_df.loc[:, 'text']
print(corpus)

0                            Avengers time with the ladies.
1         They have lots of good deserts and tasty cuban...
2                    It's open even when you think it isn't
3                                 Very decent fried chicken
4                    Appetizers.. platter special for lunch
                                ...                        
908910                Disappointed in one of your managers.
908911                              Great food and service.
908912                                  Love their Cubans!!
908913                              Great pizza great price
908914                    Food is good value but a bit hot!
Name: text, Length: 908915, dtype: object


In [4]:
# Define functions for preprocessing
# Shared, lazily built resources for preprocess_text (filled on first use).
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop_words, lemmatizer) pair used by preprocess_text.

    Both objects are created on the first call and reused afterwards, so they
    are built once instead of once per document.
    """
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, strip punctuation, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize each remaining token
    with ``WordNetLemmatizer`` (default part of speech).

    Parameters
    ----------
    text : str
        The raw document. Non-string values (e.g. NaN) are not handled and
        raise ``AttributeError`` on ``.lower()``; drop them before calling.

    Returns
    -------
    list of str
        The processed tokens, in their original order.

    Notes
    -----
    Punctuation is stripped *before* stop-word filtering, so a contraction
    such as "isn't" becomes "isnt" and is kept unless the stop-word list
    contains that exact spelling.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    # A single pass removes every character that is neither a word character
    # nor whitespace, plus the underscore (which \w would otherwise keep).
    normalized = re.sub(r'[^\w\s]|_', '', text.lower())
    # Tokenize the text
    tokens = word_tokenize(normalized)
    # Remove stopwords, then lemmatize what is left
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
# Drop missing entries first: preprocess_text calls .lower() on each value,
# which a missing value (NaN) does not support.
corpus = corpus.dropna()
# Apply preprocessing to each document in the corpus; the result holds one
# list of tokens per remaining document.
processed_corpus = corpus.apply(preprocess_text)

In [5]:
# Preview the token lists produced by the preprocessing step.
print(processed_corpus)

0                               [avenger, time, lady]
1         [lot, good, desert, tasty, cuban, sandwich]
2                           [open, even, think, isnt]
3                            [decent, fried, chicken]
4                [appetizer, platter, special, lunch]
                             ...                     
908910                   [disappointed, one, manager]
908911                         [great, food, service]
908912                                  [love, cuban]
908913                   [great, pizza, great, price]
908914                  [food, good, value, bit, hot]
Name: text, Length: 908901, dtype: object


In [6]:
# Fit a skip-gram (sg=1) FastText model on the tokenised tips.
# window=5 is the maximum distance between a target word and a context word,
# min_count=5 ignores words seen fewer than five times, and workers=4 sets
# the number of training threads.
model_fasttext = FastText(
    sentences=processed_corpus,
    sg=1,
    window=5,
    min_count=5,
    workers=4,
)


In [7]:



# Pick 20 probe words from the trained vocabulary. A dedicated, seeded
# generator makes the selection repeatable across kernel restarts without
# touching the global random state.
random_words = random.Random(42).sample(model_fasttext.wv.index_to_key, 20)

# Find closest and furthest words for each random word.
# "Furthest" is obtained by passing the word as a negative example, which
# ranks the vocabulary by similarity to the negated query.
results = {}
for word in random_words:
    closest = model_fasttext.wv.most_similar(word, topn=10)
    furthest = model_fasttext.wv.most_similar(negative=[word], topn=10)
    results[word] = {'closest': closest, 'furthest': furthest}

# Print results (only the neighbour words, not their similarity scores)
for word, similar_words in results.items():
    print("Word:", word)
    print("Closest:", [w[0] for w in similar_words['closest']])
    print("Furthest:", [w[0] for w in similar_words['furthest']])
    print()


Word: amaretto
Closest: ['sorbetto', 'cioccolato', 'ganache', 'pistachio', 'marscapone', 'macadamia', 'meringue', 'creama', 'cardamom', 'crème']
Furthest: ['senior', 'inspected', 'inspect', 'safety', 'depart', 'inspector', 'departs', 'rate', 'inspection', 'resource']

Word: favs
Closest: ['faves', 'fav', 'favorite', 'fave', 'fava', 'favorito', 'favourite', 'favour', 'wordawesome', 'musthaves']
Furthest: ['urn', 'rm', 'enforce', 'enforced', 'urgency', 'emergency', 'violation', 'wage', 'internet', 'vaccine']

Word: lusherpride
Closest: ['guardiansofthegroove', 'phrasesfromplaces', 'nolalivemusic', 'dirtycoast', 'civicnola', 'algiersferry', 'uptownnola', 'lusher', 'thedandywarhols', 'nolaliving']
Furthest: ['accommodate', 'split', 'accommodated', 'carryout', 'ordering', 'amount', 'requested', 'request', 'offered', 'medium']

Word: torture
Closest: ['pasture', 'breathing', 'rapture', 'gesture', 'breather', 'snide', 'interruption', 'posture', 'cesspool', 'streamline']
Furthest: ['ri', 'fave

In [9]:
# Show the sampled probe words that the neighbour comparisons are run on.
print(random_words)

['amaretto', 'favs', 'lusherpride', 'torture', '810pm', 'filth', '5min', 'partition', 'save', 'panera', 'attacked', 'varying', 'limon', 'gras', 'reveal', 'multicolored', 'jockey', 'eaten', 'baja', 'mere']


In [8]:
from gensim.models.fasttext import load_facebook_model

# Load the pretrained FastText binary from disk.
pretrained_model = load_facebook_model( "C:\\Users\\ali\\OneDrive\\Desktop\\cc.en.300.bin")

# Run the same probe words through the pretrained vectors: for each word,
# collect its 10 nearest neighbours and the 10 words ranked highest when the
# word is used as a negative example.
pretrained_wv = pretrained_model.wv
pretrained_results = {
    query: {
        'closest': pretrained_wv.most_similar(query, topn=10),
        'furthest': pretrained_wv.most_similar(negative=[query], topn=10),
    }
    for query in random_words
}

# Report only the neighbour words; the similarity scores are dropped.
for query, neighbours in pretrained_results.items():
    print("Word:", query)
    print("Closest:", [term for term, score in neighbours['closest']])
    print("Furthest:", [term for term, score in neighbours['furthest']])
    print()

Word: amaretto
Closest: ['Amaretto', 'Frangelico', 'frangelico', 'liqueur', 'kahlua', 'anisette', 'cointreau', 'amaretti', 'limoncello', 'Kahlua']
Furthest: ['SDMS', 'ASTRO', 'HealthWatch', 'ITEX', 'KAMS', 'SEWA', 'SEMI', 'IDSA', 'SGIA', 'SSTI']

Word: favs
Closest: ['faves', 'fav', 'fave', 'fav.', 'favs.', 'favorties', 'FAVE', 'favortie', 'fave.', 'favorites']
Furthest: ['...................................................................................................................................', '..........................................................................................................................................', '.........................................................................................................................................', '............................................................................................................................................', '..............................................................