In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/Tampa_mexican_restaurants.csv')
df = df.drop(columns=['state', 'latitude', 'longitude','address'])

#remove html
from bs4 import BeautifulSoup

def remove_html_tags(html):
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

df.loc[:, 'cleaned_text'] = df['text'].apply(remove_html_tags)

# remove punctuation
import re

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df.loc[:, 'cleaned_text'] = df['text'].apply(remove_punctuation)

# remove phone numbers
def remove_phone_numbers(text):
    pattern = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    return re.sub(pattern, '', text)

df.loc[:, 'cleaned_text'] = df['text'].apply(remove_punctuation)

  soup = BeautifulSoup(html, "html.parser")


In [3]:
business_review_counts = df['business_id'].value_counts()

# only leave restaurants with more than 100 reviews
business_ids_with_100plus_reviews = business_review_counts[business_review_counts > 100].index

filtered_df = df[df['business_id'].isin(business_ids_with_100plus_reviews)]

In [4]:
filtered_df.loc[:, 'cleaned_text'] = filtered_df['cleaned_text'].str.lower()

In [6]:
pip install spacy

Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.1.8 (from spacy)
  Downloading thinc-8.2.1-cp310-cp310-manylinux_2_17_x86_64.manylin

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

filtered_df['token'] = filtered_df['cleaned_text'].apply(tokenize_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['token'] = filtered_df['cleaned_text'].apply(tokenize_text)


In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [token for token in tokens if token.lower() not in stop_words]

filtered_df['filtered_token'] = filtered_df['token'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['filtered_token'] = filtered_df['token'].apply(remove_stopwords)


In [7]:
from collections import Counter

def create_ngrams(token_list, nb_elements):
    ngrams = zip(*[token_list[index_token:] for index_token in range(nb_elements)])
    return (" ".join(ngram) for ngram in ngrams)

def frequent_words(list_words, ngrams_number=1, number_top_words=10):
    frequent = []
    if ngrams_number == 1:
        pass
    elif ngrams_number >= 2:
        list_words = create_ngrams(list_words, ngrams_number)
    else:
        raise ValueError("number of n-grams should be >= 1")
    counter = Counter(list_words)
    frequent = counter.most_common(number_top_words)
    return frequent

In [8]:
most_frequent_bigrams = frequent_words(filtered_df['filtered_token'].explode().dropna(), ngrams_number=3, number_top_words=10)

for bigram, count in most_frequent_bigrams:
    if bigram:
        print(f"{bigram}:{count}")

ca nt wait:559
authentic mexican food:555
best mexican food:345
good mexican food:270
nt wait go:229
wait go back:227
pico de gallo:223
ca nt go:210
mexican food tampa:195
definitely come back:194


In [17]:
# Define the range of 'star_x' values from 1 to 2
min_star_x = 1.0
max_star_x = 2.0 

filtered_df = filtered_df[(filtered_df['stars_x'] >= min_star_x) & (filtered_df['stars_x'] <= max_star_x)]

list_of_tokens = filtered_df['filtered_token'].explode().dropna()

most_frequent_words_1 = frequent_words(list_of_tokens, ngrams_number=1, number_top_words=12)

for word, count in most_frequent_words_1:
    if word and word.strip():  # Check if word is not empty or just whitespace
        print(f"{word}:{count}")

food:6220
nt:5842
place:2845
good:2422
service:2403
like:2345
order:2202
would:2132
ordered:2101
time:2067


In [13]:
most_frequent_words_2 = frequent_words(list_of_tokens, ngrams_number=2, number_top_words=10)

for word, count in most_frequent_words_2:
    if word and word.strip():  # Check if word is not empty or just whitespace
        print(f"{word}:{count}")

mexican food:585
wo nt:541
nt even:458
taco bus:366
could nt:354
ca nt:352
chips salsa:336
tasted like:311
go back:293
customer service:288


In [14]:
most_frequent_words_3 = frequent_words(list_of_tokens, ngrams_number=3, number_top_words=10)

for word, count in most_frequent_words_3:
    if word and word.strip():  # Check if word is not empty or just whitespace
        print(f"{word}:{count}")

wo nt back:157
authentic mexican food:79
never go back:64
could nt even:56
  wo nt:51
probably wo nt:49
mexican food  :48
wo nt going:48
good mexican food:48
wo nt go:41


In [12]:
# Define the range of 'star_x' values from 1 to 2
filtered_df2 = filtered_df[(filtered_df['stars_x'] >= 3.0) & (filtered_df['stars_x'] <= 4.0)]

list_of_tokens2 = filtered_df['filtered_token'].explode().dropna()