In [5]:
import pandas as pd
from transformers import pipeline


In [6]:
# Build a Hugging Face `sentiment-analysis` pipeline backed by the
# "noahnsimbe/DistilBERT-yelp-sentiment-analysis" checkpoint.
# NOTE(review): `model` is not referenced by any other cell visible in this
# part of the notebook — confirm it is used further down.
model = pipeline("sentiment-analysis", model="noahnsimbe/DistilBERT-yelp-sentiment-analysis")


In [7]:


# Load the businesses. `lines=True` makes pandas parse the file as
# line-delimited JSON (one JSON record per line).
businesses_df = pd.read_json('./yelp_dataset/yelp_academic_dataset_business.json', lines=True)


In [8]:
def categorize_review(stars):
    """Translate a star rating into a coarse sentiment label.

    Ratings of 1 or 2 give 'Negative', 3 gives 'Neutral', and 4 or 5 give
    'Positive'. Any other value gives 'Unknown'.
    """
    rating_buckets = (
        ((1, 2), 'Negative'),
        ((3,), 'Neutral'),
        ((4, 5), 'Positive'),
    )
    for ratings, label in rating_buckets:
        if stars in ratings:
            return label
    # Reached for any rating that is not 1, 2, 3, 4, or 5
    return 'Unknown'


In [9]:

# Filters for restaurants and hotels, limiting to the first 1000 matches of each
# for demonstration. Rows with a missing `categories` value are excluded (na=False).
# `.copy()` gives each subset its own data, so adding a column below does not
# write into a slice of `businesses_df` (which triggers SettingWithCopyWarning).
# NOTE(review): these are substring matches on `categories`; the sample output
# further down shows a tour operator labelled 'Hotel', so the 'Hotels' filter
# appears to catch more than actual hotels — confirm this is intended.
restaurants = businesses_df[businesses_df['categories'].str.contains('Restaurants', na=False)].head(1000).copy()
hotels = businesses_df[businesses_df['categories'].str.contains('Hotels', na=False)].head(1000).copy()

# Marks each business as either a Restaurant or Hotel
restaurants['business_type'] = 'Restaurant'
hotels['business_type'] = 'Hotel'

# Stacks both subsets into one frame. A business that matched both filters
# appears twice at this point (once per label).
selected_businesses = pd.concat([restaurants, hotels])

In [10]:

# Keep a single row per business. When a business was selected under both
# labels, the first occurrence wins, i.e. the 'Restaurant' row, because the
# restaurants were concatenated first.
selected_businesses = selected_businesses.drop_duplicates(subset='business_id', keep='first')


# Lookup table: business_id -> {'name': ..., 'business_type': ...}
business_info = (
    selected_businesses[['business_id', 'name', 'business_type']]
    .set_index('business_id')
    .to_dict(orient='index')
)


In [11]:
# Ensure you have the correct variable for filtered business IDs if needed
filtered_business_ids = selected_businesses['business_id'].tolist()

# Per-business lookups, built once here instead of indexing `business_info`
# from a Python lambda for every review row.
business_name_by_id = {bid: info['name'] for bid, info in business_info.items()}
business_type_by_id = {bid: info['business_type'] for bid, info in business_info.items()}

# Filtered chunks are collected in a list and concatenated once after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies every previously
# collected row on each iteration.
review_chunks = []

# Process the review dataset in chunks and filter for selected businesses
for chunk in pd.read_json('./yelp_dataset/yelp_academic_dataset_review.json', lines=True, chunksize=5000):
    # Filter for reviews related to selected businesses (copy so the column
    # assignments below do not write into a slice of `chunk`)
    filtered_chunk = chunk[chunk['business_id'].isin(filtered_business_ids)].copy()
    # Add the business name and type to each review
    filtered_chunk['business_name'] = filtered_chunk['business_id'].map(business_name_by_id)
    filtered_chunk['business_type'] = filtered_chunk['business_id'].map(business_type_by_id)
    # Categorize the reviews based on star ratings
    filtered_chunk['sentiment'] = filtered_chunk['stars'].apply(categorize_review)
    review_chunks.append(filtered_chunk)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# review file produced no chunks at all.
selected_reviews = pd.concat(review_chunks, ignore_index=True) if review_chunks else pd.DataFrame()


In [12]:
# Columns carried into the final DataFrame, in output order
review_columns = ['business_id', 'business_name', 'business_type', 'text', 'stars', 'sentiment']
final_reviews_selected = selected_reviews[review_columns]



In [13]:
# Preview the first rows of the selected review columns (plain-text rendering)
print(final_reviews_selected.head())

              business_id                 business_name business_type  \
0  XQfwVwDr-v0ZS3_CbbE5Xw  Turning Point of North Wales    Restaurant   
1  04UD14gamNjLY0IDYVhHJg                      Dmitri's    Restaurant   
2  gmjsEdUsKpj9Xxu6pdjH0g    The Voodoo Bone Lady Tours         Hotel   
3  lj-E32x9_FA7GmUrBGBEWg           Brio Italian Grille    Restaurant   
4  0pMj5xUAecW9o1P35B0AMw                          Wawa    Restaurant   

                                                text  stars sentiment  
0  If you decide to eat here, just be aware it is...      3   Neutral  
1  I am a long term frequent customer of this est...      1  Negative  
2  Loved this tour! I grabbed a groupon and the p...      5  Positive  
3  Love going here for happy hour or dinner!  Gre...      4  Positive  
4  Great staff always helps and always nice. Alwa...      5  Positive  


In [14]:
# Save the selected columns to a CSV file; index=False keeps the row index out
# of the file. This file is read back by the preprocessing cells below.
final_reviews_selected.to_csv('final_reviews_selected.csv', index=False)


In [41]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK data used by clean_text (only needs to be done once; the
# log below shows NLTK reporting both packages as already up-to-date)
nltk.download('stopwords')
nltk.download('wordnet')

# English stopwords kept in a set for fast membership tests, plus the WordNet
# lemmatizer; both are module-level objects that clean_text reads.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewkolawole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matthewkolawole/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# Reload the review subset that an earlier cell wrote to
# 'final_reviews_selected.csv'; the preprocessing cells below work on `df`.
df = pd.read_csv('final_reviews_selected.csv')


In [43]:
def clean_text(text):
    """Normalise one raw review string for modelling.

    Steps, in order: lowercase the text; remove http/https URLs, @mentions and
    #hashtags; replace every run of non-word characters and underscores with a
    single space; drop tokens found in the module-level `stop_words`; pass each
    remaining token through the module-level `lemmatizer`.

    Args:
        text: The raw review text. Any non-string value (for example the NaN
            that pandas' read_csv produces for an empty field) is treated as
            empty text.

    Returns:
        The cleaned tokens joined by single spaces. The result is '' when no
        tokens survive or when `text` is not a string.
    """
    # Missing values arrive as float NaN / None and have no .lower(); treat
    # them as empty text instead of raising AttributeError mid-apply.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # The three removals stay sequential on purpose: each one runs on the
    # output of the previous one.
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[\W_]+', ' ', text)
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in words)
# Overwrites the `text` column with its cleaned form. The uncleaned text is
# still available in 'final_reviews_selected.csv', from which `df` was loaded.
df['text'] = df['text'].apply(clean_text)


In [44]:
def count_tokens(review):
    """Return the number of whitespace-separated tokens in `review`."""
    return len(review.split())


# Token count of every cleaned review, followed by its summary statistics
sequence_lengths = df['text'].apply(count_tokens)
length_summary = sequence_lengths.describe()
print(length_summary)


count    154313.000000
mean         56.893392
std          52.061750
min           0.000000
25%          23.000000
50%          41.000000
75%          72.000000
max         531.000000
Name: text, dtype: float64


In [45]:
# Number of missing entries in each column
missing_values = df.isna().sum()
print(missing_values)


business_id      0
business_name    0
business_type    0
text             0
stars            0
sentiment        0
dtype: int64


In [46]:
# Class balance: number of reviews per sentiment label
print(df['sentiment'].value_counts())


Positive    101121
Negative     36346
Neutral      16846
Name: sentiment, dtype: int64


In [47]:
# Converts sentiment labels to integer class ids
sentiment_to_id = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

# Only encode while the column still holds labels. Without this guard,
# re-running the cell maps the already-encoded ids through the label
# dictionary and turns every value into NaN.
if not df['sentiment'].isin(list(sentiment_to_id.values())).all():
    encoded_sentiment = df['sentiment'].map(sentiment_to_id)
    # Series.map yields NaN for labels missing from the dictionary (e.g. the
    # 'Unknown' label categorize_review can produce). Fail loudly here rather
    # than let NaN flow into the class-weight computation.
    unmapped = df.loc[encoded_sentiment.isna(), 'sentiment'].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected sentiment labels: {unmapped}")
    df['sentiment'] = encoded_sentiment


In [48]:
# Addressing class imbalance
from sklearn.utils import class_weight

# Class ids actually present in the data, sorted (np.unique returns sorted values)
sentiment_classes = np.unique(df['sentiment'])

# Calculate class weights; the returned array is ordered like `sentiment_classes`
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=sentiment_classes,
    y=df['sentiment']
)
# Key every weight by its own class id instead of by position. With
# dict(enumerate(...)) the keys only line up with the class ids when the ids
# are exactly 0..k-1; a missing class would shift the weights onto the wrong
# ids. `.tolist()` converts the NumPy scalars to plain Python keys.
class_weight_dict = dict(zip(sentiment_classes.tolist(), class_weights))

# To be used when training.
# np.save stores the dict as a pickled object array, so it has to be read back
# with np.load('class_weight_dict.npy', allow_pickle=True).item()
np.save('class_weight_dict.npy', class_weight_dict)

In [49]:
# Preview the frame after text cleaning and sentiment encoding
print(df.head())

              business_id                 business_name business_type  \
0  XQfwVwDr-v0ZS3_CbbE5Xw  Turning Point of North Wales    Restaurant   
1  04UD14gamNjLY0IDYVhHJg                      Dmitri's    Restaurant   
2  gmjsEdUsKpj9Xxu6pdjH0g    The Voodoo Bone Lady Tours         Hotel   
3  lj-E32x9_FA7GmUrBGBEWg           Brio Italian Grille    Restaurant   
4  0pMj5xUAecW9o1P35B0AMw                          Wawa    Restaurant   

                                                text  stars  sentiment  
0  decide eat aware going take 2 hour beginning e...      3          1  
1  long term frequent customer establishment went...      1          0  
2  loved tour grabbed groupon price great perfect...      5          2  
3  love going happy hour dinner great patio fan b...      4          2  
4  great staff always help always nice always cle...      5          2  


In [50]:
# Write the cleaned, label-encoded reviews to disk without the row index.
# NOTE(review): the length summary above reports a minimum of 0 tokens, so some
# cleaned texts are empty strings; with pandas' default NA handling those are
# expected to read back from this CSV as NaN — confirm downstream loaders cope.
df.to_csv('processed_reviews.csv', index=False)