In [8]:

# Sentiment analysis, also known as opinion mining, is the process of using natural language processing (NLP) and text analysis techniques 
# to determine the emotional tone behind a body of text. It is commonly used to understand the sentiments expressed in user-generated content, 
# such as reviews, social media posts, and comments. Sentiment analysis can categorize text as positive, negative, neutral, or even identify 
# specific emotions like joy, anger, or sadness.

# Applications of Sentiment Analysis
# Customer Feedback: Analyzing reviews or feedback to understand customer satisfaction.
# Social Media Monitoring: Assessing public opinion about brands, products, or events.
# Market Research: Gaining insights into consumer behavior and trends.
# Support Systems: Automatically routing negative comments to customer service representatives for quicker resolution.

# NLTK and VADER:
# NLTK (Natural Language Toolkit) is a leading platform for building Python programs to work with human language data. It provides 
# easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text-processing libraries for 
# classification, tokenization, stemming, tagging, parsing, and semantic reasoning.

# VADER (Valence Aware Dictionary and sEntiment Reasoner) is a sentiment analysis tool specifically designed to work with social media text. 
# It is part of the NLTK library and is known for its high accuracy in analyzing sentiments in text from microblogs, such as tweets.


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, NoSuchElementException
import time
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import defaultdict
import operator

# Initialize NLTK's VADER (Valence Aware Dictionary and sEntiment Reasoner)
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Initialize the WebDriver
driver = webdriver.Chrome()

# Set the URL
url = "https://www.yelp.com/search?find_desc=Mexican+Food&find_loc=new+york+city&start=0"

# Open the URL
driver.get(url)

# Dictionary to store the sentiment ratings
restaurant_ratings = defaultdict(list)

def scrape_restaurant_data():
    for _ in range(3):  # Loop only 3 times
        try:
            # Wait for the links to be present
            WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/biz/']")))
        
            # Find the links to individual restaurants
            restaurant_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/biz/']")
        
            if not restaurant_links:
                print("No restaurant links found. Please check the CSS selector.")
                break
            else:
                print(f"Found {len(restaurant_links)} restaurant links.")
        
            # Limit to the first 10 restaurants
            restaurant_links = restaurant_links[:5]  # Reduced to 5 to lessen the load
        
            # Iterate through each restaurant link
            for link in restaurant_links:
                try:
                    # Open the link in a new tab
                    driver.execute_script("window.open(arguments[0].href, '_blank');", link)
                    # Switch to the new tab
                    driver.switch_to.window(driver.window_handles[-1])
        
                    # Wait for the content to load
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        
                    # Attempt to get the restaurant name
                    try:
                        restaurant_name_element = WebDriverWait(driver, 20).until(
                            EC.visibility_of_element_located((By.CLASS_NAME, "css-166la90"))
                        )
                        restaurant_name = restaurant_name_element.text
                        print(f"Restaurant Name: {restaurant_name}")
                    except TimeoutException:
                        restaurant_name = "Unknown"
                        print("Restaurant name not found or timed out.")
        
                    # Attempt to get the restaurant address
                    try:
                        address_element = WebDriverWait(driver, 20).until(
                            EC.visibility_of_element_located((By.CLASS_NAME, "css-1bmgof7"))
                        )
                        address = address_element.text
                        print(f"Address: {address}")
                    except TimeoutException:
                        address = "Unknown"
                        print("Address not found or timed out.")
        
                    # Locate and perform sentiment analysis on customer reviews
                    reviews = driver.find_elements(By.CSS_SELECTOR, "span.raw__09f24__T4Ezm")
                    print("Reviews:")
                    for review in reviews:
                        review_text = review.text.strip()
                        if review_text.startswith("Start your review of"):
                            review_text = review_text.replace("Start your review of ", "")
                        print(f"Review: {review_text}")
        
                        # Perform sentiment analysis
                        sentiment_score = sid.polarity_scores(review_text)['compound']
                        # Map sentiment score to a scale of 1 to 10
                        sentiment_rating = round((sentiment_score + 1) * 5, 1)
                        print(f"Sentiment Rating: {sentiment_rating}/10")
        
                        # Add the sentiment rating to the restaurant's list of ratings
                        restaurant_ratings[restaurant_name].append(sentiment_rating)
        
                    # Close the new tab
                    driver.close()
        
                    # Switch back to the original tab
                    driver.switch_to.window(driver.window_handles[0])
        
                    # Wait a bit before continuing (to avoid overwhelming the server)
                    time.sleep(2)
        
                except (StaleElementReferenceException, TimeoutException, NoSuchElementException) as e:
                    print(f"Error occurred: {e}")
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    continue
        
        except TimeoutException:
            print("Timed out waiting for restaurant links.")
            driver.refresh()
        
        time.sleep(5)  # Added delay between loops

    driver.quit()

scrape_restaurant_data()

# Calculate the average rating for each restaurant
average_ratings = {restaurant: sum(ratings) / len(ratings) for restaurant, ratings in restaurant_ratings.items()}

# Sort the restaurants by average rating
sorted_ratings = sorted(average_ratings.items(), key=operator.itemgetter(1), reverse=True)

print(sorted_ratings)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ryan_\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Found 53 restaurant links.
Restaurant name not found or timed out.
Address not found or timed out.
Reviews:
Review: 102 Norfolk St
Sentiment Rating: 5.0/10
Review: New York, NY 10002
Sentiment Rating: 5.0/10
Review: La Contenta
Sentiment Rating: 5.0/10
Review: A small space with SUCH good food and margs!!! We visited on a Tuesday, it was empty (until we left around 8)

We ordered the shrimp tostadas appetizer (soooo good) it was so refreshing and light. We also ordered the shrimp enchiladas and the pescado tacos. Everything we had was amazing, we would definitely come back.
Sentiment Rating: 9.5/10
Review: A lot has changed in the 4 years since we were last at La Contenta. They, take reservations now, although the hostess was unable to locate ours but "squeezed" us in anyway.

The cocktails are okay, not the best but certainly not the worst. The food however, has gone majorly downhill.

We ordered the Enchiladas Suizas, which were delicious. We also order the Chile Relleno which was in

In [None]:

# This is an collection of all the reviews processed through sentiment analysis. Each review has been individually rated 
# on a scale from 1 to 10 based on its sentiment, and then these ratings have been presented in sequence. Here’s a breakdown 
# of what's happening:

# Individual Reviews: Each review from the dataset is analyzed separately for sentiment using the Vader sentiment 
# analysis tool.

# Sentiment Rating: After analysis, each review is assigned a sentiment rating, from 1 to 10, where higher numbers indicate 
# more positive sentiment and lower numbers indicate more negative sentiment.

# Aggregation: The output you're seeing is a list of these sentiment ratings, each followed by the original review text 
# that was analyzed. This gives you a quick overview of how positive or negative each review was according to the 
# feeling of the reviewer/patron.

# This approach allows you to gauge the overall sentiment distribution across all reviews, providing insights into the 
# general feelings/trends for each establishment. It's useful for quickly identifying highly positive or negative reviews.


In [None]:

# End!!
