In [202]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to C:\Users\Pedro
[nltk_data]     Alves\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Pedro
[nltk_data]     Alves\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [203]:
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

In [204]:
def get_cat_prod_links(url):
    response = requests.get(first_page_url)
    soup = BeautifulSoup(response.content, "html.parser")

    product_links = []

    # Step 2: Navigate to the specific elements using the provided indentations
    main_content = soup.find("main", id="mainContent", class_="responsiveProductListPage_mainContent responsiveProductListPage_mainContent_withFacets")

    if main_content:
        # Step 3: Navigate to the next level (div with class 'productListProducts')
        product_list_div = main_content.find("div", class_="productListProducts")
        if product_list_div:
            # Step 4: Navigate to the next level (ul with class 'productListProducts_products')
            product_list_ul = product_list_div.find("ul", class_="productListProducts_products")
            if product_list_ul:
                # Step 5: Navigate to each product (li with class 'productListProducts_product')
                products = product_list_ul.find_all("li", class_="productListProducts_product")
                for product in products:
                    # Step 6: Navigate to the next level (div with class 'athenaProductBlock')
                    athena_product_block_div = product.find("div", class_="athenaProductBlock")
                    if athena_product_block_div:
                        # Step 7: Extract the link (a with class 'athenaProductBlock_linkImage')
                        link_a = athena_product_block_div.find("a", class_="athenaProductBlock_linkImage")
                        if link_a and 'href' in link_a.attrs:
                            product_link = url + link_a['href']
                            product_links.append(product_link)

    return product_links

In [205]:
product_links = get_cat_prod_links("https://www.myprotein.pt/nutrition/protein/protein-isolate.list")

In [206]:
def get_paginated_reviews(reviews_url):

    response = requests.get(reviews_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # first page reviews
    reviews = []
    for review in soup.find_all(class_="athenaProductReviews_review"):
        review_text = review.find(class_='athenaProductReviews_reviewContent').text
        review_stars = review.find(class_="athenaProductReviews_reviewRatingStarsContainer")['aria-label']
        reviews.append((review_text, review_stars, reviews_url))

    # paginated reviews
    try:
        next_page_url = soup.find("a", class_="athenaProductReviews_paginationNav athenaProductReviews_paginationNav-next")["href"]
    except TypeError:
        next_page_url = False

    if next_page_url:
        while next_page_url:
            response = requests.get(next_page_url)
            soup = BeautifulSoup(response.content, "html.parser")

            for review in soup.find_all(class_="athenaProductReviews_review"):
                review_text = review.find(class_='athenaProductReviews_reviewContent').text
                review_stars = review.find(class_="athenaProductReviews_reviewRatingStarsContainer")['aria-label']
                reviews.append((review_text, review_stars, next_page_url))

            try:
                next_page_url = soup.find("a", class_="athenaProductReviews_paginationNav athenaProductReviews_paginationNav-next")["href"]
            except TypeError:
                break 

    return reviews
    

def get_first_page_reviews(soup, url):
    reviews = []
    # Step 2: Navigate to the specific elements using the provided indentations
    main_content = soup.find("main", id="mainContent", class_="athenaProductPage")
    if main_content:
        review_div = soup.find("div", class_="athenaProductPage_productReviews")
        if review_div:
            prod_review_div = review_div.find("div", class_="athenaProductReviews")
            if prod_review_div:
                if prod_review_div.find("div", class_="athenaProductReviews_empty"):
                    reviews.append((None, None, url))
                elif prod_review_div.find("div", class_="athenaProductReviews_summary"):
                    review_container = prod_review_div.find("div", class_="athenaProductReviews_summary_reviewContainer")
                    if review_container:
                        summ_cols = review_container.find("div", class_="athenaProductReviews_summary-columns")
                        if summ_cols:
                            summ_right = summ_cols.find("div", class_="athenaProductReviews_summary-right")
                            if summ_right:
                                top_review_div = summ_right.find("div", class_="athenaProductReviews_topReviews")
                                if top_review_div:
                                    review_containers = top_review_div.find_all("div", class_="athenaProductReviews_topReviewSingle")
                                    # Loop through each review container and extract the text
                                    for review_container in review_containers:
                                        review_text = review_container.find("p", class_="athenaProductReviews_topReviewsExcerpt").get_text(strip=True) if review_container.find("p", class_="athenaProductReviews_topReviewsExcerpt") else None
                                        review_stars = review_container.find("div", class_="athenaProductReviews_topReviewsRatingStarsContainer")['aria-label']
                                        reviews.append((review_text, review_stars, url))
    else:
        reviews.append((None, None, url))
    
    return reviews


In [207]:
def get_reviews(product_links):

    reviews = []
    reviews_to_return = []
    for prod_link in product_links:
        response = requests.get(prod_link)
        soup = BeautifulSoup(response.content, "html.parser")

        # navigate to reviews page
        try:
            reviews_url = soup.find(class_="athenaProductReviews_seeReviewsButton")['href']
        except TypeError:
            reviews_url = False

        if reviews_url:
            reviews = get_paginated_reviews(reviews_url)
        elif reviews_url == False:
            reviews = get_first_page_reviews(soup, prod_link)
        else:
            reviews.append((None, None, prod_link))
        
        reviews_to_return += reviews 

    return reviews_to_return

total_reviews = get_reviews(product_links)

In [208]:
review_text, stars, url = zip(*total_reviews)
df = pd.DataFrame({'stars': stars, 'review': review_text, 'url': url})
# select only number of stars
df['stars'] = df['stars'].str.split(" ").str[0]
# remove odd whitespaces
df['review'] = df['review'].str.strip().str.replace(r'\s+', ' ', regex=True).str.replace("\n","")
df['product_name'] = df['url'].str.split("/").str[-2]
df['review'].fillna('No review available', inplace=True)

In [209]:
# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)

    return ' '.join(tokens)

df['processed_review'] = df['review'].apply(preprocess_text)

In [210]:
def get_sentiment_score(text):
    if isinstance(text, str):
        sentiment = analyzer.polarity_scores(text)
        return sentiment
    else:
        # Handle non-string (float) cases, return np.nan
        return {'neg': None, 'neu': None, 'pos': None, 'compound': None}


analyzer = SentimentIntensityAnalyzer()
sentiment_scores = df['processed_review'].apply(get_sentiment_score)
df[['Negative', 'Neutral', 'Positive', 'Compound']] = pd.DataFrame(sentiment_scores.tolist())

In [211]:
df.groupby(by="product_name")[['Negative', 'Neutral', 'Positive', 'Compound']].mean()

Unnamed: 0_level_0,Negative,Neutral,Positive,Compound
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
clear-whey-isolada-conjunto-de-saquetas-de-amostra,0.524,0.476,0.0,-0.296
clear-whey-isolate,0.011556,0.9835,0.004944,0.018522
impact-whey-isolate,0.018994,0.936101,0.044937,0.099458
impact-whey-isolate-amostra,0.0,1.0,0.0,0.0
myprotein-clear-whey-isolate-sample,0.0,1.0,0.0,0.0
myprotein-impact-native-whey-isolate-sample,0.524,0.476,0.0,-0.296
myvegan-proteina-de-ervilha-isolada,0.0,1.0,0.0,0.0
pack-inicial-clear-whey,0.524,0.476,0.0,-0.296
proteina-de-ervilha-isolada,0.012468,0.967435,0.020081,0.02846
proteina-isolada-de-soja,0.016985,0.9585,0.024515,0.056804
