In [34]:
from functools import lru_cache
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
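# One-time NLTK data downloads (uncomment on a fresh environment):
# nltk.download('vader_lexicon')  # lexicon used by SentimentIntensityAnalyzer
# nltk.download('wordnet')        # corpus used by WordNetLemmatizer
# nltk.download('stopwords')      # corpus used by stopwords.words('english')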
from langdetect import detect
from googletrans import Translator, LANGUAGES

[nltk_data] Downloading package punkt to C:\Users\Pedro
[nltk_data]     Alves\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
# One-time setup (uncomment to install the extra dependencies into the kernel's environment):
# %pip install langdetect googletrans==4.0.0-rc1

In [36]:
# Display settings: show every column and never truncate cell contents.
for option_name in ('display.max_columns', 'max_colwidth'):
    pd.set_option(option_name, None)

In [37]:
def get_cat_prod_links(url, timeout=30):
    """Collect the product-page URLs listed on a category page.

    Parameters
    ----------
    url : str
        Address of the category (product list) page to scrape.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns
    -------
    list of str
        One absolute URL per product tile that carries a link, in page order.
        Empty when the expected page structure is not present.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Walk down the expected nesting one level at a time; a missing level
    # means the page does not look like a product list, so there is nothing
    # to collect.
    main_content = soup.find(
        "main",
        id="mainContent",
        class_="responsiveProductListPage_mainContent responsiveProductListPage_mainContent_withFacets",
    )
    if main_content is None:
        return []

    product_list_div = main_content.find("div", class_="productListProducts")
    if product_list_div is None:
        return []

    product_list_ul = product_list_div.find("ul", class_="productListProducts_products")
    if product_list_ul is None:
        return []

    product_links = []
    for product in product_list_ul.find_all("li", class_="productListProducts_product"):
        product_block = product.find("div", class_="athenaProductBlock")
        if product_block is None:
            continue

        link = product_block.find("a", class_="athenaProductBlock_linkImage")
        href = link.get("href") if link is not None else None
        if not href:
            continue

        # Resolve the href against the page URL instead of concatenating the
        # two strings: plain concatenation appended the link to the full
        # category address and produced malformed product URLs.
        product_links.append(urljoin(url, href))

    return product_links

In [38]:
# Collect the product-page links from the protein-isolate category listing
# (this performs a live HTTP request when the cell runs).
product_links = get_cat_prod_links("https://www.myprotein.pt/nutrition/protein/protein-isolate.list")

In [39]:
def get_paginated_reviews(reviews_url, timeout=30):
    """Scrape every review from a product's reviews page, following pagination.

    Parameters
    ----------
    reviews_url : str
        Address of the first reviews page.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    list of tuple
        ``(review_text, review_stars, page_url)`` for every review found, where
        ``review_stars`` is the ``aria-label`` of the rating container and
        ``page_url`` is the page the review was read from. ``review_text`` or
        ``review_stars`` is ``None`` when the corresponding element is missing.
    """
    next_link_class = (
        "athenaProductReviews_paginationNav athenaProductReviews_paginationNav-next"
    )

    reviews = []
    visited = set()
    page_url = reviews_url

    # A single loop handles the first page and every following page. The
    # `visited` set stops the crawl if a "next" link points back to a page
    # that was already read, which would otherwise loop forever.
    while page_url and page_url not in visited:
        visited.add(page_url)
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        for review in soup.find_all(class_="athenaProductReviews_review"):
            content_node = review.find(class_="athenaProductReviews_reviewContent")
            stars_node = review.find(class_="athenaProductReviews_reviewRatingStarsContainer")
            review_text = content_node.text if content_node is not None else None
            review_stars = stars_node.get("aria-label") if stars_node is not None else None
            reviews.append((review_text, review_stars, page_url))

        next_link = soup.find("a", class_=next_link_class)
        href = next_link.get("href") if next_link is not None else None
        # urljoin leaves absolute links untouched and resolves relative ones
        # against the current page.
        page_url = urljoin(page_url, href) if href else None

    return reviews
    

def get_first_page_reviews(soup, url):
    """Extract the "top reviews" shown directly on a product page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed product page.
    url : str
        Address of the product page; stored as the third item of every tuple.

    Returns
    -------
    list of tuple
        ``(review_text, review_stars, url)`` per top review. A single
        ``(None, None, url)`` placeholder is returned when the page has no
        product ``<main>`` element or explicitly reports that there are no
        reviews. An empty list is returned when any other expected level of
        the review markup is missing.
    """
    no_reviews = [(None, None, url)]

    if soup.find("main", id="mainContent", class_="athenaProductPage") is None:
        return no_reviews

    review_div = soup.find("div", class_="athenaProductPage_productReviews")
    if review_div is None:
        return []

    prod_review_div = review_div.find("div", class_="athenaProductReviews")
    if prod_review_div is None:
        return []

    if prod_review_div.find("div", class_="athenaProductReviews_empty") is not None:
        return no_reviews
    if prod_review_div.find("div", class_="athenaProductReviews_summary") is None:
        return []

    # Descend through the summary markup level by level; each class is looked
    # up inside the element found for the previous one.
    node = prod_review_div
    for css_class in (
        "athenaProductReviews_summary_reviewContainer",
        "athenaProductReviews_summary-columns",
        "athenaProductReviews_summary-right",
        "athenaProductReviews_topReviews",
    ):
        node = node.find("div", class_=css_class)
        if node is None:
            return []

    reviews = []
    for single_review in node.find_all("div", class_="athenaProductReviews_topReviewSingle"):
        excerpt = single_review.find("p", class_="athenaProductReviews_topReviewsExcerpt")
        stars_node = single_review.find(
            "div", class_="athenaProductReviews_topReviewsRatingStarsContainer"
        )
        review_text = excerpt.get_text(strip=True) if excerpt is not None else None
        # A missing rating container is tolerated the same way a missing
        # excerpt is, instead of failing on subscripting None.
        review_stars = stars_node.get("aria-label") if stars_node is not None else None
        reviews.append((review_text, review_stars, url))

    return reviews


In [40]:
def get_reviews(product_links, timeout=30):
    """Scrape the reviews of every product page in ``product_links``.

    For each product the page is fetched and searched for the "see reviews"
    button. When the button carries a link, the paginated reviews behind it
    are collected with ``get_paginated_reviews``; otherwise the reviews shown
    on the product page itself are collected with ``get_first_page_reviews``.

    Parameters
    ----------
    product_links : iterable of str
        Product page URLs.
    timeout : float, optional
        Seconds to wait for each product-page response before ``requests``
        gives up.

    Returns
    -------
    list of tuple
        The ``(review_text, review_stars, url)`` tuples of all products,
        concatenated in the order of ``product_links``.
    """
    all_reviews = []
    for prod_link in product_links:
        response = requests.get(prod_link, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # navigate to reviews page
        button = soup.find(class_="athenaProductReviews_seeReviewsButton")
        href = button.get("href") if button is not None else None

        # The per-product list is created fresh on every iteration, so one
        # product's reviews can never leak into the next product's rows.
        if href:
            product_reviews = get_paginated_reviews(urljoin(prod_link, href))
        else:
            product_reviews = get_first_page_reviews(soup, prod_link)

        all_reviews.extend(product_reviews)

    return all_reviews

# Scrape the reviews of every product found above; each entry is a
# (review_text, stars, url) tuple. This issues live HTTP requests.
total_reviews = get_reviews(product_links)

In [41]:
# Load the scraped (review_text, stars, url) tuples into a frame. Passing the
# tuples straight to the constructor (rather than unpacking them with zip)
# also works when nothing was scraped.
reviews_raw = pd.DataFrame(total_reviews, columns=['review', 'stars', 'url'])

df = pd.DataFrame({
    # select only number of stars (first token of the rating label)
    'stars': reviews_raw['stars'].str.split(" ").str[0],
    # collapse every run of whitespace (newlines included) into one space and
    # give products without a review a placeholder text
    'review': (
        reviews_raw['review']
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
        .fillna('No review available')
    ),
    'url': reviews_raw['url'],
    # second-to-last path segment of the URL
    'product_name': reviews_raw['url'].str.split("/").str[-2],
})

In [42]:
def translate_to_english(review, target_language='en'):
    """Translate ``review`` into ``target_language`` and return the text.

    The source language is left to the translator (``src='auto'``). Despite
    the function name, any destination language code can be requested through
    ``target_language``; it defaults to English.
    """
    return Translator().translate(review, src='auto', dest=target_language).text

def detect_and_translate(review):
    """Detect the language of ``review`` and translate it to English if needed.

    Parameters
    ----------
    review : str
        Review text.

    Returns
    -------
    tuple
        ``(detected_language, text)``. ``text`` is the original review when it
        is already English, otherwise its English translation. When detection
        or translation fails, ``('No language detected', review)`` is returned
        so one bad review does not abort the whole run.
    """
    try:
        detected_language = detect(review)
        if detected_language == 'en':
            return detected_language, review
        return detected_language, translate_to_english(review)
    except Exception:
        # Deliberately best-effort, but limited to Exception so that
        # KeyboardInterrupt and SystemExit still stop a long-running cell.
        return 'No language detected', review

# Apply language detection and translation to English.
# The (language, text) tuples are expanded into two columns in a single step
# instead of wrapping every row in its own pd.Series; the explicit index and
# column names keep the assignment aligned with df and valid for an empty frame.
language_and_text = df['review'].map(detect_and_translate)
df[['detected_language', 'translated_review']] = pd.DataFrame(
    language_and_text.tolist(),
    index=df.index,
    columns=['detected_language', 'translated_review'],
)

In [43]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the English stop-word list once and keep it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Create the WordNet lemmatizer once and reuse it for every review."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Lowercase, tokenize, drop English stop words and lemmatize ``text``.

    Parameters
    ----------
    text : str
        Text to normalise. Any non-string value (for example ``None`` or NaN)
        is returned unchanged so the caller can handle missing text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return text

    tokens = word_tokenize(text.lower())
    stop_words = _english_stop_words()
    lemmatizer = _shared_lemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Normalise the translated review text (lowercasing, stop-word removal,
# lemmatization) and store the result in a new column.
df['processed_review'] = df['translated_review'].apply(preprocess_text)

In [44]:
def get_sentiment_score(text):
    """Return the polarity scores of ``text`` from the module-level ``analyzer``.

    For a string, the result of ``analyzer.polarity_scores(text)`` is returned
    as is. For any other value (for example a float NaN) a dictionary with the
    keys ``neg``, ``neu``, ``pos`` and ``compound`` all set to ``None`` is
    returned.
    """
    if not isinstance(text, str):
        return {'neg': None, 'neu': None, 'pos': None, 'compound': None}
    return analyzer.polarity_scores(text)


analyzer = SentimentIntensityAnalyzer()
sentiment_scores = df['processed_review'].apply(get_sentiment_score)
# Pick the score keys by name and reuse df's index, so the column mapping does
# not depend on dictionary key order and the rows stay aligned with df.
df[['Negative', 'Neutral', 'Positive', 'Compound']] = pd.DataFrame(
    sentiment_scores.tolist(),
    index=df.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)
df.head(3)

Unnamed: 0,stars,review,url,product_name,detected_language,translated_review,processed_review,Negative,Neutral,Positive,Compound
0,4,"Boa tarde Ainda não recebi a minha encomenda. Demasiado tempo , se soubesse que era assim tanto, tinha optado por outra solução mais rápida . Gosto dos vossos produtos , mas assim deixa de ser viável . Obrigado",https://www.myprotein.pt/sports-nutrition/impact-whey-isolate/10530911.reviews,impact-whey-isolate,pt,"Good afternoon I haven't received my order yet.Too much time, if I knew it was so much, I had opted for another faster solution.I like your products, but it is no longer viable.Thanks","good afternoon n't received order yet.too much time , knew much , opted another faster solution.i like product , longer viable.thanks",0.0,0.748,0.252,0.6597
1,5,"Numa situação normal a tomar whey todos os dias, para quanto tempo dá um saco 500g +/-? Primeira vez que vou comprar :/",https://www.myprotein.pt/sports-nutrition/impact-whey-isolate/10530911.reviews,impact-whey-isolate,pt,"In a normal situation to take Whey every day, how long does a 500g +/- bag give?First time I will buy:/","normal situation take whey every day , long 500g +/- bag give ? first time buy : /",0.0,1.0,0.0,0.0
2,5,"Sabor chocolate top. Whey de muito boa qualidade. Dissolve supe bem, muito bom sabor e acompanha bem aveia, nestum de arroz, skyr natural, etc… já experimentei muitas wheys de boas marcas mas esta whey faz a diferença. A nível nutricional, basta olhar a tabela do produto 👌🏻 recomendo a 200%",https://www.myprotein.pt/sports-nutrition/impact-whey-isolate/10530911.reviews,impact-whey-isolate,pt,"Top chocolate flavor.Whey of very good quality.Dissolves Super well, very good flavor and accompanies well oats, rice nestum, natural skyr, etc… I've tried a lot of whether good brands but this whey makes a difference.At the nutritional level, just look at the product table 👌🏻 I recommend to 200%","top chocolate flavor.whey good quality.dissolves super well , good flavor accompanies well oat , rice nestum , natural skyr , etc… 've tried lot whether good brand whey make difference.at nutritional level , look product table 👌🏻 recommend 200 %",0.0,0.514,0.486,0.9666


In [45]:
# Number of non-null sentiment scores per product, ordered by the Positive column (descending).
df.groupby(by="product_name")[['Negative', 'Neutral', 'Positive', 'Compound']].count().sort_values(by="Positive", ascending=False)

Unnamed: 0_level_0,Negative,Neutral,Positive,Compound
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
impact-whey-isolate,158,158,158,158
proteina-isolada-de-soja,68,68,68,68
proteina-de-ervilha-isolada,62,62,62,62
clear-whey-isolate,18,18,18,18
impact-whey-isolate-amostra,3,3,3,3
myprotein-clear-whey-isolate-sample,2,2,2,2
clear-whey-isolada-conjunto-de-saquetas-de-amostra,1,1,1,1
myprotein-impact-native-whey-isolate-sample,1,1,1,1
myvegan-proteina-de-ervilha-isolada,1,1,1,1
pack-inicial-clear-whey,1,1,1,1


In [46]:
# Mean sentiment scores per product, highest mean Positive score first.
df.groupby(by="product_name")[['Negative', 'Neutral', 'Positive', 'Compound']].mean().sort_values(by="Positive", ascending=False)

Unnamed: 0_level_0,Negative,Neutral,Positive,Compound
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
myprotein-clear-whey-isolate-sample,0.0,0.6055,0.3945,0.60855
impact-whey-isolate-amostra,0.0,0.607333,0.392667,0.660367
impact-whey-isolate,0.065848,0.579892,0.354247,0.579162
clear-whey-isolate,0.094278,0.559222,0.346444,0.613833
proteina-isolada-de-soja,0.079471,0.651118,0.269382,0.504632
proteina-de-ervilha-isolada,0.102532,0.628452,0.268952,0.421231
myvegan-proteina-de-ervilha-isolada,0.193,0.583,0.225,0.3097
clear-whey-isolada-conjunto-de-saquetas-de-amostra,0.0,1.0,0.0,0.0
myprotein-impact-native-whey-isolate-sample,0.0,1.0,0.0,0.0
pack-inicial-clear-whey,0.0,1.0,0.0,0.0
