In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

In [2]:
## set the chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)
comments = pd.DataFrame(columns = ['Product_name','Product_review','User_rating'])

base_url = "https://www.beeradvocate.com"

## navigate to the main page
driver.get("https://www.beeradvocate.com/beer/top-rated/")
## prase the content
soup = BeautifulSoup(driver.page_source,'html.parser')

## extract the top 250 beer links
beer_links = [link['href'] for index, link in enumerate(soup.select('td a[href^="/beer/profile/"]')) if index % 2 == 0]

## scrap reviews
for beer_link in beer_links[:250]:
    ## load the beer page
    driver.get(base_url + beer_link)
    beer_soup = BeautifulSoup(driver.page_source,'html.parser')

    ## get the product name
    product_name = beer_soup.h1.contents[0].strip()

    ## scrap review
    reviews = driver.find_elements(By.XPATH,'//*[@id="rating_fullview_content_2"]')

    review_count = 0

    for review in reviews:
        if review_count >= 22:
            break
        try:
            user_rating = review.find_element(By.XPATH, './/span[2]').text
            product_review = review.find_element(By.XPATH, './/div').text
        except:
            user_rating = review.find_element(By.XPATH, './/span[1]/b[1]').text
            product_review = ""

        ## append data into the dataset
        new_row = pd.DataFrame([{'Product_name':product_name,
                                 'User_rating':user_rating,
                                 'Product_review':product_review}])
        comments = pd.concat([comments, new_row], ignore_index=True)

    driver.back()

comments



MaxRetryError: ignored

**Task B:**

In [3]:
from google.colab import files
uploaded = files.upload()

Saving Beer_review.csv to Beer_review.csv


In [4]:
comments = pd.read_csv('Beer_review.csv').dropna()
comments['Product_review'] = comments['Product_review'].apply(lambda x: str(x).replace('\n',' '))
comments

Unnamed: 0,Product_name,Product_review,User_rating
1,Kentucky Brunch Brand Stout,"Amazing stout, expensive but worth the price!",5.00
4,Kentucky Brunch Brand Stout,This beer meet and exceeded all the unreal hyp...,5.00
6,Kentucky Brunch Brand Stout,Let it sit and warm to the room a little ... u...,5.00
7,Kentucky Brunch Brand Stout,"A small pour, ebony dark, no real head. Intens...",4.67
14,Kentucky Brunch Brand Stout,"Vintage 2022, served on tap at Toppling Goliat...",5.00
...,...,...,...
4981,Samuel Adams Utopias,2017 vintage - shared with me from a friend. S...,4.45
4983,Samuel Adams Utopias,"I opened a 2015 bottle about a month ago, this...",4.65
4990,Samuel Adams Utopias,2019 vintage Pours a still dark mahogany ambe...,4.76
4992,Samuel Adams Utopias,Had multiple years. Very unique and a must try...,4.50


In [9]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")
beer_attributes = []
for review in comments['Product_review']:
    doc = nlp(review.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    beer_attributes.extend(tokens)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [6]:

# Calculate word frequencies
word_frequencies = Counter(beer_attributes)
sorted_word_frequencies = dict(sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True))
sorted_words_df = pd.DataFrame(sorted_word_frequencies.items(),columns= ['Word','Frequency'])

sorted_words_df.head(30)

NameError: name 'Counter' is not defined

**Task C:**

In [37]:
uploaded = files.upload()

Saving customer_attributes.txt to customer_attributes.txt


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

userInput = "stout"

def beer_cosine_similarity(userInput, reviews):
  similarities = []
  for review in reviews:
    compare_list = [userInput,review]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(compare_list)
    similarity_value = cosine_similarity(tfidf_matrix)[0,1]
    similarities.append(similarity_value)

  return similarities

with open('customer_attributes.txt', 'r') as attr_file:
    customer_attributes = [attr.strip() for attr in attr_file.readlines()]
    userInput = ' '.join(customer_attributes)


comments['Similarity_Score'] = beer_cosine_similarity(userInput,comments['Product_review'])


In [39]:
output = comments[['Product_name', 'Product_review', 'Similarity_Score']]
output.to_csv('Similarity_Scores.csv', index=False)

files.download('Similarity_Scores.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Task D & E**

In [1]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


sia = SentimentIntensityAnalyzer()
newWords = {'beer': 0, 'dark': 2.0, 'light': -2.0, 'coffee': -2.0, 'orange': -2.0}
sia.lexicon.update(newWords)
sentiment_scores = []

for review in comments['Product_review']:
    sentiment = sia.polarity_scores(review)
    sentiment_scores.append(sentiment.get('compound'))

comments['Sentiment_Scores']=sentiment_scores
comments['Evaluation_Scores'] = comments['Sentiment_Scores']*comments['Similarity_Score']




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\zjnbl\AppData\Roaming\nltk_data...


NameError: name 'comments' is not defined

In [48]:
beer_evaluation_scores = comments.groupby('Product_name')['Evaluation_Scores'].mean().reset_index()
beer_evaluation_scores = beer_evaluation_scores.sort_values(by='Evaluation_Scores', ascending=False)

print('Recommended top 3 beers are:\n', '\n'.join(beer_evaluation_scores.head(3)['Product_name']))

Recommended top 3 beers are:
 Affogato - Bourbon Barrel-Aged
Double Barrel Jesus
Impermanence


In [42]:
beer_evaluation_scores.to_csv('Evaluation_Scores.csv', index=False)
files.download('Evaluation_Scores.csv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>