In [None]:
#Team Members:
#Jason Antal, Ari Pai, Mykyta Zavhorodko, Albert Nguyen, Andrew White
#11am-1pm Cohort

In [None]:
#Part A, extract data

#requires manually installing files found at https://github.com/mozilla/geckodriver/releases
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import csv
import time
import random

# Fetch beer links from the page
def get_beer_links(url):
    """Collect the beer-page links listed on the page at ``url``.

    Starts a headless Firefox session, loads ``url`` and, for every table
    cell that has an anchor as a direct child, records the ``href`` of the
    first anchor found inside that cell.

    Args:
        url: Address of the listing page to load.

    Returns:
        A list with one ``href`` value per matching table cell, in page order.
    """
    #CHANGE BROWSER HERE AND BELOW IF NEEDED
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    service = Service(r"C:\Users\jason\Downloads\geckodriver-v0.35.0-win64\geckodriver.exe")  # UPDATE FILEPATH HERE
    driver = webdriver.Firefox(service=service, options=firefox_options)
    # try/finally guarantees the browser process is shut down even when the
    # page load or an element lookup raises.
    try:
        # NOTE(review): scrape_review uses 6000 here; 60000 may be a typo - confirm.
        driver.set_page_load_timeout(60000)
        driver.get(url)
        links = []
        # Extract all links
        for td in driver.find_elements(By.XPATH, "//td[a]"):
            a_tag = td.find_element(By.TAG_NAME, 'a')
            links.append(a_tag.get_attribute('href'))
        return links
    finally:
        driver.quit()

# Scrape reviews from each individual page using links
def scrape_review(url):
    """Scrape every review found on a single beer page.

    Starts a headless Firefox session, loads ``url``, reads the beer and
    brewery names from the ``titleBar`` div and then extracts the rating and
    review text from each ``user-comment`` div.

    Args:
        url: Address of the beer page to scrape.

    Returns:
        A list of dicts with the keys 'beer name', 'brewery name', 'rating'
        and 'text'. Both names are "N/A" when the title bar (or its h1/span)
        is missing. 'rating' is a float, None when the score element has no
        text, or 'N/A' when there is no score element. 'text' is '' when the
        review body element is missing. A review that raises any other error
        is reported with print() and left out of the list.
    """
    # CHANGE BROWSER HERE AND ABOVE IF NEEDED
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    service = Service(r"C:\Users\jason\Downloads\geckodriver-v0.35.0-win64\geckodriver.exe")  # UPDATE FILEPATH HERE
    driver = webdriver.Firefox(service=service, options=firefox_options)
    # try/finally guarantees the browser process is shut down even when the
    # page load or a lookup outside the per-review handler raises.
    try:
        driver.set_page_load_timeout(6000)  # Set timeout to 100 minutes
        driver.get(url)

        #Get beer name and brewery name
        try:
            title_bar = driver.find_element(By.XPATH, "//div[@class='titleBar']")
            beer_name = title_bar.find_element(By.XPATH, ".//h1").text.strip()
            brewery_name = title_bar.find_element(By.XPATH, ".//span").text.strip()
        except NoSuchElementException:
            beer_name = "N/A"
            brewery_name = "N/A"

        reviews = []
        # Find review containers
        review_divs = driver.find_elements(By.XPATH, "//div[@class='user-comment']")
        for review in review_divs:
            try:
                # Get the rating
                try:
                    rating_span = review.find_element(By.XPATH, ".//span[contains(@class, 'BAscore_norm')]")
                    rating = float(rating_span.text.strip()) if rating_span.text else None
                except NoSuchElementException:
                    rating = 'N/A'
                # Get the review text
                try:
                    review_body = review.find_element(By.XPATH, ".//div[@style='margin:20px 0px; font-size:11pt; line-height:1.4;']")
                    text = review_body.text.strip() if review_body else ''
                except NoSuchElementException:
                    text = ''
                reviews.append({
                    'beer name': beer_name,
                    'brewery name': brewery_name,
                    'rating': rating,
                    'text': text
                })
            except Exception as e:
                # Best effort: one malformed review must not abort the whole page.
                print(f"Error while scraping review: {e}")
        return reviews
    finally:
        driver.quit()
# Calls both beerlinks and scraper, then outputs into csv
def main():
    """Scrape the top-rated beers and write their reviews to a CSV file.

    Fetches the beer links from the top-rated listing, scrapes each beer page
    in turn and writes one CSV row per review (columns: 'beer name',
    'brewery name', 'rating', 'text') to 'BeerAdvocate_Reviews.csv'.
    Scraping stops once ``max_reviews`` rows have been written.
    """
    base_url = 'https://www.beeradvocate.com/beer/top-rated/'
    csv_filename = 'BeerAdvocate_Reviews.csv'
    total_reviews = 0
    max_reviews = 5000  # Adjust as needed; set to None to extract all reviews on every page
    # Call get_beer_links
    beer_links = get_beer_links(base_url)
    # CSV
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['beer name', 'brewery name', 'rating', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Scrape reviews for each beer link
        for link in beer_links:
            if max_reviews is not None and total_reviews >= max_reviews:
                break
            reviews = scrape_review(link)
            if max_reviews is not None:
                # Trim the page's reviews so the cap is never overshot; checking
                # only between beers let the last page push past max_reviews.
                reviews = reviews[:max_reviews - total_reviews]
            for review in reviews:
                writer.writerow(review)
                total_reviews += 1
                print(f"Total reviews scraped: {total_reviews}")
    print(f"Scraping completed. Total reviews collected: {total_reviews}")
# Execute script
main()

In [None]:
# Part B, Word frequency analysis and lift analysis

import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations
import ast
import seaborn as sns

# Change filepath here if necessary
df = pd.read_csv(r"BeerAdvocate_New.csv")

# The 'text' column is stored as the string form of a Python list; parse it back into a list
df['text'] = df['text'].apply(ast.literal_eval)

# Word frequency analysis
stopwords = {'note', 'taste', 'head', 'beer', 'like', 'one', 'overall', 'nose', 'carbonation', 'notes'} #unhelpful stop words removed here
all_words = [word for review in df['text'] for word in review if word not in stopwords]
word_freq = Counter(all_words)

# Plot the most common words, I chose to display 20 here
plt.figure(figsize=(10, 5))
# barh draws the first bar at the bottom, so reverse the list to put the most
# frequent word on top. The original computed this reversal but discarded it.
top_words = word_freq.most_common(20)[::-1]
plt.barh(*zip(*top_words))
plt.title('Top 20 Most Common Words in Beer Reviews')
plt.xlabel('Frequency')
plt.show()

# Select top attributes to calculate lift values, I chose to display 12 here
top_attributes = [word for word, _ in word_freq.most_common(12)]

# Function to check if an attribute is present in a review
def has_attribute(review, attribute):
    """Return 1 when ``attribute`` is contained in ``review``, otherwise 0."""
    return 1 if attribute in review else 0

# Create columns for each attribute
for attr in top_attributes:
    # One 0/1 indicator column per top attribute, named after the attribute itself
    df[attr] = df['text'].apply(has_attribute, args=(attr,))

# Lift analysis
def calculate_lift(df, attr1, attr2):
    """Compute the lift between two 0/1 indicator columns of ``df``.

    Lift is the share of rows where both columns equal 1, divided by the
    product of each column's own share (column sum / row count).

    Args:
        df: DataFrame holding the indicator columns.
        attr1: Name of the first indicator column.
        attr2: Name of the second indicator column.

    Returns:
        The lift value, or 0 when the product of the two shares is not positive.
    """
    n_rows = len(df)
    share_first = df[attr1].sum() / n_rows
    share_second = df[attr2].sum() / n_rows
    both_present = (df[attr1] == 1) & (df[attr2] == 1)
    share_both = both_present.sum() / n_rows

    expected_share = share_first * share_second
    if not expected_share > 0:
        return 0
    return share_both / expected_share

# Calculate lift for all pairs of attributes
lift_matrix = pd.DataFrame(index=top_attributes, columns=top_attributes, dtype=float)

# Lift is symmetric, so compute each unordered pair once and mirror it
for first_attr, second_attr in combinations(top_attributes, 2):
    pair_lift = calculate_lift(df, first_attr, second_attr)
    lift_matrix.loc[first_attr, second_attr] = pair_lift
    lift_matrix.loc[second_attr, first_attr] = pair_lift

# The diagonal is fixed at 1.0
for name in top_attributes:
    lift_matrix.loc[name, name] = 1.0

print(lift_matrix)

# Annotated heatmap of the pairwise lift values
plt.figure(figsize=(12, 10))
sns.heatmap(lift_matrix, annot=True, cmap='YlGnBu', fmt='.2f')
plt.title('Lift Analysis of Top 12 Beer Attributes')
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jason\\Downloads\\Cleaned Beer Text.csv'

Part C

In [None]:
# Clean the data
def clean_text(text, stop_words=None):
    """
    Cleans a given text by removing punctuation, converting it to lowercase,
    and tokenizing it, ignoring any stopwords.

    Args:
        text: The raw text to clean.
        stop_words: Optional iterable of words to drop. When omitted, the
            notebook-level ``stopwords`` collection is used.

    Returns:
        The remaining lowercase words joined by single spaces.
    """
    # Imported locally: `string` is not imported anywhere else in the notebook
    # and `re` is only imported by a later cell.
    import re
    import string

    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership checks instead of scanning a list per word
    excluded = set(stop_words)

    # Remove punctuation and convert text to lowercase
    sentence = re.sub(f'[{re.escape(string.punctuation)}]', '', text.lower())

    # Tokenize and remove stopwords
    return ' '.join(word for word in sentence.split() if word not in excluded)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re

# NLTK's English stop words; this rebinds the imported `stopwords` name to a plain list
stopwords = nltk.corpus.stopwords.words('english')

# Load the reviews, drop rows with missing values, then clean every review text
cleaned_beer = pd.read_csv('BeerAdvocate_Reviews2.csv').dropna()
cleaned_beer['text'] = cleaned_beer['text'].apply(lambda raw: clean_text(str(raw)))

# Input the product attributes
attributes = []
file_name = ''  # optional path to a text file with one attribute per line

try:
    with open(file_name, 'r', encoding='utf-8') as f:
        attributes = [l.rstrip() for l in f]
except (OSError, UnicodeDecodeError):
    # No readable attribute file (the default '' path cannot be opened): ask
    # interactively. Only file errors are caught, so Ctrl-C and real bugs
    # are no longer swallowed as they were by the bare `except:`.
    three_atts = input("Enter three attributes comma separated.")
    # Raw string: '\w' in a normal literal is an invalid escape sequence
    attributes = re.findall(r'\w+', three_atts)
    print(attributes)

# Reviews
reviews = cleaned_beer['text'].tolist()

# Add the attribute list as one extra "review" so it shares the vocabulary
reviews.append(' '.join(attributes))

# Create a countvectorizer element on the reviews
vectorizer = CountVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(reviews)

# Scale each feature by its maximum absolute value
scaler = MaxAbsScaler()
X_trans = scaler.fit_transform(X)

# Take the last row (the attribute pseudo-review) as the reference point
attrs_vector = X_trans[-1]

# One vectorised call over all real reviews replaces a cosine_similarity call
# per row; the reference row itself is excluded instead of being scored and
# then sliced away.
similarity_scores = cosine_similarity(X_trans[:-1], attrs_vector).ravel()

# Column name spelling ('similiarity') is kept because later cells read it
cleaned_beer['similiarity'] = similarity_scores

Part D

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# The triple-quoted block below is disabled code: it is a bare string
# expression, so the CSV is NOT reloaded here.
'''# Load the CSV file
file_path = 'BeerAdvocate_Reviews2.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)'''

# Initialize VADER sentiment analyzer
# A single module-level instance, used by get_vader_sentiment for every review.
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment score using VADER
def get_vader_sentiment(review):
    """Return the VADER 'compound' score for ``review``, or None when it is missing (NaN/None)."""
    return None if pd.isna(review) else analyzer.polarity_scores(review)['compound']

# Apply the sentiment analysis and create a new column for the sentiment scores, including NaN for NULL reviews
# Fixed: the original read from `clean_beer`, which is undefined (NameError); the frame is `cleaned_beer`.
# NOTE(review): 'text' was already overwritten with clean_text output (lowercased, punctuation
# and stop words removed), which presumably weakens VADER's cues - consider scoring the raw text.
cleaned_beer['sentiment_score'] = cleaned_beer['text'].apply(get_vader_sentiment)

# Save the updated DataFrame with the sentiment scores (disabled; uncomment to export).
# Kept as real comments: a trailing string literal is echoed as the cell's output.
# output_path = 'BeerAdvocate_Reviews_with_VADER_Sentiment.csv'  # Specify the desired output path
# cleaned_beer.to_csv(output_path, index=False)

ModuleNotFoundError: No module named 'vaderSentiment'

Part E

In [None]:
# Per-review evaluation score: cosine similarity multiplied by sentiment score
similarity_col = cleaned_beer['similiarity']
sentiment_col = cleaned_beer['sentiment_score']
cleaned_beer['evaluation_score'] = similarity_col * sentiment_col

# Mean evaluation score for every beer name
mean_score_by_beer = cleaned_beer.groupby('beer name')['evaluation_score'].mean()
beer_scores = mean_score_by_beer.reset_index()

# Keep the three beers with the highest mean evaluation score
ranked_beers = beer_scores.sort_values(by='evaluation_score', ascending=False)
top_3_beers = ranked_beers.head(3)
print(top_3_beers)

Part F

In [None]:
!python -m spacy download en_core_web_md

import re
import spacy
nlp = spacy.load("en_core_web_md")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

# Read from the working directory, the same relative path the scraper writes to and
# this cell saves back to. The original read '/BeerAdvocate_Reviews.csv' (filesystem root).
df = pd.read_csv('BeerAdvocate_Reviews.csv').dropna()
# Keep only the first line of the scraped beer name
df['beer name'] = df['beer name'].str.split('\n').str[0]

#filtering the comments from special symbols, stop words, and punctuation
non_attribute = ['beer','even','ipa','good','great','near']

df['text_filtered'] = df['text'].apply(lambda x: x.lower())

#removing all punctuation and special symbols
df['text_filtered'] = df['text_filtered'].str.replace(r'[^\w\s]', '', regex=True)

# Replace newline characters with spaces
df['text_filtered'] = df['text_filtered'].str.replace('\n', ' ', regex=False)

# Remove stopwords and non-characteristic words
stop_words = set(stopwords.words('english'))
df['text_filtered'] = df['text_filtered'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words and word not in non_attribute]))

# One list entry per attribute. The original single string "bold, strong, dark" kept the
# commas, so punctuation tokens were part of the reference doc, while the review texts
# it is compared against have had punctuation stripped.
words = ["bold", "strong", "dark"] #################################################THESE ARE YOUR WORDS

# Reference spaCy Doc built from the attribute words
words_doc = nlp(" ".join(words))

def compute_similarity(text):
    """Return the spaCy similarity between the reference ``words_doc`` and ``text``."""
    return words_doc.similarity(nlp(text))

# Score every filtered review against the reference words
spacy_scores = df['text_filtered'].apply(compute_similarity)
df['spacy similarity'] = spacy_scores

# NOTE(review): this writes to the same file name the scraper cell produces, now with the
# extra 'text_filtered' and 'spacy similarity' columns - confirm the overwrite is intended.
df.to_csv('BeerAdvocate_Reviews.csv', index=False)

Part G

In [None]:
# Per-review evaluation score: cosine similarity multiplied by sentiment score
similarity_col = cleaned_beer['similiarity']
sentiment_col = cleaned_beer['sentiment_score']
cleaned_beer['evaluation_score'] = similarity_col * sentiment_col

# Mean evaluation score for every beer name
mean_score_by_beer = cleaned_beer.groupby('beer name')['evaluation_score'].mean()
beer_scores = mean_score_by_beer.reset_index()

# Keep the three beers with the highest mean evaluation score
ranked_beers = beer_scores.sort_values(by='evaluation_score', ascending=False)
top_3_beers = ranked_beers.head(3)
print(top_3_beers)

In [None]:
# Names of the three beers with the best evaluation score
top_3_beer_names = top_3_beers['beer name']

# All review rows that belong to those beers
is_top_3 = cleaned_beer['beer name'].isin(top_3_beer_names)
top_3_beer_data = cleaned_beer[is_top_3]

# Mean user rating per beer
mean_rating_top_3 = top_3_beer_data.groupby('beer name')['rating'].mean()
average_ratings_top_3 = mean_rating_top_3.reset_index()

print(average_ratings_top_3) # Gives the average rating for each of the highest similarity/sentiment beers

In [None]:
# Mean user rating for every beer, then the three highest
mean_rating_by_beer = cleaned_beer.groupby('beer name')['rating'].mean()
beer_ratings = mean_rating_by_beer.reset_index()
ranked_by_rating = beer_ratings.sort_values(by='rating', ascending=False)
top_3_ratings = ranked_by_rating.head(3)
print(top_3_ratings) # Gives the average rating for each of the highest Rated beers

In [None]:
# Attach each review's spaCy similarity to cleaned_beer, matching rows by index
# NOTE(review): assumes both frames share the same row index - confirm.
joined_clean_beer = cleaned_beer.merge(df['spacy similarity'], left_index=True, right_index=True)
joined_clean_beer['spacy_evaluation_score'] = (
    joined_clean_beer['spacy similarity'] * joined_clean_beer['sentiment_score']
)

# Per-beer means, showing the three beers with the best spaCy-based evaluation score
score_columns = ['spacy_evaluation_score', 'evaluation_score', 'rating']
mean_scores_by_beer = joined_clean_beer.groupby('beer name')[score_columns].mean()
mean_scores_by_beer.sort_values(by='spacy_evaluation_score', ascending=False).head(3)

In [None]:
# For the highest rated beers, what were their spacy similarity scores?
top3_ratings_beers = top_3_ratings['beer name'].tolist()

# `cleaned_beer_spacy` is not defined anywhere in the visible notebook (NameError).
# Build the per-beer means here, indexed by beer name, so the lookup below works.
# NOTE(review): the column choice is inferred from the comment above and the merge cell - confirm.
cleaned_beer_spacy = (
    joined_clean_beer
    .groupby('beer name')[['spacy similarity', 'spacy_evaluation_score', 'evaluation_score', 'rating']]
    .mean()
)

# reindex (rather than .loc) gives a NaN row instead of a KeyError when a
# top-rated beer did not survive the index-based merge.
cleaned_beer_spacy.reindex(top3_ratings_beers)

Part H

In [None]:
!python -m spacy download en_core_web_md
import re
import spacy
nlp = spacy.load("en_core_web_md")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

# Load the reviews, drop incomplete rows and keep only the first line of each beer name
df = pd.read_csv('BeerAdvocate_New.csv').dropna()
df['beer name'] = df['beer name'].str.split('\n').str[0]

# Words that carry no information about a beer's characteristics
non_attribute = ['beer','even','ipa','good','great','near', 'one', 'overall', 'i', 've', '2022', 'taste', 'bottle', 'bit', 'abv']

# English stop words from NLTK
stop_words = set(stopwords.words('english'))

# Lowercase -> strip punctuation/special symbols -> turn newlines into spaces
lowered_text = df['text'].apply(lambda raw: raw.lower())
without_symbols = lowered_text.str.replace(r'[^\w\s]', '', regex=True)
single_line_text = without_symbols.str.replace('\n', ' ', regex=False)


def _drop_uninformative_words(review_text):
    """Remove stop words and non-attribute words from one review string."""
    kept_words = []
    for token in review_text.split():
        if token.lower() in stop_words or token in non_attribute:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)


df['text_filtered'] = single_line_text.apply(_drop_uninformative_words)

# Word counts for every beer
# Step 1: one row per beer, with all of its filtered reviews joined into a single string
reviews_by_beer = df.groupby('beer name')['text_filtered']
grouped = reviews_by_beer.apply(lambda beer_reviews: ' '.join(beer_reviews)).reset_index()

# Step 2: Define a function to count the word frequencies for each beer
def count_words(text):
    """Return a dict mapping each whitespace-separated word of ``text`` to its count, most frequent first."""
    return dict(Counter(text.split()).most_common())

# Step 3: word frequencies for each beer's combined review text
word_counts_per_beer = grouped['text_filtered'].apply(count_words)
grouped['word_counts'] = word_counts_per_beer

# Step 4: keep just the beer name and its sorted word counts
beer_word_counts_df = grouped[['beer name', 'word_counts']]

# Step 1: one row per beer, holding the list of its individual filtered comments
# (this replaces the `grouped` frame built for the word counts above)
comments_by_beer = df.groupby('beer name')['text_filtered']
grouped = comments_by_beer.apply(list).reset_index()

# Step 2: Define a function to calculate the percentage of comments containing each word
def word_percentage_in_comments(comments, top_n=5):
    """Find the words that appear in the largest share of ``comments``.

    Each word is counted at most once per comment, so the value is the
    percentage of comments that contain the word.

    Args:
        comments: Sequence of comment strings (words separated by whitespace).
        top_n: How many words to keep. Defaults to 5.

    Returns:
        A dict mapping word -> percentage of comments containing it, ordered
        by percentage (highest first) and alphabetically among ties, limited
        to ``top_n`` entries. Empty when ``comments`` is empty.
    """
    total_comments = len(comments)  # Total number of comments for the beer
    word_in_comments = Counter()  # To count how many comments contain each word

    for comment in comments:
        # set() makes every word count once per comment
        word_in_comments.update(set(comment.split()))

    # Calculate the percentage of comments that include each word
    word_percentages = {word: (count / total_comments) * 100 for word, count in word_in_comments.items()}

    # Break ties alphabetically. Ranking by percentage alone left tied words in
    # set-iteration order, which changes between interpreter runs, so the
    # selected top words were not reproducible.
    ranked = sorted(word_percentages.items(), key=lambda item: (-item[1], item[0]))

    return dict(ranked[:top_n])

# Step 3: top words (by share of comments mentioning them) for every beer
comment_lists = grouped['text_filtered']
grouped['word_percentages'] = comment_lists.apply(word_percentage_in_comments)

# Step 4: number of comments collected for every beer
grouped['comment_count'] = comment_lists.map(len)


# Step 5: Create a function to get the composite vector for a beer based on its word_percentages
def get_beer_doc(word_percentages):
    """Build a spaCy Doc from the keys of ``word_percentages``, joined by spaces."""
    return nlp(' '.join(word_percentages.keys()))

# Step 6: one spaCy Doc per beer, built from that beer's top words
beer_docs = grouped['word_percentages'].apply(get_beer_doc)
grouped['beer_doc'] = beer_docs

# Step 7: Create a function to get the vector for the user's input
def get_user_input_doc(attributes):
    """Return a spaCy Doc for the user's attribute words joined by spaces."""
    attribute_text = ' '.join(attributes)
    return nlp(attribute_text)

######################################################################################
user_attributes = ['orange', 'head', 'carbonation', 'light', 'bitterness'] #attributes
######################################################################################

# Step 8: spaCy Doc for the attributes requested by the user
user_doc = get_user_input_doc(user_attributes)


def _similarity_to_user(beer_doc):
    """Similarity between one beer's Doc and the user's Doc (spaCy built-in method)."""
    return beer_doc.similarity(user_doc)


# Step 9: score every beer against the user's attributes
grouped['similarity_score'] = grouped['beer_doc'].apply(_similarity_to_user)

# Step 10: most similar beers first
recommended_beers = grouped.sort_values('similarity_score', ascending=False)

# Beers to look up in the ranked recommendations
names = [
    'Emperor Julius',
    'Swish',
    'I Let My Tape Rock',
    'Trappistes Rochefort 10',
    'Coffee Cinnamon B-Bomb',
    'Pseudo Sue - Double Dry-Hopped',
    'Mocha Wednesday',
    'All That Is And All That Ever Will Be',
    'Notorious Triple IPA',
    'Vanilla Bean Assassin',
    'Parabola',
]

# Show name, comment count, top-word Doc and similarity score for those beers
is_selected = recommended_beers['beer name'].isin(names)
recommended_beers.loc[is_selected, ["beer name", "comment_count", "beer_doc", "similarity_score"]]

