In [10]:
# web scraping libraries
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# text preprocessing libraries 
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# other
import pandas as pd
import os

In [11]:
# where the high-level magic happens
def url_scraper_main(url, movie):
    """Scrape one review page and return its reviews as a DataFrame.

    Parameters
    ----------
    url : address handed to ``load_html``.
    movie : label handed to ``soup_extractor`` alongside the parsed page.

    Returns
    -------
    Whatever ``soup_extractor`` returns for the parsed page.
    """
    # Two-stage pipeline: render the page, then pull the reviews out of it.
    return soup_extractor(load_html(url), movie)


In [12]:

def load_html(url):
    """Render ``url`` in Chrome, expand it via "Load More", and return the parsed HTML.

    The "Load More" button is clicked repeatedly; after each click the function
    waits for the review element at position ``num_reviews`` to appear and then
    raises that position by ``num_reviews_per_load``. The loop stops on the
    first exception of any kind (for example a wait timing out after 10
    seconds), at which point "Finished Loading" is printed and whatever is
    currently rendered is parsed.

    Parameters
    ----------
    url : page to open.

    Returns
    -------
    BeautifulSoup built from the driver's page source with ``'html.parser'``.

    Raises
    ------
    Any exception raised while starting the browser, loading ``url`` or
    parsing the page source. The browser is closed in every case.
    """
    # NOTE(review): 30/20 presumably mirror the site's paging (first expanded
    # page reaches at least 30 rows, each click adds 20) -- confirm against the site.
    num_reviews = 30
    num_reviews_per_load = 20

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        try:
            while True:
                # Wait until the "Load More" button is present
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="reviews"]/div[2]/rt-button'))
                )
                # Click the "Load More" button
                load_more_button.click()
                # Wait for the next batch of review rows to be attached to the page
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, f'//*[@id="reviews"]/div[1]/div[{num_reviews}]'))
                )
                num_reviews += num_reviews_per_load
        except Exception:
            # Deliberately broad: any failure here (button gone, timeout, click
            # rejected) is treated as "nothing more to load", not as an error.
            print("Finished Loading")

        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even if loading or parsing failed;
        # otherwise every failed call leaves a Chrome process behind.
        driver.quit()

In [13]:
# Extract sentiment and review 

def soup_extractor(soup, movie):
    """Turn the review rows of a parsed page into a DataFrame.

    Every ``<div class="review-row">`` in ``soup`` yields one output row:

    * ``movie``       -- the ``movie`` argument, repeated on each row
    * ``sentiment``   -- the ``sentiment`` attribute of the row's
      ``<score-icon-critics>`` element
    * ``review_text`` -- the stripped text of the row's ``<p class="review-text">``

    A row that lacks either element is kept, with ``None`` in the missing
    field, instead of raising ``AttributeError`` and discarding the whole
    scrape. Callers that need complete rows can ``dropna()`` the result.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).
    movie : label stored in the ``movie`` column.

    Returns
    -------
    pandas.DataFrame with columns ``["movie", "sentiment", "review_text"]``;
    empty (but with those columns) when no review rows are found.
    """
    records = []

    for review in soup.find_all('div', class_='review-row'):
        text_tag = review.find('p', class_='review-text')
        icon_tag = review.find('score-icon-critics')

        # find() returns None when the element is absent, so guard before use.
        text = text_tag.text.strip() if text_tag is not None else None
        sentiment = icon_tag.get('sentiment') if icon_tag is not None else None

        records.append((movie, sentiment, text))

    return pd.DataFrame(records, columns=["movie", "sentiment", "review_text"])


In [14]:
# TODO: clean up the words, then consider removing the most frequent words encountered/associated with a movie.
# Open question: what should the minimum percentage (cut-off) be?

In [15]:
def clean(sample_text):
    """Normalize a review string into a list of lemmatized, non-stop-word tokens.

    Steps, in order: lowercase; delete punctuation; tokenize with
    ``word_tokenize``; drop English stop words; lemmatize with
    ``WordNetLemmatizer``; drop any lemma that is itself a stop word.

    Parameters
    ----------
    sample_text : str to clean.

    Returns
    -------
    list of str tokens.
    """
    # string.punctuation is ASCII-only, but the scraped reviews also contain
    # typographic punctuation (curly quotes, en/em dashes, ellipsis), which
    # would otherwise stay glued to the tokens.
    typographic = "\u2018\u2019\u201c\u201d\u2013\u2014\u2026"
    strip_table = str.maketrans('', '', string.punctuation + typographic)
    sample_text = sample_text.lower().translate(strip_table)
    # Consider using regex to address back to back punctuation, like day-to-day

    tokens = word_tokenize(sample_text)

    #consider changing this later, to take care of bi grams, where words such as 'not' may be important
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Filter stop words BEFORE lemmatizing: the default (noun) lemmatizer
    # rewrites some of them ("was" -> "wa", "has" -> "ha"), and the rewritten
    # forms no longer match the stop-word list.
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Filter once more so a lemma that is itself a stop word is still removed,
    # as it was when filtering happened only after lemmatization.
    return [word for word in lemmas if word not in stop_words]



In [16]:
# (movie label, reviews-page URL) pairs to scrape.
movie_info = [("aquaman_2018", "https://www.rottentomatoes.com/m/aquaman_2018/reviews"),
              ("morbius", "https://www.rottentomatoes.com/m/morbius/reviews"),
              ("harry_potter_and_the_sorcerers_stone", "https://www.rottentomatoes.com/m/harry_potter_and_the_sorcerers_stone/reviews"),
              ("captain_marvel", "https://www.rottentomatoes.com/m/captain_marvel/reviews"),
              ("fifty_shades_of_grey", "https://www.rottentomatoes.com/m/fifty_shades_of_grey/reviews")]

# Empty seed frame. NOTE(review): its "review" column does not match the
# "review_text" column produced by soup_extractor, so the result carries an
# extra all-empty "review" column; it is kept because the export cell drops it.
tests_df = pd.DataFrame(columns=["movie","sentiment","review"])

# Scrape every movie first, then concatenate once: calling pd.concat inside
# the loop re-copies all previously collected rows on each iteration.
scraped_frames = []
for movie, url in movie_info:
    scraped_frames.append(url_scraper_main(url, movie))

tests_df = pd.concat([tests_df, *scraped_frames], axis=0)

Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading


In [18]:
# "<parent of the working directory>/data/", built component by component so
# separators are consistent; the trailing "" keeps the trailing separator.
path = os.path.join(os.path.normpath(os.path.join(os.getcwd(), os.pardir)), "data", "")

print(os.listdir(path))

# errors="ignore" keeps this cell re-runnable: once "review" has been dropped,
# running the cell again would otherwise raise KeyError.
tests_df = tests_df.drop(columns=['review'], errors='ignore')
tests_df.to_csv(os.path.join(path, 'test', 'test.csv'), header=True)

['cleaned', 'raw', 'test']


In [19]:
# Display the combined reviews frame (long frames are rendered truncated: first and last rows only).
tests_df

Unnamed: 0,movie,sentiment,review_text
0,aquaman_2018,POSITIVE,It’s a comic-fantasy that gets a passing grade...
1,aquaman_2018,POSITIVE,Aquaman has some truly original imagery and a ...
2,aquaman_2018,POSITIVE,"Aquaman is the fun action-packed, adventure mo..."
3,aquaman_2018,POSITIVE,The production and attention to detail here is...
4,aquaman_2018,POSITIVE,“Aquaman” is an oddly satisfying blast. It’s n...
...,...,...,...
275,fifty_shades_of_grey,POSITIVE,"Glossy, well cast, and a consistent hoot until..."
276,fifty_shades_of_grey,POSITIVE,"Accept this genuinely sexy, somewhat silly, re..."
277,fifty_shades_of_grey,POSITIVE,Fifty Shades of Grey is a sex-positive but hop...
278,fifty_shades_of_grey,POSITIVE,"Aiming to please, the filmmakers submit withou..."
