In [22]:
# web scraping libraries
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# text preprocessing libraries 
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# other
import pandas as pd
import os

In [23]:
# Raw inputs live in "<parent of the working directory>/data/raw/".
# `path` keeps its trailing separator because file names are appended to it
# with plain string concatenation.
parent_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)
path = parent_dir + "/data/raw/"

# Show which raw files are available before loading anything.
print(os.listdir(path))

# Table of movies to scrape; the scraping loop reads its `title` and `link` columns.
movies = pd.read_csv(path + 'movies.csv')

movies

['movies.csv']


Unnamed: 0.1,Unnamed: 0,title,link
0,0,L.A. Confidential,https://www.rottentomatoes.com/m/la_confidenti...
1,1,The Godfather,https://www.rottentomatoes.com/m/the_godfather...
2,2,Casablanca,https://www.rottentomatoes.com/m/1003707-casab...
3,3,Seven Samurai,https://www.rottentomatoes.com/m/seven_samurai...
4,4,Parasite,https://www.rottentomatoes.com/m/parasite_2019...
...,...,...,...
394,394,In the Name of the King: A Dungeon Siege Tale,https://www.rottentomatoes.com/m/in-the-name-o...
395,395,The Cold Light of Day,https://www.rottentomatoes.com/m/the_cold_ligh...
396,396,A Low Down Dirty Shame,https://www.rottentomatoes.com/m/low_down_dirt...
397,397,Just Getting Started,https://www.rottentomatoes.com/m/just_getting_...


In [24]:
# where the high-level magic happens
def url_scraper_main(url, movie):
    """Scrape one review page and return its reviews as a DataFrame.

    Args:
        url: Address of the review page; handed to ``load_html``.
        movie: Movie label; handed through to ``soup_extractor``.

    Returns:
        The value produced by ``soup_extractor`` for the loaded page.
    """
    return soup_extractor(load_html(url), movie)

#print(url_scraper_main("https://www.rottentomatoes.com/m/aquaman_2018/reviews", "aqua_man"))

In [25]:
#url = "https://www.rottentomatoes.com/m/aquaman_2018/reviews"

def load_html(url):
    """Open ``url`` in Chrome, expand the review list, and return the parsed page.

    The "Load More" button is clicked repeatedly until one of the waits or the
    click itself fails; that failure is treated as the signal that there is
    nothing further to load.

    Args:
        url: Address of the page to open.

    Returns:
        BeautifulSoup: The page source at the moment loading stopped, parsed
        with ``html.parser``.
    """
    driver = webdriver.Chrome()
    # try/finally guarantees the browser is shut down even when navigation,
    # reading the page source, or parsing raises; without it every failure
    # leaves a Chrome process running.
    try:
        driver.get(url)

        # XPath position of the review element waited for after each click.
        # NOTE(review): 30 and 20 presumably mirror the site's batch sizes --
        # confirm against the live page.
        num_reviews = 30
        num_reviews_per_load = 20
        try:
            while True:
                # Wait until the "Load More" button is present
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="reviews"]/div[2]/rt-button'))
                )
                # Click the "Load More" button
                load_more_button.click()
                # Wait for the next batch of reviews to appear
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, f'//*[@id="reviews"]/div[1]/div[{num_reviews}]'))
                )
                num_reviews += num_reviews_per_load
        except Exception:
            # Deliberately best-effort: a timeout (button gone, or no new
            # reviews appeared) or a failed click ends the loop, and whatever
            # has been loaded so far is used.
            print("Finished Loading")

        response = driver.page_source
        return BeautifulSoup(response, 'html.parser')
    finally:
        driver.quit()

In [26]:
# Extract sentiment and review 

def soup_extractor(soup, movie):
    """Collect the cleaned text and sentiment of every review row in ``soup``.

    Args:
        soup: Parsed page; searched for ``<div class="review-row">`` elements.
        movie: Label stored in the ``movie`` column of every returned row.

    Returns:
        pandas.DataFrame: Columns ``movie``, ``sentiment`` and ``review_text``.
        ``review_text`` holds the output of ``clean`` for the row's text;
        ``sentiment`` holds the ``sentiment`` attribute of the row's
        ``score-icon-critics`` element, or None when that element is absent.
        Rows that have no ``<p class="review-text">`` element are skipped.
    """
    reviews_list = []

    for review in soup.find_all('div', class_='review-row'):
        text_tag = review.find('p', class_='review-text')
        if text_tag is None:
            # find() returns None when the element is missing; skip the row
            # rather than letting one malformed entry abort the whole scrape.
            continue
        text = clean(text_tag.text.strip())

        icon = review.find('score-icon-critics')
        # Keep the review even without a score icon; the sentiment is then
        # recorded as missing.
        sentiment = icon.get('sentiment') if icon is not None else None

        reviews_list.append((movie, sentiment, text))

    return pd.DataFrame(reviews_list, columns=["movie","sentiment","review_text"])


In [27]:
# TODO: after cleaning the tokens, remove the words that occur most frequently (or that are tied to a specific movie)?
# Open question: what frequency cutoff, as a percentage of all tokens, should be used?

In [28]:
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE["english"]


def clean(sample_text):
    """Turn a raw review string into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase, delete every character in ``string.punctuation``,
    tokenize, drop English stop words, lemmatize, then drop any lemma that is
    itself a stop word.

    Args:
        sample_text: Review text as a single string.

    Returns:
        list[str]: The remaining lemmas, in their original order.
    """
    sample_text = sample_text.lower()
    # Punctuation is deleted, not replaced by a space, so hyphenated words are
    # fused ("day-to-day" -> "daytoday"). Consider a regex if that matters.
    sample_text = sample_text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(sample_text)
    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()

    # Stop words are filtered BEFORE lemmatizing: the lemmatizer (noun mode by
    # default) mangles some of them into strings that are no longer in the
    # stop list (e.g. "was" -> "wa", "has" -> "ha"), which would then leak
    # into the output. The second check keeps the earlier guarantee that no
    # returned lemma is a stop word.
    # Consider changing this later for bigrams, where words such as 'not' may
    # be important.
    cleaned = []
    for word in tokens:
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        if lemma not in stop_words:
            cleaned.append(lemma)

    return cleaned



In [29]:
# Seed the list with an empty frame carrying the original placeholder columns
# so the combined result keeps the same column layout as before.
# NOTE(review): 'review' never receives data -- soup_extractor names its text
# column "review_text" -- and the cell that saves the CSV drops 'review' by
# name, so it is kept here on purpose.
review_frames = [pd.DataFrame(columns=["movie","sentiment","review"])]

try:
    for row in movies.itertuples(name='Pandas'):
        movie = row.title
        url = row.link
        review_df = url_scraper_main(url, movie)
        review_frames.append(review_df)
finally:
    # Concatenate once at the end instead of once per movie: re-copying the
    # growing frame on every iteration is quadratic. Doing it in `finally`
    # keeps the reviews gathered so far available in `reviews_df` even if a
    # scrape fails or the run is interrupted.
    reviews_df = pd.concat(review_frames, axis=0)

Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loading
Finished Loadi

In [37]:
# Save the scraped reviews next to the raw data, leaving out the 'review' column.
# drop() returns a new frame, so `reviews_df` itself is left untouched.
# NOTE(review): the row index is written as an unnamed first column -- confirm
# that downstream readers expect it.
back_up = reviews_df.drop(columns=['review'])
back_up.to_csv(path + 'movie_reviews.csv', header=True)

In [30]:
# Disabled draft, kept as a bare string literal so that none of it executes
# (the cell only echoes the string). The draft tallies every token found in
# reviews_df["review_text"], sorts the tallies in descending order, and
# rescales them to percentages of the total token count.
'''
total_words = 0
word_counter = pd.Series()

for review in reviews_df["review_text"]:
    for word in review:
        total_words += 1
        if word in word_counter.keys():
            word_counter[word] += 1
        else:
            word_counter[word] = 1

word_counter = word_counter.sort_values(ascending=False)
word_counter = 100*word_counter/total_words
'''

#'''

'\ntotal_words = 0\nword_counter = pd.Series()\n\nfor review in reviews_df["review_text"]:\n    for word in review:\n        total_words += 1\n        if word in word_counter.keys():\n            word_counter[word] += 1\n        else:\n            word_counter[word] = 1\n\nword_counter = word_counter.sort_values(ascending=False)\nword_counter = 100*word_counter/total_words\n'

In [31]:
# A frequency of about 1.2% of all tokens looks like a reasonable cutoff for the most common words.
# TODO: possibly also cut off words that appear too infrequently.

In [32]:
# Disabled draft, kept as a bare string literal so that none of it executes
# (the cell only echoes the string). It sketches a table with one row per word
# and POSITIVE / NEGATIVE counters.
# NOTE(review): the draft is unfinished -- vectorise_rating takes `row` but
# loops over `sample_text`, which it never defines, and `rate` is always
# "POSITIVE". Finish it before enabling.
'''Word_Ratings = pd.DataFrame(columns=["POSITIVE","NEGATIVE"])

def vectorise_rating(row):

    rate = "POSITIVE"

    for word in sample_text: 
        if word in list(Word_Ratings.index):
            Word_Ratings[rate].loc[word] += 1
        else:
            Word_Ratings.loc[word] = [0,0]
            Word_Ratings[rate].loc[word] += 1


Word_Ratings
'''
#Score is (Positive - Negative) / (Positive + Negative + 1)

'Word_Ratings = pd.DataFrame(columns=["POSITIVE","NEGATIVE"])\n\ndef vectorise_rating(row):\n\n    rate = "POSITIVE"\n\n    for word in sample_text: \n        if word in list(Word_Ratings.index):\n            Word_Ratings[rate].loc[word] += 1\n        else:\n            Word_Ratings.loc[word] = [0,0]\n            Word_Ratings[rate].loc[word] += 1\n\n\nWord_Ratings\n'