# Main function of this code:

### 1. Scrape all reviews(in total 10777 piecies, this mumber might increase as time goes by) of the movie "The Shawnshank Redemption" from IMDb website and write them into a csv file(can be found in the file). Due to the loss of data, the actual number of reviews scraped is 10754 which is considerd enough for the later analysis. 

### 2. Scraped data includes: 
* review title, 
* spoiler warning tag, 
* content,
* rate(how many stars each review gives to the movie) , 
* vote(how many people vote for each review), 
* and date(year).

In [1]:
# import pakages
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.common import NoSuchElementException
import csv

# Part 1 Load all reviews

### Load reviews by scrolling to the bottom and clicking the "Load More" button on the user review page.

### !!! Warning: The Shawshank Redemption has more than 10000 reviews), it may take around 40 minutes to run this code. To save time, you may consider the other two test cases: 
*https://www.imdb.com/title/tt6718170/reviews/?ref_=tt_ov_rt  (around 671 reviews)
*https://www.imdb.com/title/tt14403178/reviews/?ref_=tt_ov_rt  (around 218 reviews)
*https://www.imdb.com/title/tt0111161/reviews?ref_=tt_urv  (our user case-Shawshank Redemption: 10777 reviews)

In [2]:
# open the url using webdriver
driver = webdriver.Firefox()
driver.get("https://www.imdb.com/title/tt0111161/reviews?ref_=tt_urv")
print("Loading all reviews, please wait ...")

# funtion: scroll to the bottom of one page
def scroll_to_bottom():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Whlie loop: scroll to the bottom of each page and click "Load More" button until the there is no more buttons
while True:  
    try:
        if 'firefox' in driver.capabilities['browserName']:  # Check if browser name is Firefox
            scroll_to_bottom()  # Call a function to scroll the page to the bottom

        load_more = driver.find_element(By.ID, "load-more-trigger")  # Find the "Load More" element by ID
        actions = ActionChains(driver)  # Create an instance of ActionChains to perform actions on the webpage
        actions.move_to_element(load_more)  # Move the mouse cursor to the "Load More" element
        actions.click()  # Click the "Load More" element
        actions.perform()  # Perform the actions (move and click)
        time.sleep(6)  # Pause the script for 3 seconds for page to load

    except Exception as e:  # Catch any exceptions
        print("Encountered an exception: " + str(e))  # Print the exception message
        break  # Exit the loop, stop the script


print("All reviews loaded successfully!")
time.sleep(15)

# Find all elements with class name "lister-item-content" to get all reviews
all_reviews = driver.find_elements(By.CLASS_NAME, "lister-item-content")

# Print the total amount of reviews found
print(f"The total amount of reviews is {len(all_reviews)}")

# Pause the script for 10 seconds to allow the page to fully load
time.sleep(15)

Loading all reviews, please wait ...
Encountered an exception: Message: Origin element is not displayed
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:182:5
MoveTargetOutOfBoundsError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:372:5
getOriginCoordinates@chrome://remote/content/marionette/action.sys.mjs:486:15
getTargetCoordinates@chrome://remote/content/marionette/action.sys.mjs:430:25
dispatch@chrome://remote/content/marionette/action.sys.mjs:1004:32
dispatch/pendingEvents<@chrome://remote/content/marionette/action.sys.mjs:1827:14
dispatch@chrome://remote/content/marionette/action.sys.mjs:1826:39
dispatch/chainEvents<@chrome://remote/content/marionette/action.sys.mjs:1753:27
dispatch@chrome://remote/content/marionette/action.sys.mjs:1755:7
performActions@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:472:23
receiveMessage@chrome://remote/conten

# Part 2 Get all tags and write them into csv file

### We have load all the data, now we start to extract the specified elements from the page.

In [3]:
# Print a message indicating that writing reviews to a CSV file is starting
print("Writing all of the reviews to a CSV file...")

# Open a CSV file in write mode with UTF-8 encoding and no newlines
with open('full_reviews.csv', mode='w', encoding='utf-8', newline='') as file:
    # Create a CSV writer object
    writer = csv.writer(file)

    # Write the header row to the CSV file
    writer.writerow(['title', 'warning', 'rate', 'content', 'vote', 'year'])

    # Loop through each comment element and extract relevant information
    for review in all_reviews:
        """2.1 get warning tag for each review"""
        # Check if the comment has a "spoiler-warning" class, indicating a warning tag
        warning = False
        try:
            review.find_element(By.CLASS_NAME, "spoiler-warning")
            warning = True
        except NoSuchElementException:
            pass

        # Some long comments are shrinking, we need to click the expand button to make it expanded,
        # so we can extract the comment.
        try:
            review.find_element(By.CLASS_NAME, "ipl-expander").click()
        except NoSuchElementException:
            pass

        """2.2 get title for each review"""
        # Extract the title of the comment from the "title" class element
        title = review.find_element(By.CLASS_NAME, "title").text

        """2.3 get content for each review"""
        # Extract the content of the comment from the "content" class element
        content = review.find_element(By.CLASS_NAME, "content").find_element(By.TAG_NAME, "div").text
        
        """2.4 get rate for each review"""
        # Extract the rate of the comment from the "rating-other-user-rating" class element
        rate = -1
        try:
            rate = review.find_element(By.CLASS_NAME, "rating-other-user-rating").find_element(By.TAG_NAME,
                                                                                                "span").text
        except NoSuchElementException:
            pass

        """2.5 get date for each review"""
        # Extract the year from the "review-date" class element
        year = review.find_element(By.CLASS_NAME, "review-date").text.split(" ", 2)[2]

        """2.6 get vote for each review"""
        # Extract the vote count from the "actions.text-muted" class element
        vote = -1
        try:
            vote = review.find_element(By.CLASS_NAME, "actions.text-muted").text.split()[0]
        except NoSuchElementException:
            pass

        # Write the extracted information as a row to the CSV file
        writer.writerow([title, warning, rate, content, vote, year])

# Print a message indicating that writing comments to the CSV file is finished
print("Finished!")

Writing all of the reviews to a CSV file...
Finished!
