In [23]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Location of the ChromeDriver binary. Set the CHROMEDRIVER_PATH environment
# variable to point at a different binary; the default below is the original
# hard-coded Windows path and only exists on the machine it was written on.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r'C:\Users\User\Downloads\chromedriver-win32\chromedriver-win32\chromedriver.exe',
)

# Use the local binary when it is actually present; otherwise fall back to
# webdriver_manager so the cell still runs on other machines.
if os.path.isfile(chrome_driver_path):
    service = Service(executable_path=chrome_driver_path)
else:
    service = Service(ChromeDriverManager().install())

# Initialize the Chrome WebDriver with the service
driver = webdriver.Chrome(service=service)


In [21]:


# URLs of the IMDb movie reviews pages
movie_urls = {
    "The Matrix": "https://www.imdb.com/title/tt0133093/reviews?ref_=tt_urv",
    "The Notebook": "https://www.imdb.com/title/tt0332280/reviews?ref_=tt_urv",
    "The Social Network": "https://www.imdb.com/title/tt1285016/reviews?ref_=tt_urv",
    "Notting Hill": "https://www.imdb.com/title/tt0125439/reviews?ref_=tt_urv"
}

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# (movie name, review text) pairs collected across all movies. The DataFrame
# is built once at the end instead of being grown with pd.concat per movie.
review_rows = []

# try/finally guarantees the browser is closed even if a page load, a click,
# or the scraping itself raises part-way through.
try:
    for movie_name, url in movie_urls.items():
        driver.get(url)
        time.sleep(5)  # Wait for the initial reviews to load

        # Keep clicking "Load More" until the button stops becoming clickable
        # within the timeout, which is taken to mean everything is loaded.
        while True:
            try:
                load_more_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "ipl-load-more__button"))
                )
            except TimeoutException:
                print(f"All reviews loaded for {movie_name}")
                break
            load_more_button.click()
            time.sleep(3)  # Wait for more reviews to load

        # Elements carrying both the "text" and "show-more__control" classes.
        # An explicit CSS selector is used because By.CLASS_NAME is meant for
        # a single class name, not a compound one.
        review_elements = driver.find_elements(By.CSS_SELECTOR, ".text.show-more__control")
        review_rows.extend((movie_name, review.text) for review in review_elements)
finally:
    # Clean up by closing the WebDriver
    driver.quit()

# DataFrame of all scraped reviews (empty but with the right columns if nothing was found)
reviews_df = pd.DataFrame(review_rows, columns=['Movie Name', 'Review'])

# Save the reviews to a CSV file
reviews_df.to_csv("imdb_reviews.csv", index=False)

print("Scraping complete. Reviews saved to imdb_reviews.csv.")


All reviews loaded for The Matrix
All reviews loaded for The Notebook
All reviews loaded for The Social Network
All reviews loaded for Notting Hill
Scraping complete. Reviews saved to imdb_reviews.csv.


In [8]:
# Reload the scraped reviews from disk
reviews_df = pd.read_csv("imdb_reviews.csv")

# Rows per movie, and rows overall
review_counts = reviews_df.groupby('Movie Name').size()
total_reviews = len(reviews_df)

# Report the per-movie breakdown first, then the grand total
print("Number of reviews scraped for each movie:")
print(review_counts)
print(f"\nTotal number of reviews scraped: {total_reviews}")


Number of reviews scraped for each movie:
Movie Name
Notting Hill           806
The Matrix            4971
The Notebook          1419
The Social Network    1073
dtype: int64

Total number of reviews scraped: 8269


In [9]:

# Read the scraped reviews and discard rows whose 'Review' is missing,
# so that they cannot contribute to the counts below
reviews_df = pd.read_csv('imdb_reviews.csv').dropna(subset=['Review'])

# Number of distinct review texts per movie
unique_review_counts = reviews_df.groupby('Movie Name')['Review'].nunique()

# Show the per-movie distinct counts
print(unique_review_counts)


Movie Name
Notting Hill           722
The Matrix            4419
The Notebook          1085
The Social Network     840
Name: Review, dtype: int64


In [39]:
# Rows whose review text occurs more than once; keep=False marks every copy,
# not just the later repeats. The match is on 'Review' only, across all movies.
is_repeated_review = reviews_df.duplicated(subset=['Review'], keep=False)
reviews_df.loc[is_repeated_review]


Unnamed: 0,Movie Name,Review
17,The Matrix,"The Matrix...when I first heard about it, I ex..."
24,The Matrix,The Wachowski brothers really did excel themse...
45,The Matrix,It's been a while since a movie has generated ...
55,The Matrix,The Wachowski Brothers vision of a possible fu...
229,The Matrix,Right there with Seven and Silence of the Lamb...
1602,The Matrix,Is it possible that the whole world is simulat...
1603,The Matrix,Is it possible that the whole world is simulat...
2407,The Matrix,The Wachowski Brothers vision of a possible fu...
2597,The Matrix,The Wachowski brothers really did excel themse...
2766,The Matrix,It's been a while since a movie has generated ...


In [47]:
# Read the cleaned review file.
# NOTE(review): no visible cell writes 'imdb_reviews_cleaned.csv' — confirm
# where it is produced so the notebook can be re-run from a fresh kernel.
dataset_reviews = pd.read_csv('imdb_reviews_cleaned.csv')

# Flag every row whose review text appears more than once (all copies are marked)
duplicate_reviews = dataset_reviews.duplicated(subset=['Review'], keep=False)
duplicate_count = duplicate_reviews.sum()
print(f"Number of duplicate reviews: {duplicate_count}")

# List the flagged rows only when there is at least one
if duplicate_count > 0:
    print("Duplicate Reviews:")
    print(dataset_reviews[duplicate_reviews])

# Distinct review texts per movie
unique_review_counts = dataset_reviews.groupby('Movie Name')['Review'].nunique()
print("\nUnique review counts per movie:")
print(unique_review_counts)

# Overall number of non-missing reviews
total_reviews = dataset_reviews['Review'].count()
print(f"\nTotal number of reviews: {total_reviews}")


Number of duplicate reviews: 0

Unique review counts per movie:
Movie Name
Notting Hill           721
The Matrix            4388
The Notebook          1074
The Social Network     838
Name: Review, dtype: int64

Total number of reviews: 7021
