In [10]:
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver 
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
from tqdm import tqdm
import warnings
import re
warnings.filterwarnings("ignore")

In [11]:
# IMDb user identifiers whose review pages are requested by the scraping loop
users_list = ['ur117926588', 'ur3608743']

# Column accumulators filled by the scraping loop and later assembled into a
# dataframe. Each one is its own list object (never aliased).
user_id, movie_title_list, year_list = [], [], []
review_title_list, review_list = [], []

# Exceptions caught while scraping are collected here for later inspection
error_msg = []

In [12]:
# Scrape every review of every user in `users_list` into the accumulator lists
# (`user_id`, `movie_title_list`, `year_list`, `review_title_list`, `review_list`).
# One row is appended to ALL five lists per successfully parsed review, so the
# lists always have equal length; failures are recorded in `error_msg`.

# Number of reviews assumed to be revealed by each click on the "load more" button.
REVIEWS_PER_PAGE = 25


def _first_text(selector, css_query, strip=True):
    """Return the first match of ``css_query`` in ``selector``.

    Parameters
    ----------
    selector : scrapy Selector to search in.
    css_query : CSS query (including ``::text``) whose first match is wanted.
    strip : when True, surrounding whitespace is removed from the match.

    Returns
    -------
    The matched string, or ``np.nan`` when the query matches nothing.
    """
    value = selector.css(css_query).extract_first()
    if value is None:
        # `np.nan` (lowercase): the `np.NaN` alias no longer exists in NumPy 2.0.
        return np.nan
    return value.strip() if strip else value


for user_number, user_code in enumerate(users_list, start=1):
    print("Scraping for User {}".format(user_number))
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        url = 'https://www.imdb.com/user/{}/reviews'.format(user_code)
        time.sleep(1)
        driver.get(url)
        time.sleep(1)

        # The header text starts with the total review count, e.g. "1,244 reviews".
        sel = Selector(text=driver.page_source)
        header_text = sel.css(".header span::text").extract_first()
        try:
            num_of_reviews = int(header_text.replace(',', '').split(' ')[0])
        except (AttributeError, ValueError) as e:
            # Header missing (None) or not starting with a number: skip this
            # user but keep going with the others. `finally` still closes the browser.
            error_msg.append(e)
            continue

        # Loading all the reviews in the single page before scraping
        more_review_pages = num_of_reviews // REVIEWS_PER_PAGE
        for _ in tqdm(range(more_review_pages)):
            try:
                driver.find_element(By.ID, 'load-more-trigger').click()
                time.sleep(2)  # give the next batch time to render
            except Exception:
                # Best effort: the button may be gone or temporarily not
                # clickable. `Exception` (not a bare except) keeps the cell
                # interruptible with KeyboardInterrupt.
                pass

        review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')
        for review_element in tqdm(review_elements):
            try:
                sel2 = Selector(text=review_element.get_attribute('innerHTML'))

                movie_title = _first_text(sel2, '.lister-item-header a::text')

                year = _first_text(sel2, '.lister-item-year.text-muted.unbold::text')
                if isinstance(year, str):
                    # Drop the parentheses, then any letters/whitespace around the year.
                    year = year.replace('(', '').replace(')', '')
                    year = re.sub(r'[a-zA-Z\s]+', '', year)

                review_title = _first_text(sel2, '.title::text')

                # NOTE(review): only the FIRST text node of the review body is
                # taken, as in the original code; text after an inline tag such
                # as <br> would not be captured - confirm this is intended.
                review_text = _first_text(sel2, '.text.show-more__control::text', strip=False)
            except Exception as e:
                error_msg.append(e)
                continue

            # Append to every list together so the columns stay aligned.
            user_id.append(user_code)
            movie_title_list.append(movie_title)
            year_list.append(year)
            review_title_list.append(review_title)
            review_list.append(review_text)
    finally:
        # Always close the browser, otherwise one Chrome process leaks per user.
        driver.quit()

Scraping for User 1


100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [05:09<00:00,  6.31s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 1244/1244 [00:24<00:00, 50.07it/s]


Scraping for User 2


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [02:31<00:00,  4.34s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 888/888 [00:25<00:00, 35.16it/s]


In [23]:
# Assemble the scraped columns into one dataframe (one row per review).
# All five lists must have the same length for the construction to succeed.
review_columns = {
    "UserID": user_id,
    "Title": movie_title_list,
    "Year": year_list,
    "Review_Title": review_title_list,
    "Review": review_list,
}
reviews_df = pd.DataFrame(review_columns)

In [24]:
# Write the reviews to disk without the index column. "utf-8-sig" prepends a
# BOM to the file.
reviews_df.to_csv("users_reviews.csv", index=False, encoding='utf-8-sig')

In [28]:
# Join each rating with its review. A row is kept only when the same
# (UserID, Title, Year) triple appears in both files (default inner join).
merge_keys = ['UserID', 'Title', 'Year']
users_ratings = pd.read_csv("users_ratings.csv")
users_reviews = pd.read_csv("users_reviews.csv")
df = pd.merge(users_ratings, users_reviews, left_on=merge_keys, right_on=merge_keys)
df

Unnamed: 0,UserID,Title,Description,Year,Directors,Stars,Duration,Genre,Movie_Rating,User_Rating,Review_Title,Review
0,ur117926588,The Magician's Elephant,An orphaned boy is told by a fortune teller th...,2023,['Wendy Rogers'],"['Noah Jupe', 'Mandy Patinkin', 'Brian Tyree H...",1 hr 39 min,"Animation, Adventure, Comedy",6.6,8,"Filled with imagination, wonder, and charm, Th...","Set in the once magical city of Baltese, the c..."
1,ur117926588,Cocaine Bear,"An oddball group of cops, criminals, tourists ...",2023,['Elizabeth Banks'],"['Keri Russell', 'Alden Ehrenreich', ""O'Shea J...",1 hr 35 min,"Comedy, Thriller",6.2,7,A mixture of Pineapple Express by way of Grizz...,"Set in 1985, following a failed drug run by An..."
2,ur117926588,The Road to El Dorado,Two swindlers get their hands on a map to the ...,2000,"['Bibo Bergeron', 'Don Paul', 'Jeffrey Katzenb...","['Kevin Kline', 'Kenneth Branagh', 'Rosie Pere...",1 hr 29 min,"Animation, Adventure, Comedy",6.9,7,Dreamworks' sophomore film may not be as grand...,"Set in Spain in 1519, two con artists consisti..."
3,ur117926588,Baby's Day Out,After three kidnappers lose the baby they have...,1994,['Patrick Read Johnson'],"['Lara Flynn Boyle', 'Joe Mantegna', 'Joe Pant...",1 hr 39 min,"Adventure, Comedy, Crime",6.2,6,Probably one of John Hughes' better post Home ...,"Bennington Austin Cotwell IV also known as ""Ba..."
4,ur117926588,Basic,A D.E.A. Agent investigates the disappearance ...,2003,['John McTiernan'],"['John Travolta', 'Samuel L. Jackson', 'Connie...",1 hr 38 min,"Action, Crime, Drama",6.4,7,A twisty and labyrinthian thriller that keeps ...,"At a U. S. military base in Panama, a group of..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2106,ur3608743,Elizabethtown,During a hometown memorial for his Kentucky-bo...,2005,['Cameron Crowe'],"['Orlando Bloom', 'Kirsten Dunst', 'Susan Sara...",2 hr 3 min,"Comedy, Drama, Romance",6.3,8,Crowe's Kentucky Romance and Americana Road Tr...,Critics have been lambasting director-screenwr...
2107,ur3608743,Cinderella Man,"The story of James J. Braddock, a supposedly w...",2005,['Ron Howard'],"['Russell Crowe', 'Renée Zellweger', 'Craig Bi...",2 hr 24 min,"Biography, Drama, Romance",8.0,8,Old-Fashioned Story Strong on Historical Conte...,"As he proved with 1995's ""Apollo 13"", director..."
2108,ur3608743,The Umbrellas of Cherbourg,A young woman separated from her lover by war ...,1964,['Jacques Demy'],"['Catherine Deneuve', 'Nino Castelnuovo', 'Ann...",1 hr 31 min,"Drama, Musical, Romance",7.8,7,Bold Series of Uninterrupted Recitatives Const...,"In 1964, filmmaker Jacques Demy made an audaci..."
2109,ur3608743,March of the Penguins,"In the Antarctic, every March since the beginn...",2005,['Luc Jacquet'],"['Morgan Freeman', 'Romane Bohringer', 'Charle...",1 hr 20 min,"Documentary, Family",7.5,9,"Awe-Inspiring, Intimate Look at the Succession...","From ""Mary Poppins"" to ""Madagascar"", animated ..."
