In [5]:
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm
import warnings
import re
# Suppress every warning for the rest of the session. No category or module
# filter is given, so this also hides deprecation notices emitted by the
# libraries imported above.
warnings.filterwarnings("ignore")

In [6]:
# IDs of the IMDb users whose ratings pages are scraped
users_list = ['ur117926588', 'ur3608743']

# One independent, initially empty accumulator per scraped field, plus
# `error_msg`, which collects exceptions raised while parsing a review.
(
    user_id,
    title_list,
    description_list,
    year_list,
    director_list,
    star_list,
    duration_list,
    advisory_list,
    genre_list,
    vote_list,
    movie_rating_list,
    user_rating_list,
    error_msg,
) = ([] for _ in range(13))

In [7]:
# Scrape the ratings pages of every IMDb user in `users_list` and append one
# entry per scraped review to the result lists (user_id, title_list, ...,
# user_rating_list). A field that cannot be parsed is stored as NaN; an
# exception that aborts a whole review is recorded in `error_msg` and that
# review is skipped, so all result lists always stay the same length.
for user, imdb_user in enumerate(users_list):
    print("Scraping for User {}".format(user+1))
    # NOTE(review): newer Selenium 4 releases may reject the driver path as a
    # positional argument - confirm against the installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # try/finally guarantees the browser is shut down even when scraping
    # fails, so no Chrome process is left behind for each user.
    try:
        url = 'https://www.imdb.com/user/{}/ratings'.format(imdb_user)
        time.sleep(1)
        driver.get(url)
        time.sleep(1)

        sel = Selector(text = driver.page_source)
        num_of_ratings = sel.css(".lister-list-length span::text").extract_first().replace(',','').split(' ')[0]
        # Ceiling division by the page size of 100: a count that is an exact
        # multiple of 100 must not produce an extra iteration (which would
        # re-scrape the last page). Always visit at least the first page.
        rating_pages = max(1, (int(num_of_ratings) + 99) // 100)

        for page in range(rating_pages):
            sel = Selector(text = driver.page_source)
            reviews = driver.find_elements(By.CSS_SELECTOR, 'div.lister-item-content')

            for review in tqdm(reviews):
                try:
                    sel2 = Selector(text = review.get_attribute('innerHTML'))

                    # Both values are None when the element is absent; they
                    # also decide where the description paragraph sits.
                    advisory = sel2.css('.certificate::text').extract_first()
                    duration = sel2.css('.runtime::text').extract_first()

                    ## Get movie title
                    try:
                        anchor_texts = sel2.css('a::text').getall()
                        title = anchor_texts[0].strip()
                        # A missing second anchor must not discard the title
                        if len(anchor_texts) > 1:
                            episode = anchor_texts[1].strip()
                            if episode != "":
                                title += (" - " + episode)
                    except Exception:
                        title = np.nan
                    ## Get movie description
                    try:
                        # Index 3 when neither certificate nor runtime is
                        # present, shifted by 2 for each one that is.
                        present = (advisory is not None) + (duration is not None)
                        description = sel2.css('p::text').getall()[3 + 2 * present].strip()
                    except Exception:
                        description = np.nan
                    ## Get movie year
                    try:
                        year = sel2.css('.lister-item-year.text-muted.unbold::text').extract_first().strip().replace('(','').replace(')','')
                        year = re.sub(r'[a-zA-Z\s]+', '', year)
                    except Exception:
                        year = np.nan
                    ## Get directors and staff
                    try:
                        staff = sel2.css('.text-muted a::text').getall()
                        text = sel2.css('.text-muted.text-small::text').getall()
                        text2 = [t.strip() for t in text]
                        commas = text2.count(',')
                        stars_index = text2.index("Stars:")
                        # Commas after "Stars:" (last text node excluded)
                        # separate the star names from each other.
                        count = text2[stars_index:-1].count(',')
                        stars = staff[-(count+1):]
                        # if directors are recorded
                        if "Director:" in text2 or "Directors:" in text2:
                            directors = staff[:(commas-count)+1]
                        else:
                            directors = ""
                    except Exception:
                        stars = np.nan
                        directors = np.nan
                    ## Get movie duration
                    duration = duration.strip() if duration is not None else np.nan
                    ## Get viewer advisory
                    advisory = advisory.strip() if advisory is not None else np.nan
                    ## Get Genre
                    try:
                        genre = sel2.css('.genre::text').extract_first().strip()
                    except Exception:
                        genre = np.nan
                    ## Get votes
                    try:
                        votes = sel2.css('.text-muted.text-small span::text').getall()[-1]
                        votes = int(votes.replace(',','').split(' ')[0])
                    except Exception:
                        votes = np.nan
                    ## Get movie rating
                    try:
                        movie_rating = sel2.css('.ipl-rating-star__rating::text').getall()[0]
                        movie_rating = float(movie_rating.replace(',','').split(' ')[0])
                    except Exception:
                        movie_rating = np.nan
                    ## Get user rating
                    try:
                        user_rating = sel2.css('.ipl-rating-star__rating::text').getall()[1]
                        user_rating = int(user_rating.replace(',','').split(' ')[0])
                    except Exception:
                        user_rating = np.nan

                    # Record the user id together with the row it belongs
                    # to, so the column lengths match even when reviews are
                    # skipped or the advertised count is off.
                    user_id.append(imdb_user)
                    title_list.append(title)
                    description_list.append(description)
                    year_list.append(year)
                    director_list.append(directors)
                    star_list.append(stars)
                    duration_list.append(duration)
                    advisory_list.append(advisory)
                    genre_list.append(genre)
                    vote_list.append(votes)
                    movie_rating_list.append(movie_rating)
                    user_rating_list.append(user_rating)

                except Exception as e:
                    error_msg.append(e)

            next_page_url = sel.css("a.flat-button.lister-page-next.next-page::attr(href)").extract_first()
            if next_page_url is None:
                # No "next" link: this was the last page for this user
                print("No more pages to browse")
                break
            try:
                driver.get("https://www.imdb.com" + next_page_url)
            except Exception as e:
                # Stop paging instead of re-scraping the page already loaded
                error_msg.append(e)
                print("No more pages to browse")
                break
    finally:
        driver.quit()

Scraping for User 1


100%|██████████| 100/100 [00:03<00:00, 27.33it/s]
100%|██████████| 100/100 [00:01<00:00, 59.65it/s]
100%|██████████| 100/100 [00:01<00:00, 52.71it/s]
100%|██████████| 100/100 [00:01<00:00, 58.64it/s]
100%|██████████| 100/100 [00:01<00:00, 57.52it/s]
100%|██████████| 100/100 [00:01<00:00, 54.89it/s]
100%|██████████| 100/100 [00:01<00:00, 58.35it/s]
100%|██████████| 100/100 [00:01<00:00, 60.57it/s]
100%|██████████| 100/100 [00:01<00:00, 56.70it/s]
100%|██████████| 100/100 [00:01<00:00, 55.38it/s]
100%|██████████| 100/100 [00:01<00:00, 58.72it/s]
100%|██████████| 100/100 [00:01<00:00, 58.95it/s]
100%|██████████| 100/100 [00:01<00:00, 55.31it/s]
100%|██████████| 100/100 [00:01<00:00, 58.55it/s]
100%|██████████| 100/100 [00:01<00:00, 57.83it/s]
100%|██████████| 100/100 [00:01<00:00, 58.12it/s]
100%|██████████| 100/100 [00:01<00:00, 54.21it/s]
100%|██████████| 100/100 [00:01<00:00, 58.84it/s]
100%|██████████| 100/100 [00:01<00:00, 57.05it/s]
100%|██████████| 100/100 [00:01<00:00, 59.44it/s]


No more pages to browse
Scraping for User 2


100%|██████████| 100/100 [00:03<00:00, 25.73it/s]
100%|██████████| 100/100 [00:01<00:00, 52.54it/s]
100%|██████████| 100/100 [00:01<00:00, 51.50it/s]
100%|██████████| 100/100 [00:01<00:00, 58.12it/s]
100%|██████████| 100/100 [00:01<00:00, 57.23it/s]
100%|██████████| 100/100 [00:01<00:00, 50.71it/s]
100%|██████████| 100/100 [00:01<00:00, 56.46it/s]
100%|██████████| 100/100 [00:01<00:00, 54.51it/s]
100%|██████████| 100/100 [00:01<00:00, 58.39it/s]
100%|██████████| 1/1 [00:00<00:00, 55.36it/s]

No more pages to browse





In [8]:
# Combine the per-field result lists into one table; each (column name, list)
# pair below becomes one column, in the order given.
rating_df = pd.DataFrame(dict([
    ("UserID", user_id),
    ("Title", title_list),
    ("Year", year_list),
    ("Description", description_list),
    ("Directors", director_list),
    ("Stars", star_list),
    ("Viewer_Advisory", advisory_list),
    ("Duration", duration_list),
    ("Genre", genre_list),
    ("Votes", vote_list),
    ("Movie_Rating", movie_rating_list),
    ("User_Rating", user_rating_list),
]))

In [9]:
# Write the ratings table to CSV without the index column; "utf-8-sig"
# prepends a byte-order mark to the file.
rating_df.to_csv("users_ratings.csv", index=False, encoding='utf-8-sig')