In [1]:
# For measuring time
import time

# For applying a random sleep interval between requests  
from random import randint 
from time import sleep

# For interacting with webpages
import requests

# Working with files
import pandas as pd
from pandas import ExcelWriter

# Core of scraping
from bs4 import BeautifulSoup

# Need these in order to simulate human activity in Chrome browser (clicking)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

# Interacting with the operating system
import os

# Regex
import re

# Switch into the project's IMDB data folder so that relative file names used
# for saving/loading resolve there, then remember the resulting directory.
data_folder = r"C:\Users\aleen\Desktop\Data Science\Text Analytics\Exam Project\Data\IMDB"
os.chdir(data_folder)
chdir = os.getcwd()

# Ask the user which review page to scrape and what to call the output file.
# input() already returns a string, so no extra conversion is applied.
url_add = input("Please enter URL, hit Space and then Enter:\n")
filename = input("Please name your output .csv file:\n")

# Reference point for the total runtime reported at the end of the script.
start_time = time.time()

# Download the first page of reviews; the total review count on it is used
# further down to calculate the maximum number of 'Load More' button clicks.
#
# A failed request is retried a limited number of times with a random pause in
# between, instead of looping forever without any delay. If every attempt
# fails, a RuntimeError (chained to the last request error) stops the script.
max_attempts = 5
last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        # NOTE(review): verify=False switches off TLS certificate verification.
        # It is kept to preserve the existing behaviour; confirm it is needed.
        source_code = requests.get(url_add, timeout=30, verify=False)
    except requests.ConnectionError as e:
        print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
        print(str(e))
        last_error = e
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        print(str(e))
        last_error = e
    except requests.RequestException as e:
        print("OOPS!! General Error")
        print(str(e))
        last_error = e
    except KeyboardInterrupt:
        # Re-raise so the script stops here instead of carrying on without a
        # response (source_code would otherwise be undefined below).
        print("Someone closed the program")
        raise
    else:
        # The request succeeded; no further attempts are needed.
        break

    # Only reached after a handled error: pause before the next attempt.
    if attempt < max_attempts:
        sleep(randint(3, 12))
else:
    raise RuntimeError(
        "Could not download " + url_add + " after " + str(max_attempts) + " attempts"
    ) from last_error
    
# Parse the downloaded page and read the total number of reviews from the
# first <div class="header"> element: the first whitespace-separated token of
# its text is taken as the count, with any thousands separators removed.
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')

header = soup.find('div', {'class': 'header'})
header_words = header.text.split() if header is not None else []
if not header_words:
    # Fail with a clear message instead of a NameError on an undefined count.
    raise RuntimeError("Could not find the review count on the page: " + url_add)
s = int(header_words[0].replace(",", ""))

# The page already shows the first batch of 25 reviews and each click loads up
# to 25 more, so only the reviews beyond the first batch need a click. Using
# (s - 1) // 25 avoids one click too many when s is an exact multiple of 25
# (presumably the button is no longer available at that point).
reviews_per_page = 25
maxclicks = max(0, (s - 1) // reviews_per_page)
# maxclicks = 4  # presumably a manual override for short test runs
print('maxclicks='+str(maxclicks))

# Open the page in a Chrome browser driven by Selenium, press the 'Load More'
# button maxclicks times, then parse the fully expanded page.
driver = webdriver.Chrome()
try:
    # Wait up to 100 seconds for the button to become visible before a click.
    wait = WebDriverWait(driver, 100)
    driver.get(url_add)

    # Click until there are no more reviews to load
    clicks = 1
    while clicks <= maxclicks:
        load_more = wait.until(
            ec.visibility_of_element_located((By.CLASS_NAME, "ipl-load-more__button"))
        )
        load_more.click()
        # Sleep so we don't get IP banned. Experiment with different sleep patterns.
        sleep(randint(3, 12))
        print(str(clicks) + "click")
        clicks += 1

    # Grab the expanded page while the browser is still open.
    source_code = driver.page_source
finally:
    # Always shut the browser down, even if a click timed out, so that no
    # Chrome / chromedriver process is left running after the script.
    driver.quit()

plain_text = source_code
soup = BeautifulSoup(plain_text, 'html.parser')

# This list will store the results: one dict per review, in page order.
imdb = []

# Positions (0-based, in page order) of the reviews where a field was not found.
title_missing = []
review_missing = []
rating_missing = []
index = 0

for review in soup.find_all('div', attrs={'class': 'imdb-user-review'}):

    print('\n' + str(index))

    # Each element is looked up once. A missing element is recorded by its
    # index and stored as the string 'NaN'.

    # Review title, with line breaks removed.
    title_tag = review.find('a', attrs={'class': 'title'})
    if title_tag is None:
        title_missing.append(index)
        title_string = 'NaN'
    else:
        title_string = title_tag.get_text().replace('\n', '')

    # Review body text, kept as-is.
    text_tag = review.find('div', attrs={'class': 'text show-more__control'})
    if text_tag is None:
        review_missing.append(index)
        review_string = 'NaN'
    else:
        review_string = text_tag.text

    # Rating, with line breaks and the '/10' suffix removed.
    rating_tag = review.find('span', attrs={'class': 'rating-other-user-rating'})
    if rating_tag is None:
        rating_missing.append(index)
        rating_string = 'NaN'
    else:
        rating_string = rating_tag.get_text().replace('\n', '').replace('/10', '')

    data = {"review_title": title_string, "review_rating": rating_string, "review_text": review_string}
    imdb.append(data)

    print('\n')
    print(data)
    index += 1


# Signal that the parsing loop has finished.
print('\nOUT OF LOOP')

# Turn the list of review dicts into a DataFrame so it is easy to manipulate.
df = pd.DataFrame.from_dict(imdb)

# Write the results to the file name chosen by the user so they can be
# accessed later.
df.to_csv(filename, encoding='utf-8')

# Report how long the scrape took and how many reviews were collected.
elapsed_time = time.time() - start_time
summary = ''.join([
    '\nElapsed time: ',
    str(elapsed_time),
    ' seconds for scraping ',
    str(len(df)),
    ' reviews.\n',
])
print(summary)

# List the review positions for which a field could not be found.
missing_report = (
    ('\nTitle missing at index: ', title_missing),
    ('Rating missing at index: ', rating_missing),
    ('Review missing at index: ', review_missing),
)
for label, positions in missing_report:
    print(label, positions)

Please enter URL, hit Space and then Enter:
https://www.imdb.com/title/tt6027908/reviews?ref_=tt_urv 
Please name your output .csv file:
Episode2




maxclicks=21
1click
2click
3click
4click
5click
6click
7click
8click
9click
10click
11click
12click
13click
14click
15click
16click
17click
18click
19click
20click
21click

0


{'review_title': ' Goodbye, friends', 'review_rating': '10', 'review_text': "Just an incredible penultimate episode. Every character had a moment. So much rich & incredible interaction. This is goodbye to a lot of characters tonight. It's devastating. Next weekend will be like a funeral. Im terrified. And not ready. Be careful what you wish for when you want non stop action. Ep 3 will be just that and everything you've loved about this show will turn to ash in your mouth. You'll want to come back to this episode after it all just to see their faces."}

1


{'review_title': ' A great episode showing each character in what is potentially their last moments', 'review_rating': '10', 'review_text': "To everyone saying that this episode is a boring filler episode, you need to understand that it is a lot of characters'

{'review_title': ' Game of Set Ups', 'review_rating': '10', 'review_text': 'How did we get here?? All of the main characters (save Cercei) in Winterfell? Our faves giving each other advice / looks / knighthoods? We are going into the next episode with all the right things in place!!'}

316


{'review_title': ' Got', 'review_rating': '10', 'review_text': 'A good tie the sack episode, alot of slow moments but funny and witty.'}

317


{'review_title': ' The calm before the storm!', 'review_rating': '10', 'review_text': 'If you check the PG of this episode you\'d probably realize how calm this episode must\'ve been! These were probably the last happy moments for many major characters and writers\' portrayed it really well. The particular scene were Jaime honors Brienne with the Title of a knight absolutely stole the show (That\'s why they name the episode after it duh). Jon finally told Danny and it she took it in a way which we thought she would! Our little Arya is now a grown woman but 


Elapsed time: 170.2392349243164 seconds for scraping 538 reviews.


Title missing at index:  []
Rating missing at index:  [60, 105, 201, 225, 236, 238, 249, 320, 344, 395, 451, 491]
Review missing at index:  [17, 19, 20, 35, 48, 62, 71, 75, 78, 90, 127, 174, 189, 204, 211, 228, 239, 260, 305, 366, 391, 395, 436, 448, 536]


In [2]:
# Show the (rows, columns) dimensions of the DataFrame built from the scraped reviews.
df.shape

(3885, 3)