# Data Scraping from IMDB

In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from textblob import TextBlob
from requests import get
from time import sleep
from random import randint
from warnings import warn
import matplotlib.pyplot as plt

In [3]:
pages = np.arange(1, 20, 50) 
headers = {'Accept-Language': 'en-US,en;q=0.8'} # If this is not specified, the default language is Mandarin

# Initialize empty lists to store the variables scraped

In [5]:
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
imdb_ratings_standardized = []
metascores = []
votes = []

In [6]:
for page in pages:
    
    #get request
    response = get("https://www.imdb.com/search/title?genres=sci-fi&" 
                   + "start=" 
                   + str(page) 
                   + "&explore=title_type,genres&ref_=adv_prv")
    
    sleep(randint(8,15))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))

    #parse the content of current iteration of request
    page_html = BeautifulSoup(response.text, 'html.parser')
        
    movie_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
    
    #extract the 50 movies for that page
    for container in movie_containers:

        #conditional for all with metascore
        if container.find('div', class_ = 'ratings-metascore') is not None:

            #title
            title = container.h3.a.text
            titles.append(title)

            #year released
            year = container.h3.find('span', class_= 'lister-item-year text-muted unbold').text
            years.append(year)

            #rating
            rating = container.p.find('span', class_= 'certificate')
            ratings.append(rating)

            #genre
            genre = container.p.find('span', class_ = 'genre').text
            genres.append(genre)

            #runtime
            time = container.p.find('span', class_ = 'runtime').text
            runtimes.append(time)

            #IMDB ratings
            imdb = float(container.strong.text)
            imdb_ratings.append(imdb)

            #Metascore
            m_score = container.find('span', class_ = 'metascore').text
            metascores.append(int(m_score))

            #Number of votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            votes.append(int(vote))

In [7]:
sci_fi_df = pd.DataFrame({'movie': titles,
                      'year': years,
                      'rating': ratings,
                      'genre': genres,
                      'runtime_min': runtimes,
                      'imdb': imdb_ratings,
                      'metascore': metascores,
                      'votes': votes}
                      )

sci_fi_df.loc[:, 'year'] = sci_fi_df['year'].str[-5:-1] # two more data transformations after scraping
# Drop 'ovie' bug
# Make year an int
sci_fi_df['n_imdb'] = sci_fi_df['imdb'] * 10
final_df = sci_fi_df.loc[sci_fi_df['year'] != 'ovie'] # One small issue with the scrape on these two movies so just dropping those ones.
final_df.loc[:, 'year'] = pd.to_numeric(final_df['year'])

In [8]:
sci_fi_df

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes,n_imdb
0,Avatar: O Caminho da Água,2022,[14],"\nAction, Adventure, Fantasy",192 min,7.8,67,275274,78.0
1,Tudo em Todo Lugar ao Mesmo Tempo,2022,[14],"\nAction, Adventure, Comedy",139 min,8.0,81,311194,80.0
2,M3gan,2022,[14],"\nHorror, Sci-Fi, Thriller",102 min,6.4,72,47788,64.0
3,Infinity Pool,2023,[R],"\nCrime, Horror, Mystery",117 min,6.9,72,2764,69.0
4,Jung_E,2023,[16],"\nAction, Adventure, Drama",98 min,5.4,53,6142,54.0
5,Avatar,2009,[12],"\nAction, Adventure, Fantasy",162 min,7.9,83,1315886,79.0
6,The Wandering Earth 2,2023,,"\nAction, Adventure, Drama",173 min,8.0,56,5233,80.0
7,Pantera Negra: Wakanda para Sempre,2022,[12],"\nAction, Adventure, Drama",161 min,7.1,67,152847,71.0
8,Vesper,2022,,"\nAdventure, Drama, Sci-Fi",114 min,5.9,70,14504,59.0
9,Adão Negro,2022,[14],"\nAction, Adventure, Fantasy",125 min,6.4,41,217010,64.0


# Reviews

In [10]:
page = requests.get("https://www.imdb.com/title/tt1630029/reviews/?ref_=tt_ql_urv")
soup = BeautifulSoup(page.content, "html.parser")
rev = soup.findAll('div', class_='text show-more__control')
movies = []
for movie in rev:
    movies.append(movie.text)
df_subset = pd.DataFrame()
df_subset['Avatar: O Caminho da Água']=movies

In [11]:
df_subset

Unnamed: 0,Avatar: O Caminho da Água
0,The strong suit of the first Avatar movie was ...
1,This sequel has a design and look that matches...
2,"It's a James Cameron film, so it's impressive...."
3,"So, the dumbest thing they could have possibly..."
4,James Cameron brings the biggest disappointmen...
5,It's hard to believe the sequel to Avatar has ...
6,"Technically gorgeous, but the story is similar..."
7,"Without question, this has the best CG works I..."
8,Massive advertising will make sure (just like ...
9,He did it again. And I don't even really under...


# Sentiments

In [13]:
df = pd.DataFrame()

In [14]:
df['Reviews of people']=movies

In [15]:
list=[]
for i in range(len(movies)):
    text=TextBlob(movies[i])
    x=text.sentiment.polarity # x between -1 and 1
    list.append(round(x,2))

In [16]:
sent=[]
j=[]
for j in list:
    if j>0:
        x="positive"
    elif j==0:
        x="neutre"
    else :
        x="negative"
    sent.append(x) 

In [17]:
df['sentiment']=sent

In [84]:
df  

Unnamed: 0,Reviews of people,sentiment
0,The strong suit of the first Avatar movie was ...,positive
1,This sequel has a design and look that matches...,positive
2,"It's a James Cameron film, so it's impressive....",positive
3,"So, the dumbest thing they could have possibly...",positive
4,James Cameron brings the biggest disappointmen...,positive
5,It's hard to believe the sequel to Avatar has ...,positive
6,"Technically gorgeous, but the story is similar...",positive
7,"Without question, this has the best CG works I...",positive
8,Massive advertising will make sure (just like ...,positive
9,He did it again. And I don't even really under...,positive
