In [2]:
# Source URL: https://www.imdb.com/search/title/?groups=top_100

In this analysis, I am using BeautifulSoup and Requests to extract data without Selenium. I chose this approach because it is lighter and easier to use for my current needs.

However, Selenium is useful when a website loads data using JavaScript, as Requests cannot access data that is not in the original HTML. With Selenium, we can simulate user actions like scrolling and clicking, which helps in extracting data from websites that cannot be accessed directly with Requests.

Data can also be collected through official APIs, which often provide cleaner and more structured data than scraping HTML. However, I have no experience with APIs yet, so I do not use one here.

In [3]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import sklearn

import requests
from bs4 import BeautifulSoup

In [4]:
# Search-results page to scrape (IMDb "top_100" group).
url = 'https://www.imdb.com/search/title/?groups=top_100'

# Header to mimic a real browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

# `requests` comes from the imports cell at the top of the notebook.
# The timeout stops this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() turns a 4xx/5xx reply into an immediate
# error instead of letting later cells parse an error page.
imdb_response = requests.get(url, headers=headers, timeout=30)
imdb_response.raise_for_status()
print(imdb_response)


<Response [200]>


In [5]:
# Parse the downloaded page body into a searchable tree using Python's
# built-in HTML parser.
soup = BeautifulSoup(markup=imdb_response.text, features='html.parser')

# Let's scrape using:
- BeautifulSoup
- Requests
- Regex

In [58]:
# Collect the text of every <h3 class="ipc-title__text"> heading and strip
# the leading "<number>. " rank prefix where one is present.
rank_prefix = re.compile(r'^\d+\.\s*')
titles = [
    rank_prefix.sub('', heading.text).strip()
    for heading in soup.find_all('h3', class_="ipc-title__text")
]
len(titles)

26

In [29]:
# The 'dli-title-metadata-item' spans carry several kinds of metadata, so
# keep only those whose text is exactly four digits (the release year).
year_pattern = re.compile(r'^\d{4}$')
year = soup.find_all('span', class_='dli-title-metadata-item')
years = [span.text for span in year if year_pattern.match(span.text)]
len(years)

25

In [28]:
# Runtimes live in the same 'dli-title-metadata-item' spans as other
# metadata, so they are picked out by their text shape.
# The pattern accepts "2h 46m" as well as hours-only ("2h") and minutes-only
# ("45m") values. Requiring both parts would silently drop such titles and
# shift every later row when the lists are combined by position.
duration_pattern = re.compile(r'^(?:\d+h(?: \d+m)?|\d+m)$')
duration = soup.find_all('span', class_='dli-title-metadata-item')
durations = [span.text for span in duration if duration_pattern.match(span.text)]
len(durations)

25

In [27]:
# Pull the displayed score out of every star-rating span.
rating = soup.find_all('span', class_='ipc-rating-star--rating')
ratings = [star.get_text() for star in rating]
len(ratings)

25

In [34]:
# Match on the semantic 'metacritic-score-box' class only. Passing the whole
# multi-class string makes BeautifulSoup compare against the exact class
# attribute value, so the lookup would return nothing as soon as the other
# class tokens ("sc-b0901df4-0 bXIOoL") change.
# NOTE(review): a title without a Metascore contributes no span here, so this
# list can end up shorter than the other columns. Verify the counts before
# combining.
metascore = soup.find_all('span', class_='metacritic-score-box')
metascores = [box.text for box in metascore]
len(metascores)

25

In [40]:
# Gather each plot-summary container and keep its text with the surrounding
# whitespace trimmed.
synopsis = soup.find_all('div', class_="ipc-html-content-inner-div")
synopses = [block.get_text().strip() for block in synopsis]
len(synopses)


25

In [60]:
# Show full cell contents so long synopses are not truncated in the preview.
pd.set_option('display.max_colwidth', None)

# `titles` came back with one more entry than the other lists (26 vs 25), so
# its last element is dropped to line the columns up.
# NOTE(review): presumably that extra entry is a non-movie heading at the end
# of the page. Confirm by inspecting titles[-1].
scraped_columns = {
    'Title': titles[:-1],
    'Year': years,
    'Duration': durations,
    'Rating': ratings,
    'Metascore': metascores,
    'Synopsis': synopses
}

# The lists are paired purely by position, so stop with a readable
# per-column count if any of them is a different length.
column_lengths = {name: len(values) for name, values in scraped_columns.items()}
if len(set(column_lengths.values())) != 1:
    raise ValueError(f"Scraped columns have mismatched lengths: {column_lengths}")

df = pd.DataFrame(scraped_columns)

df.head()


Unnamed: 0,Title,Year,Duration,Rating,Metascore,Synopsis
0,Dune: Part Two,2024,2h 46m,8.5,79,"Paul Atreides unites with the Fremen while on a warpath of revenge against the conspirators who destroyed his family. Facing a choice between the love of his life and the fate of the universe, he endeavors to prevent a terrible future."
1,Interstellar,2014,2h 49m,8.7,74,"When Earth becomes uninhabitable in the future, a farmer and ex-NASA pilot, Joseph Cooper, is tasked to pilot a spacecraft, along with a team of researchers, to find a new planet for humans."
2,The Shawshank Redemption,1994,2h 22m,9.3,82,"A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion."
3,Gisaengchung,2019,2h 12m,8.5,97,Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
4,The Godfather,1972,2h 55m,9.2,100,The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.


In [61]:
# Write the scraped table to disk as CSV, leaving out the DataFrame index.
output_csv = 'movies_data.csv'
df.to_csv(output_csv, index=False)