In [1]:
# Python Package imports
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse
import concurrent.futures
import pandas as pd
import movieposters as mp

In [2]:
# Maximum number of threads that will be spawned
MAX_THREADS = 50

In [3]:
title = []
year = []
genres = []
synopsis =[]
poster  = []
ids = []
url = []
duration = []
voters = []
rating = []
certificate = []

In [4]:
def getMovieTitle(header):
    try:
        return header[0].find("a").getText()
    except:
        return 'NA'

def getReleaseYear(header):
    try:
        return header[0].find('span', class_='lister-item-year').text
    except:
        return 'NA'

def getGenre(muted_text):
    try:
        return muted_text.find("span",  {"class":  "genre"}).getText()
    except:
        return 'NA'

def getsynopsys(movie):
    try:
        return movie.find_all("p", {"class":  "text-muted"})[1].getText()
    except:
        return 'NA'

def getPoster(image):
    try:
        return image.get('loadlate')
        
    except:
        return 'NA'

def getDuration(header):
    try:
        return header.find('span', class_='runtime').text if header.p.find('span', class_='runtime') else '-'
    except:
        return 'NA'
    
def getVoters(header):
    try:
        nv = header.find_all('span', attrs={'name':'nv'})
        return nv[0].text
    except:
        return 'NA'
    
def getRating(header):
    try:
        return float(header.strong.text)
    except:
        return 'NA'
    
def getCertificate(muted_text):
    try:
        return muted_text.find("span",  {"class":  "certificate"}).getText()
    except:
        return 'NA'

In [5]:
def main(imdb_url):
    response = requests.get(imdb_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Movie Name
    movies_list  = soup.find_all("div", {"class": "lister-item mode-advanced"})
    
    for movie in movies_list:
        header = movie.find_all("h3", {"class":  "lister-item-header"})
        muted_text = movie.find_all("p", {"class":  "text-muted"})[0]
        imageDiv =  movie.find("div", {"class": "lister-item-image float-left"})
        image = imageDiv.find("img", "loadlate")
        
        #  Movie Title
        gt =  getMovieTitle(header)
        title.append(gt)
        
        #  Movie release year
        gy = getReleaseYear(header)
        year.append(gy)
        
        #  Genre  of movie
        gg = getGenre(muted_text)
        genres.append(gg)
        
        # Movie Synopsys
        gs = getsynopsys(movie)
        synopsis.append(gs)
        
        #  Image attributes
        gp = getPoster(image)
        
        poster.append(gp)
        
        gi = image.get('data-tconst')
        ids.append(gi)
        url.append("https://www.imdb.com/title/" + gi + "/")
        
        gd = getDuration(movie)
        duration.append(gd)
        
        gv = getVoters(movie)
        voters.append(gv)
        
        gr = getRating(movie)
        rating.append(gr)
        
        gc = getCertificate(muted_text)
        certificate.append(gc)

In [6]:
# An array to store all the URL that are being queried
imageArr = []

# Maximum number of pages one wants to iterate over
MAX_PAGE = 40
# Loop to generate all the URLS.
for i in range(0,MAX_PAGE):
    totalRecords = 0 if i==0 else (250*i)+1
    imdb_url = f'https://www.imdb.com/search/title/?release_date=1990-01-01,2023-12-31&user_rating=5.0,10.0&languages=en&adult=include&count=250&start=0{totalRecords}&ref_=adv_nxt'
    imageArr.append(imdb_url)

In [7]:
def download_stories(story_urls):
    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(main, story_urls)

In [8]:
# Call the download function with the array of URLS called imageArr
download_stories(imageArr)

# Attach all the data to the pandas dataframe. You can optionally write it to a CSV file as well
movieDf = pd.DataFrame({
    "Title": title,
    "Release_Year": year,
    "Genre": genres,
    "Synopsis": synopsis,
    "Poster_URL": poster,
    "Movie_ID": ids,
    "Movie_URL": url,
    "Duration": duration,
    "Certification": certificate,
    "Voters": voters,
    "Rating": rating
})

movieDf['Genre'] = [x[1:] for x in movieDf['Genre']]
movieDf['Movie_ID'] = [x[2:] for x in movieDf['Movie_ID']]
movieDf['Synopsis'] = [x[1:] for x in movieDf['Synopsis']]
movieDf['Genre'] = movieDf['Genre'].str.replace(" ", "")

print('--------- Complete CSV Formed --------')
display(movieDf)

movieDf.to_csv('new.csv', index=False)

--------- Complete CSV Formed --------


Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating
0,Stowaway,(I) (2021),"Adventure,Sci-Fi,Thriller",A three-person crew on a mission to Mars faces...,https://m.media-amazon.com/images/M/MV5BMWYwMW...,9203694,https://www.imdb.com/title/tt9203694/,116 min,13,47561,5.6
1,Anna Karenina,(I) (2012),"Drama,Romance","In late-19th-century Russian high society, St....",https://m.media-amazon.com/images/M/MV5BMTU0ND...,1781769,https://www.imdb.com/title/tt1781769/,129 min,18,101744,6.6
2,3rd Rock from the Sun,(1996–2001),"Comedy,Family,Sci-Fi","A group of aliens are sent to Earth, disguised...",https://m.media-amazon.com/images/M/MV5BMGMwZW...,0115082,https://www.imdb.com/title/tt0115082/,60 min,,53126,7.8
3,Toy Story 4,(2019),"Animation,Adventure,Comedy","When a new toy called ""Forky"" joins Woody and ...",https://m.media-amazon.com/images/M/MV5BMTYzMD...,1979376,https://www.imdb.com/title/tt1979376/,100 min,U,254387,7.7
4,Archive 81,(2022),"Drama,Horror,Mystery",An archivist hired to restore a collection of ...,https://m.media-amazon.com/images/M/MV5BZWRkYT...,13365348,https://www.imdb.com/title/tt13365348/,60 min,18,51350,7.3
...,...,...,...,...,...,...,...,...,...,...,...
9995,Another 48 Hrs.,(1990),"Action,Comedy,Crime",Jack Cates once again enlists the aid of ex-co...,https://m.media-amazon.com/images/M/MV5BNTAzOT...,0099044,https://www.imdb.com/title/tt0099044/,95 min,UA,42523,5.9
9996,The Comeback Trail,(2020),"Comedy,Crime",Two movie producers who owe money to the mob s...,https://m.media-amazon.com/images/M/MV5BMDg5OT...,5420210,https://www.imdb.com/title/tt5420210/,104 min,R,9654,5.7
9997,Wellington Paranormal,(2018–2022),"Comedy,Crime,Fantasy","Sergeant Maaka, and Officers Minogue and O'Lea...",https://m.media-amazon.com/images/M/MV5BNDQ1ZG...,6109562,https://www.imdb.com/title/tt6109562/,30 min,,6421,7.6
9998,Forensic Files,(1996–2011),"Documentary,Crime",A series featuring detailed accounts on how no...,https://m.media-amazon.com/images/M/MV5BODRmMz...,0247882,https://www.imdb.com/title/tt0247882/,30 min,,6615,8.8


In [9]:
movieDf["Movie_ID"].value_counts()

9203694     1
1229340     1
20601972    1
2375720     1
0787475     1
           ..
12121582    1
6146586     1
13784584    1
8855960     1
0338096     1
Name: Movie_ID, Length: 10000, dtype: int64

In [10]:
df = movieDf.loc[movieDf['Movie_ID'] == "18335752"]
df

Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating
3794,Puss in Boots: The Last Wish,(2022),"Comedy,Crime,Drama",The Duttons face a new set of challenges in th...,https://m.media-amazon.com/images/M/MV5BZWZhNG...,18335752,https://www.imdb.com/title/tt18335752/,373 min,R,47670,6.3
