In [1]:
# Python Package imports
import concurrent.futures
import os
import threading

import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse
import movieposters as mp
import pandas as pd

In [2]:
# Upper bound on the number of worker threads the thread pool may spawn.
# download_stories() uses min(MAX_THREADS, number of URLs), so fewer threads
# are created when there are fewer pages than this.
MAX_THREADS = 50

In [3]:
# Module-level accumulators, one list per output column. main() appends to
# them and the final cell turns them into the dataframe columns.
title, year, genres, synopsis = [], [], [], []
poster, ids, url = [], [], []
duration, voters, rating, certificate = [], [], [], []

In [4]:
# Creating functions

#function to get the movie title
def getMovieTitle(header):
    """Return the movie title read from a listing header.

    Args:
        header: Sequence whose first element exposes ``find``; the text of
            the first ``<a>`` found inside that element is the title.

    Returns:
        The link text, or ``'NA'`` when it cannot be read (empty sequence,
        no link, unexpected input).
    """
    try:
        return header[0].find("a").getText()
    except Exception:
        # Best-effort: any ordinary failure means "no title". Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit stop the scrape.
        return 'NA'

#function to get the movie release year
def getReleaseYear(header):
    """Return the release-year text read from a listing header.

    Args:
        header: Sequence whose first element exposes ``find``; the ``.text``
            of its first ``<span class="lister-item-year">`` is returned
            unchanged (no parsing or stripping is done here).

    Returns:
        The span text, or ``'NA'`` when it cannot be read.
    """
    try:
        return header[0].find('span', class_='lister-item-year').text
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'

#function to get the movie genres
def getGenre(muted_text):
    """Return the raw genre text from a movie's muted-text paragraph.

    Args:
        muted_text: Element exposing ``find``; the text of its first
            ``<span class="genre">`` is returned as-is (whitespace included).

    Returns:
        The span text, or ``'NA'`` when it cannot be read (including when
        ``muted_text`` is ``None``).
    """
    try:
        return muted_text.find("span",  {"class":  "genre"}).getText()
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'

#function to get the movie synopsis/plot
def getsynopsys(movie):
    """Return the raw synopsis/plot text of a movie entry.

    Args:
        movie: Element exposing ``find_all``; the synopsis is taken from the
            second ``<p class="text-muted">`` (index 1) of the entry.

    Returns:
        The paragraph text as-is, or ``'NA'`` when fewer than two such
        paragraphs exist or the input cannot be read.
    """
    try:
        return movie.find_all("p", {"class":  "text-muted"})[1].getText()
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'

#function to get the movie poster link
def getPoster(image):
    """Return the poster link stored in an image element's ``loadlate`` attribute.

    Args:
        image: Element exposing ``get`` (attribute lookup), or ``None``.

    Returns:
        The attribute value, or ``'NA'`` when the element is missing, cannot
        be read, or has no (or an empty) ``loadlate`` attribute. A missing
        attribute used to leak ``None`` to the caller instead of the
        placeholder.
    """
    try:
        link = image.get('loadlate')
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'
    # ``get`` yields None when the attribute is absent; normalise that to the
    # same placeholder every other getter uses.
    return link if link else 'NA'

#function to get the movie duration
def getDuration(header):
    """Return the runtime text of a movie entry.

    Args:
        header: The movie element. Its first ``<p>`` (``header.p``) is probed
            for a ``<span class="runtime">``; when one is present, the text of
            the first such span found from ``header`` itself is returned.

    Returns:
        The runtime text, ``'-'`` when the first paragraph has no runtime
        span, or ``'NA'`` when the element cannot be read at all (for example
        ``header`` is ``None`` or has no ``<p>``).
    """
    try:
        if header.p.find('span', class_='runtime'):
            return header.find('span', class_='runtime').text
        return '-'
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'

#function to get the voters count
def getVoters(header):
    """Return the voters-count text of a movie entry.

    Args:
        header: Element exposing ``find_all``; the ``.text`` of the first
            ``<span name="nv">`` is returned unchanged.

    Returns:
        The span text, or ``'NA'`` when no such span exists or the input
        cannot be read.
    """
    try:
        nv = header.find_all('span', attrs={'name':'nv'})
        return nv[0].text
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'

#function to get the movie Rating
def getRating(header):
    """Return the rating of a movie entry as a float.

    Args:
        header: Element whose first ``<strong>`` (``header.strong``) holds
            the rating as text.

    Returns:
        ``float`` of that text, or the string ``'NA'`` when the element is
        missing or its text is not a number. Callers therefore receive a
        mixed float/str column.
    """
    try:
        return float(header.strong.text)
    except Exception:
        # Best-effort: ordinary failures (missing tag, non-numeric text) map
        # to the placeholder, while KeyboardInterrupt/SystemExit propagate.
        return 'NA'
    
#function to get the movie certification
def getCertificate(muted_text):
    """Return the certification text from a movie's muted-text paragraph.

    Args:
        muted_text: Element exposing ``find``; the text of its first
            ``<span class="certificate">`` is returned as-is.

    Returns:
        The span text, or ``'NA'`` when it cannot be read (including when
        ``muted_text`` is ``None``).
    """
    try:
        return muted_text.find("span",  {"class":  "certificate"}).getText()
    except Exception:
        # Best-effort: ordinary failures map to the placeholder, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 'NA'

In [5]:
#Main function, the main imdb url is passed as an argument
# Directive appended to every poster link to request the custom poster
# dimensions used by this project.
_POSTER_SIZE_SUFFIX = "._V1_QL75_UX280_CR0,3,280,414_.jpg"

# Seconds to wait for IMDb before giving up on a page, so one stalled
# connection cannot hang a worker thread forever.
_REQUEST_TIMEOUT_SECONDS = 30

# main() runs concurrently in several threads. This lock makes the write of
# one page's rows into the shared result lists a single step, so columns
# cannot drift out of alignment with each other.
_RESULTS_LOCK = threading.Lock()


def _buildPosterUrl(raw_url):
    """Rewrite a scraped thumbnail link so it requests the custom poster size.

    The link is cut at the first ``@@``, ``@`` or ``._`` marker (the ``@``
    markers are kept) and the size directive is appended. A link with no
    marker has its last four characters replaced by the directive. A missing
    link (``None``, empty or ``'NA'``) yields ``'NA'``.
    """
    if not raw_url or raw_url == 'NA':
        return 'NA'
    for index, char in enumerate(raw_url):
        # Slicing (instead of raw_url[index + 1]) is safe on the last character.
        following = raw_url[index + 1:index + 2]
        if char == '@':
            marker = '@@' if following == '@' else '@'
            return raw_url[:index] + marker + _POSTER_SIZE_SUFFIX
        if char == '.' and following == '_':
            return raw_url[:index] + _POSTER_SIZE_SUFFIX
    return raw_url[:-4] + _POSTER_SIZE_SUFFIX


def main(imdb_url):
    """Scrape one IMDb search-result page into the module-level result lists.

    Every ``div.lister-item.mode-advanced`` on the page becomes one row. The
    page's rows are collected locally and then appended to ``title``,
    ``year``, ``genres``, ``synopsis``, ``poster``, ``ids``, ``url``,
    ``duration``, ``voters``, ``rating`` and ``certificate`` under a lock, so
    index ``i`` of every list always describes the same movie even when
    several threads run this function at once.

    Args:
        imdb_url: URL of the search-result page to download.

    Raises:
        requests.RequestException: if the page cannot be downloaded, times
            out, or answers with an HTTP error status.
    """
    response = requests.get(imdb_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # An error page contains no movies; fail loudly instead of silently
    # contributing zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})

    rows = []
    for movie in movies_list:
        header = movie.find_all("h3", {"class":  "lister-item-header"})

        # Any of these nodes may be absent; the getters turn None into 'NA'.
        muted_blocks = movie.find_all("p", {"class":  "text-muted"})
        muted_text = muted_blocks[0] if muted_blocks else None
        imageDiv = movie.find("div", {"class": "lister-item-image float-left"})
        image = imageDiv.find("img", "loadlate") if imageDiv is not None else None

        # IMDb id and the title page URL derived from it
        movie_id = image.get('data-tconst') if image is not None else None
        if movie_id:
            movie_url = "https://www.imdb.com/title/" + movie_id + "/"
        else:
            movie_id = movie_url = 'NA'

        # Field order must match `columns` below.
        rows.append((
            getMovieTitle(header),                 # Movie title
            getReleaseYear(header),                # Movie release year
            getGenre(muted_text),                  # Genre of movie
            getsynopsys(movie),                    # Movie synopsis
            _buildPosterUrl(getPoster(image)),     # Poster link, custom size
            movie_id,                              # IMDb id
            movie_url,                             # IMDb url
            getDuration(movie),                    # Movie duration
            getVoters(movie),                      # Movie voters count
            getRating(movie),                      # Movie rating
            getCertificate(muted_text),            # Movie certificate
        ))

    columns = (title, year, genres, synopsis, poster, ids, url,
               duration, voters, rating, certificate)
    with _RESULTS_LOCK:
        # zip(*rows) transposes the rows into one tuple of values per column.
        for column, values in zip(columns, zip(*rows)):
            column.extend(values)

In [6]:
# Maximum number of result pages to iterate over
MAX_PAGE = 40

# Every search URL that will be queried, one per result page (the query asks
# for 250 titles per page). The first page gets offset 0; page i (i >= 1) gets
# offset 250*i + 1. The offset is written straight after the literal "0" that
# follows "start=" in the query string.
imageArr = [
    f'https://www.imdb.com/search/title/?release_date=1990-01-01,2023-12-31&user_rating=5.0,10.0&languages=en&adult=include&count=250&start=0{250 * page + 1 if page else 0}&ref_=adv_nxt'
    for page in range(MAX_PAGE)
]

In [7]:
# Function that manages everything through threads
def download_stories(story_urls):
    """Scrape every URL in ``story_urls`` concurrently by calling ``main`` on each.

    At most ``MAX_THREADS`` worker threads are used, fewer when there are
    fewer URLs. The function returns once every page has been processed.

    A page that fails does not abort the others: the failure is printed
    together with its URL, rather than being silently discarded as happens
    when the results of ``executor.map`` are never consumed.

    Args:
        story_urls: Sized collection of page URLs. An empty collection is a
            no-op (a pool with zero workers would raise ``ValueError``).
    """
    if len(story_urls) == 0:
        return

    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        pending = {executor.submit(main, page_url): page_url for page_url in story_urls}
        for finished in concurrent.futures.as_completed(pending):
            error = finished.exception()
            if error is not None:
                print(f'Failed to scrape {pending[finished]}: {error!r}')

In [8]:
# Call the download function with the array of URLs called imageArr.
# NOTE: the result lists are module-level accumulators, so re-running this
# cell without re-running the cell that declares them appends duplicate rows.
download_stories(imageArr)

# Attach all the data to the pandas dataframe. You can optionally write it to a CSV file as well
movieDf = pd.DataFrame({
    "Title": title,
    "Release_Year": year,
    "Genre": genres,
    "Synopsis": synopsis,
    "Poster_URL": poster,
    "Movie_ID": ids,
    "Movie_URL": url,
    "Duration": duration,
    "Certification": certificate,
    "Voters": voters,
    "Rating": rating
})

# To organize our dataframe.
# Strip surrounding whitespace and the "tt" id prefix by content rather than by
# fixed position: slicing off the first character(s) also mangled the 'NA'
# placeholder ('NA' -> 'A' for Genre/Synopsis, 'NA' -> '' for Movie_ID).
movieDf['Genre'] = movieDf['Genre'].str.strip().str.replace(" ", "", regex=False)
movieDf['Movie_ID'] = movieDf['Movie_ID'].str.replace(r'^tt', '', regex=True)
movieDf['Synopsis'] = movieDf['Synopsis'].str.strip()

#Print the dataframe
print('--------- Complete CSV Formed --------')
display(movieDf)

#Save the dataframe (create the output folder first so to_csv cannot fail on a fresh checkout)
os.makedirs('Data', exist_ok=True)
movieDf.to_csv('Data/new.csv', index=False)

--------- Complete CSV Formed --------


Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating
0,John Wick: Chapter 2,(2017),"Action,Crime,Thriller",After returning to the criminal underworld to ...,https://m.media-amazon.com/images/M/MV5BMjE2ND...,4425200,https://www.imdb.com/title/tt4425200/,122 min,A,438276,7.4
1,FBI: Most Wanted,(2020– ),"Action,Crime,Drama",It follows the division of the FBI tasked with...,https://m.media-amazon.com/images/M/MV5BZWE5Zm...,9742936,https://www.imdb.com/title/tt9742936/,45 min,,8681,6.9
2,Heartland,(II) (2007– ),"Drama,Family","A multi-generational saga set in Alberta, Cana...",https://m.media-amazon.com/images/M/MV5BY2Q2Zj...,1094229,https://www.imdb.com/title/tt1094229/,45 min,16,17385,8.5
3,Don't Look Up,(2021),"Comedy,Drama,Sci-Fi",Two low-level astronomers must go on a giant m...,https://m.media-amazon.com/images/M/MV5BZjcwZj...,11286314,https://www.imdb.com/title/tt11286314/,138 min,18,541908,7.2
4,Bosch,(2014–2021),"Crime,Drama",An L.A.P.D. homicide detective works to solve ...,https://m.media-amazon.com/images/M/MV5BZDBiYj...,3502248,https://www.imdb.com/title/tt3502248/,51 min,18,73251,8.5
...,...,...,...,...,...,...,...,...,...,...,...
9995,Man with a Plan,(2016–2020),Comedy,A dad finds out that parenting is harder than ...,https://m.media-amazon.com/images/M/MV5BNjYwMz...,5536400,https://www.imdb.com/title/tt5536400/,30 min,All,10645,7.0
9996,Kung Pow: Enter the Fist,(2002),"Action,Comedy",A rough-around-the-edges martial arts master s...,https://m.media-amazon.com/images/M/MV5BMGQxZD...,0240468,https://www.imdb.com/title/tt0240468/,81 min,PG-13,46970,6.2
9997,Big Boys,(2022– ),"Comedy,Drama","Shy, closeted Jack has spent the past year at ...",https://m.media-amazon.com/images/M/MV5BOWVmZj...,13683866,https://www.imdb.com/title/tt13683866/,24 min,,2315,8.5
9998,The Invention of Lying,(2009),"Comedy,Fantasy,Romance",A comedy set in a world where no one has ever ...,https://m.media-amazon.com/images/M/MV5BMTU2OT...,1058017,https://www.imdb.com/title/tt1058017/,100 min,PG-13,142974,6.3
