In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
def most_popular_movies():
    """Scrape IMDb's ``moviemeter`` chart page into a list of movie records.

    The page is downloaded once and every ``<tr>`` that contains a
    ``td.titleColumn`` cell is converted into one dictionary. Rows without
    such a cell (for example the table header) are ignored.

    Returns:
        list[dict]: One dictionary per chart row, in page order, with keys

        * ``"title"`` (str) - first non-blank line of the title cell,
        * ``"year"`` (int) - second non-blank line with the parentheses removed,
        * ``"rank"`` (int) - third non-blank line,
        * ``"IMDB Rating"`` (float) - ``np.nan`` when the row has no rating
          cell or the cell is empty,
        * ``"image"`` - the ``src`` attribute of the row's ``<img>``, or
          ``None`` when the row has no image.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
        IndexError, ValueError: If a title cell does not follow the
            "title / (year) / rank" line layout this parser expects.
    """
    url = "https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm"
    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and raise_for_status() turns an error page into an explicit failure
    # instead of an empty result.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.text, "html.parser")

    most_pop_movies = []
    for row in soup.find_all("tr"):
        # find() only looks inside this row. find_next() would keep searching
        # the rest of the document and could return a following row's cell.
        title_cell = row.find("td", attrs={"class": "titleColumn"})
        if title_cell is None:
            continue

        # Non-blank lines of the cell: title, "(year)", rank, then extras.
        parts = [line.strip() for line in title_cell.text.split("\n") if line.strip()]
        title = parts[0]
        year = int(parts[1].strip("()"))
        rank = int(parts[2])

        poster = row.find("img")
        image = poster.get("src") if poster is not None else None

        rating_cell = row.find("td", attrs={"class": "ratingColumn"})
        rating_text = rating_cell.text.strip() if rating_cell is not None else ""
        rating = float(rating_text) if rating_text else np.nan

        most_pop_movies.append({
            "title": title,
            "year": year,
            "rank": rank,
            "IMDB Rating": rating,
            "image": image,
        })
    return most_pop_movies

In [3]:
# Scrape the chart once, tabulate the records, and save them as a CSV
# without the DataFrame index column.
most_pop_movies = most_popular_movies()
df = pd.DataFrame(data=most_pop_movies)
df.to_csv(path_or_buf="Most_Popular_Movies.csv", index=None)

In [4]:
def _stripped_text(tag):
    """Return ``tag.text`` without surrounding whitespace, or ``None`` if ``tag`` is ``None``."""
    return tag.text.strip() if tag is not None else None


def _parse_lister_item(item):
    """Convert one ``div.lister-item`` element into a movie record dictionary."""
    # Every lookup uses find()/find_all(), which search only inside ``item``.
    # find_next() would continue into the following items, so a movie lacking
    # e.g. a certificate or metascore would silently get its neighbour's value.
    poster = item.find("img")
    header = item.find("h3", class_="lister-item-header")
    title_link = header.find("a") if header is not None else None

    year_text = _stripped_text(item.find("span", class_="lister-item-year"))
    rating_text = _stripped_text(item.find("div", class_="ratings-imdb-rating"))
    metascore_text = _stripped_text(item.find("span", class_="metascore"))

    # The second muted paragraph is taken as the description, as before.
    muted_paragraphs = item.find_all("p", class_="text-muted")
    description = _stripped_text(muted_paragraphs[1]) if len(muted_paragraphs) > 1 else None

    # Lines of the stats paragraph: index 1 is read as the vote count and
    # index 3 (when present) as the gross.
    stats_text = _stripped_text(item.find("p", class_="sort-num_votes-visible"))
    stats = stats_text.split("\n") if stats_text else []

    return {
        "title": title_link.text if title_link is not None else None,
        "image": poster.get("loadlate") if poster is not None else None,
        "runtime": _stripped_text(item.find("span", class_="runtime")),
        "genre": _stripped_text(item.find("span", class_="genre")),
        "certificate": _stripped_text(item.find("span", class_="certificate")),
        "rating": float(rating_text) if rating_text else np.nan,
        "metascore": float(metascore_text) if metascore_text else None,
        "description": description,
        "votes": stats[1] if len(stats) > 1 else np.nan,
        "gross": stats[3] if len(stats) > 3 else np.nan,
        # Appended last so the previously existing columns keep their order.
        # [1:-1] drops the first and last character (the outer parentheses).
        "year": year_text[1:-1] if year_text else None,
    }


def movies_by_genre(genre):
    """Scrape one page of IMDb title-search results for ``genre``.

    Args:
        genre (str): Value sent as the ``genres`` query parameter
            (e.g. ``"Drama"``).

    Returns:
        list[dict]: One dictionary per ``div.lister-item`` on the page, in
        page order, with keys ``"title"``, ``"image"`` (the poster's
        ``loadlate`` attribute), ``"runtime"``, ``"genre"``,
        ``"certificate"``, ``"rating"`` (float), ``"metascore"`` (float),
        ``"description"``, ``"votes"`` (str), ``"gross"`` (str) and
        ``"year"`` (str, text of the year span without its first and last
        character). Fields missing from an item are ``None``, except
        ``"rating"``, ``"votes"`` and ``"gross"``, which are ``np.nan``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # Passing the query through ``params`` lets requests escape the genre value.
    res = requests.get(
        "https://www.imdb.com/search/title/",
        params={
            "genres": genre,
            "explore": "title_type,genres",
            "ref_": "adv_explore_rhs",
        },
        timeout=30,
    )
    res.raise_for_status()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.text, "html.parser")
    items = soup.find_all("div", class_="lister-item")
    return [_parse_lister_item(item) for item in items]

In [5]:
# For each genre, scrape its search results and write them to
# "Movies_<genre>.csv" without the DataFrame index column.
genres = ["Musical", "Family", "Comedy", "Drama", "Romance"]
for i in genres:
    movies_by_genre_arr = movies_by_genre(i)
    df = pd.DataFrame(data=movies_by_genre_arr)
    df.to_csv(path_or_buf="Movies_{}.csv".format(i), index=None)