In [4]:
import bs4
import pandas as pd
import numpy as np
import requests

In [7]:
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'
def get_page_contents(url):
    page = requests.get(url, headers={"Accept-Language": "en-US"})
    return bs4.BeautifulSoup(page.text, "html.parser")
soup = get_page_contents(url)

In [11]:
def numeric_value(movie, tag, class_=None, order=None):
    if order:
        if len(movie.findAll(tag, class_)) > 1:
            to_extract = movie.findAll(tag, class_)[order]['data-value']
        else:
            to_extract = None
    else:
        to_extract = movie.find(tag, class_)['data-value']

    return to_extract


def text_value(movie, tag, class_=None):
    if movie.find(tag, class_):
        return movie.find(tag, class_).text
    else:
        return


def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):
    if not order:
        return movie.find(tag_1, class_1).find(tag_2, class_2).text
    else:
        return [val.text for val in movie.find(tag_1, class_1).findAll(tag_2, class_2)[order]]


def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
    movies = soup.findAll('div', class_='lister-item-content')
    data_list = []
    for movie in movies:
        if text_attribute:
            if nested:
                data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))
            else:
                data_list.append(text_value(movie, tag_1, class_1))
        else:
            data_list.append(numeric_value(movie, tag_1, class_1, order))

    return data_list


titles = extract_attribute(soup, 'a')
release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')
audience_rating = extract_attribute(soup, 'span', 'certificate')
runtime = extract_attribute(soup, 'span', 'runtime')
genre = extract_attribute(soup, 'span', 'genre')
imdb_rating = extract_attribute(soup, 'div', 'inline-block ratings-imdb-rating', False)
votes = extract_attribute(soup, 'span' , {'name' : 'nv'}, False, 0)
earnings = extract_attribute(soup, 'span' , {'name' : 'nv'}, False, 1)
directors = extract_attribute(soup, 'p', '', 'a', '', True, 0, True)
actors = extract_attribute(soup, 'p', '', 'a', '', True, slice(1, 5, None), True)


df_dict = {'Title': titles, 'Relase': release, 'Audience Rating': audience_rating,
               'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,
               'Votes': votes, 'Box Office Earnings': earnings, 'Director': directors,
               'Actors': actors}
df = pd.DataFrame(df_dict)
df

Unnamed: 0,Title,Relase,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Director,Actors
0,The Shawshank Redemption,(1994),R,142 min,\nDrama,\n\n9.3\n,2303612,2303612,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will..."
1,The Godfather,(1972),R,175 min,"\nCrime, Drama",\n\n9.2\n,1589883,1589883,Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K..."
2,The Dark Knight,(2008),PG-13,152 min,"\nAction, Crime, Drama",\n\n9.0\n,2267182,2267182,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ..."
3,The Godfather: Part II,(1974),R,202 min,"\nCrime, Drama",\n\n9.0\n,1110813,1110813,Francis Ford Coppola,"[Al Pacino, Robert De Niro, Robert Duvall, Dia..."
4,The Lord of the Rings: The Return of the King,(2003),PG-13,201 min,"\nAction, Adventure, Drama",\n\n8.9\n,1618608,1618608,Peter Jackson,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O..."
...,...,...,...,...,...,...,...,...,...,...
95,Eternal Sunshine of the Spotless Mind,(2004),R,108 min,"\nDrama, Romance, Sci-Fi",\n\n8.3\n,898911,898911,Michel Gondry,"[Jim Carrey, Kate Winslet, Tom Wilkinson, Gerr..."
96,Amélie,(2001),R,122 min,"\nComedy, Romance",\n\n8.3\n,695590,695590,Jean-Pierre Jeunet,"[Audrey Tautou, Mathieu Kassovitz, Rufus, Lore..."
97,Snatch,(2000),R,102 min,"\nComedy, Crime",\n\n8.3\n,772486,772486,Guy Ritchie,"[Jason Statham, Brad Pitt, Benicio Del Toro, D..."
98,Requiem for a Dream,(2000),R,102 min,\nDrama,\n\n8.3\n,755830,755830,Darren Aronofsky,"[Ellen Burstyn, Jared Leto, Jennifer Connelly,..."


In [None]:
df

In [None]:
df['IMDB Rating'] = df['IMDB Rating'].str.replace('\n\n', '')

In [None]:
df['IMDB Rating'] = df['IMDB Rating'].str.replace('\n', '')

In [None]:
df['Genre'] = df['Genre'].str.replace('\n', '')

In [None]:
df

In [None]:
df.sort_values(by=['IMDB Rating'], ascending = False)

In [None]:
top_10_movies = df['Title'].value_counts()[0:10]
top_10_movies = list(top_10_movies.index)

In [None]:
top_10_movies

In [None]:
df.sort_values(by = "Genre")

In [None]:
cc = df[["Title", "Box Office Earnings"]].values.ravel()[0:11]
titles_with_high_earnings =  pd.unique(cc)
titles_with_high_earnings

In [None]:
cc2 = df[["Genre", "Box Office Earnings"]].values.ravel()[0:11]
titles_with_high_earnings =  pd.unique(cc2)
titles_with_high_earnings