In [353]:
import requests
from bs4 import BeautifulSoup
import urllib
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [354]:
def get_actors_by_movie_soup(cast_page_soup, num_of_actors_limit=None):
    """Extract the cast from a parsed IMDb full-credits page.

    Args:
        cast_page_soup: Parsed page exposing ``find_all`` (e.g. a
            BeautifulSoup object). Cast rows are the ``td`` cells with class
            ``primary_photo``; the actor name is read from the ``alt``
            attribute of the cell's ``img`` and the profile link from the
            ``href`` of the cell's ``a``.
        num_of_actors_limit: Maximum number of actors to return, in page
            order. ``None`` (default) returns every actor found.

    Returns:
        A list of ``(actor_name, absolute_actor_url)`` tuples. Names are
        de-duplicated: a repeated name keeps its first position and the last
        URL seen.
    """
    base_url = 'https://www.imdb.com'
    actors = {}

    for cell in cast_page_soup.find_all('td', class_='primary_photo'):
        image = cell.find('img')
        link = cell.find('a')
        # A malformed row must not abort the whole extraction.
        if image is None or link is None:
            continue
        name = image.get('alt')
        href = link.get('href')
        if name is None or href is None:
            continue
        actors[name] = urllib.parse.urljoin(base_url, href)

    cast = list(actors.items())
    if num_of_actors_limit is None:
        return cast
    # Slicing already handles a limit larger than the number of actors.
    return cast[:num_of_actors_limit]


In [355]:
def get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=None):
    """Extract movies from the acting section of a parsed IMDb person page.

    The acting section is located through the element with class
    ``filmo-section-actor`` (falling back to ``filmo-section-actress``); the
    credit rows are searched inside that element's parent.

    A row is skipped when:
      * any of its ``span`` texts is one of the excluded labels
        (TV series, shorts, videos, ...);
      * it has no link, or the link's ``href`` does not end with ``_act``;
      * the ``href`` contains ``unrel`` (unreleased titles);
      * the link text is empty.

    Args:
        actor_page_soup: Parsed page exposing ``find`` (e.g. a BeautifulSoup
            object).
        num_of_movies_limit: Maximum number of movies to return, in page
            order. ``None`` (default) returns every movie found.

    Returns:
        A list of ``(title, absolute_movie_url)`` tuples. Titles are
        de-duplicated: a repeated title keeps its first position and the last
        URL seen. An empty list is returned when the page has neither acting
        section.
    """
    base_url = 'https://www.imdb.com'
    excluded_labels = {
        'TV Series', 'Short', 'Video Game', 'Video short', 'Video', 'TV Movie',
        'TV Mini Series', 'TV Mini-Series', 'TV Series short', 'TV Special',
        'Music Video',
    }

    # Explicit None checks replace the former bare ``except`` fallback.
    section = actor_page_soup.find(class_='filmo-section-actor')
    if section is None:
        section = actor_page_soup.find(class_='filmo-section-actress')
    if section is None:
        return []
    container = section.parent

    movies = {}
    for row in container.find_all('div', class_='ipc-metadata-list-summary-item__tc'):
        labels = (span.text.strip() for span in row.find_all('span'))
        if any(label in excluded_labels for label in labels):
            continue

        link = row.find('a')
        if link is None:
            continue
        href = link.get('href')
        # Only acting credits are wanted; their links end with "_act".
        if href is None or not href.endswith('_act'):
            continue
        # Skip unreleased movies.
        if 'unrel' in href:
            continue

        title = link.text
        if title != '':
            movies[title] = urllib.parse.urljoin(base_url, href)

    found = list(movies.items())
    if num_of_movies_limit is None:
        return found
    # Slicing already handles a limit larger than the number of movies.
    return found[:num_of_movies_limit]



In [356]:
def imdb_helper_cast(url, num_of_movies_limit=None):
    """Download an IMDb full-credits page and return its cast.

    Args:
        url: Address of the movie's ``fullcredits`` page.
        num_of_movies_limit: Despite its name, this is the maximum number of
            *actors* to return; it is forwarded unchanged to
            ``get_actors_by_movie_soup``. ``None`` returns the whole cast.

    Returns:
        The list of ``(actor_name, actor_url)`` tuples produced by
        ``get_actors_by_movie_soup``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty cast list.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    return get_actors_by_movie_soup(soup, num_of_movies_limit)

In [357]:
def imdb_helper_actor(url_actor, num_of_movies_limit=None):
    """Open an IMDb person page in Chrome, expand the credits and list movies.

    The page is loaded with Selenium, the "See all" button is clicked through
    JavaScript, and the resulting HTML is handed to
    ``get_movies_by_actor_soup``.

    Args:
        url_actor: Address of the person's IMDb page.
        num_of_movies_limit: Maximum number of movies to return; ``None``
            returns all of them.

    Returns:
        The list of ``(title, movie_url)`` tuples produced by
        ``get_movies_by_actor_soup``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the "See all" button
            does not become clickable within 4 seconds.
    """
    options = Options()
    options.add_argument("--disable-infobars")
    # Chrome only honours the user agent when passed as the --user-agent switch.
    options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url_actor)

        wait = WebDriverWait(driver, 4)
        element = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[span = 'See all']")))
        # Click through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", element)
        # Give the page time to render the expanded credit list.
        time.sleep(3)

        page_source = driver.page_source
    finally:
        # Always shut the browser down, otherwise every call leaks a Chrome process.
        driver.quit()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup_actor = BeautifulSoup(page_source, 'html.parser')
    return get_movies_by_actor_soup(soup_actor, num_of_movies_limit)

<h3>RESULTS FOR FUNCTIONS</h3> 

In [358]:
# One cast page queried three ways: no limit, a limit of 150 and a limit of 5.
CAST_PAGE_URL = 'https://www.imdb.com/title/tt3480822/fullcredits'
requests_cast = [(CAST_PAGE_URL, *limit) for limit in ((), (150,), (5,))]

# Each tuple is unpacked into the helper's positional arguments.
for request in requests_cast:
    result = imdb_helper_cast(*request)
    print('the first 10 results:', result[:10])
    print('amount of elements:', len(result))

the first 10 results: [('Scarlett Johansson', 'https://www.imdb.com/name/nm0424060/'), ('Florence Pugh', 'https://www.imdb.com/name/nm6073955/'), ('Rachel Weisz', 'https://www.imdb.com/name/nm0001838/'), ('David Harbour', 'https://www.imdb.com/name/nm1092086/'), ('Ray Winstone', 'https://www.imdb.com/name/nm0935653/'), ('Ever Anderson', 'https://www.imdb.com/name/nm8349839/'), ('Violet McGraw', 'https://www.imdb.com/name/nm8627157/'), ('O-T Fagbenle', 'https://www.imdb.com/name/nm1282966/'), ('William Hurt', 'https://www.imdb.com/name/nm0000458/'), ('Olga Kurylenko', 'https://www.imdb.com/name/nm1385871/')]
amount of elements: 107
the first 10 results: [('Scarlett Johansson', 'https://www.imdb.com/name/nm0424060/'), ('Florence Pugh', 'https://www.imdb.com/name/nm6073955/'), ('Rachel Weisz', 'https://www.imdb.com/name/nm0001838/'), ('David Harbour', 'https://www.imdb.com/name/nm1092086/'), ('Ray Winstone', 'https://www.imdb.com/name/nm0935653/'), ('Ever Anderson', 'https://www.imdb.com/

In [359]:
# Two person pages, each queried three ways: no limit, a limit of 100 and a
# limit of 5.
ACTOR_PAGE_URLS = (
    'https://www.imdb.com/name/nm0425005/',
    'https://www.imdb.com/name/nm0424060/',
)
requests_actor = [
    (actor_url, *limit)
    for actor_url in ACTOR_PAGE_URLS
    for limit in ((), (100,), (5,))
]

# Each tuple is unpacked into the helper's positional arguments.
for request in requests_actor:
    result = imdb_helper_actor(*request)
    print('the first 10 results:', result[:10])
    print('amount of elements:', len(result))
    

the first 10 results: [('Fast X', 'https://www.imdb.com/title/tt5433140/?ref_=nm_flmg_t_2_act'), ('Black Adam', 'https://www.imdb.com/title/tt6443346/?ref_=nm_flmg_t_5_act'), ('Super Pets', 'https://www.imdb.com/title/tt8912936/?ref_=nm_flmg_t_6_act'), ('Red Notice', 'https://www.imdb.com/title/tt7991608/?ref_=nm_flmg_t_8_act'), ('Free Guy', 'https://www.imdb.com/title/tt6264654/?ref_=nm_flmg_t_9_act'), ('Jungle Cruise', 'https://www.imdb.com/title/tt0870154/?ref_=nm_flmg_t_10_act'), ('Jumanji: The Next Level', 'https://www.imdb.com/title/tt7975244/?ref_=nm_flmg_t_14_act'), ('Fast & Furious: Hobbs & Shaw', 'https://www.imdb.com/title/tt6806448/?ref_=nm_flmg_t_17_act'), ('Fighting with My Family', 'https://www.imdb.com/title/tt6513120/?ref_=nm_flmg_t_18_act'), ('Skyscraper', 'https://www.imdb.com/title/tt5758778/?ref_=nm_flmg_t_22_act')]
amount of elements: 46
the first 10 results: [('Fast X', 'https://www.imdb.com/title/tt5433140/?ref_=nm_flmg_t_2_act'), ('Black Adam', 'https://www.imd