In [None]:
# Movies General Information

import requests
from bs4 import BeautifulSoup
import pandas as pd

def _joined_text(elements, strip=False):
    """Return the text of every element in *elements*, joined with ' ; '."""
    return ' ; '.join(element.get_text(strip=strip) for element in elements)


def _text_by_testid(soup, data_testid, tag, css_class):
    """Collect text from *tag*/*css_class* nodes nested under a data-testid.

    Returns None when no element carries ``data-testid == data_testid``.
    Otherwise returns the stripped text of every matching descendant of
    every such element, joined with ' ; ' (an empty string when the
    elements exist but contain no matching descendants).
    """
    holders = soup.find_all(attrs={'data-testid': data_testid})
    if not holders:
        return None
    matches = [
        node
        for holder in holders
        for node in holder.find_all(tag, class_=css_class)
    ]
    return _joined_text(matches, strip=True)


def scrape_imdb_data(url):
    """Download one title page and extract general information from it.

    Args:
        url: Address of the page to fetch with ``requests.get``.

    Returns:
        A dict mapping each column name ('Genre', 'Director(s)',
        'Writer(s)', 'Stars', 'IMDB Rating', 'Metascore',
        'Link to official Trailer', 'Production Companies', 'Budget',
        'Opening weekend') to a one-element list. The element is the
        extracted string (multiple values joined with ' ; ') or None when
        the corresponding markup was not found.

    Raises:
        Whatever ``requests.get`` raises on a failed request; the HTTP
        status code itself is not checked.
    """
    # Local import so the block does not depend on the file's import header.
    from urllib.parse import urljoin

    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')

    container_class = "ipc-metadata-list-item__content-container"
    item_class = 'ipc-metadata-list-item__list-content-item'
    link_item_class = "ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link"

    # Genre: chips are gathered from *every* 'genres' element (previously
    # only the last element's chips were read because the inner loop had
    # been dedented out of the outer one).
    genre = _text_by_testid(soup, 'genres', 'span', "ipc-chip__text")

    # Director(s): links inside the first metadata content container.
    container = soup.find(class_=container_class)
    director_elements = (
        container.find_all(class_="ipc-metadata-list-item__list-content-item--link")
        if container else []
    )
    directors = _joined_text(director_elements) or None

    # Writer(s): links inside the *second* metadata content container, so at
    # least two containers must exist before indexing [1].
    writers = None
    target_divs = soup.find_all('div', class_=container_class)
    if len(target_divs) > 1:
        writer_elements = target_divs[1].find_all(class_=link_item_class)
        if writer_elements:
            writers = _joined_text(writer_elements)

    # Stars / IMDB Rating / Metascore.
    # NOTE(review): the "sc-..." selectors look like build-generated class
    # names and may stop matching when the site is redeployed - confirm.
    star_elements = soup.find_all(class_="sc-bfec09a1-1 gCQkeh")
    stars = _joined_text(star_elements, strip=True) if star_elements else None

    rating_element = soup.find(class_="sc-bde20123-1 cMEQkK")
    rating = rating_element.get_text() if rating_element else None

    metascore_element = soup.find(class_="sc-b0901df4-0 bcQdDJ metacritic-score-box")
    metascore = metascore_element.get_text() if metascore_element else None

    # Link to official Trailer: urljoin resolves the href against the site
    # root, avoiding the doubled slash ("https://www.imdb.com//video/...")
    # that plain string concatenation produced for hrefs starting with '/'.
    trailer = None
    trailer_element = soup.find(attrs={'data-testid': 'videos-slate-overlay-1'})
    if trailer_element:
        href_link = trailer_element.get('href')
        if href_link:
            trailer = urljoin('https://www.imdb.com/', href_link)

    # Production Companies / Budget / Opening weekend share one layout.
    companies = _text_by_testid(soup, 'title-details-companies', 'a', link_item_class)
    budget = _text_by_testid(soup, 'title-boxoffice-budget', 'span', item_class)
    opening_weekend = _text_by_testid(
        soup, 'title-boxoffice-openingweekenddomestic', 'span', item_class
    )

    # One-element lists so the result feeds pd.DataFrame.from_dict directly.
    return {
        'Genre': [genre],
        'Director(s)': [directors],
        'Writer(s)': [writers],
        'Stars': [stars],
        'IMDB Rating': [rating],
        'Metascore': [metascore],
        'Link to official Trailer': [trailer],
        'Production Companies': [companies],
        'Budget': [budget],
        'Opening weekend': [opening_weekend],
    }

# Title pages whose general information is scraped into combined_output.csv.
urls = [
    "https://www.imdb.com/title/tt0119174/?ref_=fn_al_tt_3",
    "https://www.imdb.com/title/tt2224026/?ref_=fn_al_tt_1",
    "https://www.imdb.com/title/tt0116483/?ref_=fn_al_tt_1",
    "https://www.imdb.com/title/tt0325980/?ref_=fn_al_tt_1",
    "https://www.imdb.com/title/tt1014759/?ref_=fn_al_tt_1",
    "https://www.imdb.com/title/tt0082406/?ref_=fn_al_tt_1",
    "https://www.imdb.com/title/tt0499448/?ref_=fn_al_tt_1",
    "https://www.imdb.com/title/tt0363771/?ref_=fn_al_tt_1",
]

# Build one single-row frame per page and concatenate once at the end,
# instead of re-copying the growing frame (seeded with an empty DataFrame)
# on every iteration.
frames = []
for url in urls:
    data = scrape_imdb_data(url)
    df = pd.DataFrame.from_dict(data)
    frames.append(df)

# pd.concat raises on an empty sequence, so keep the empty-frame result
# for an empty URL list.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

combined_df.to_csv('combined_output.csv', index=False)


In [None]:
# Movies summary

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_imdb_summary_and_synopsis(url):
    """Fetch a plot-summary page and pull out its first two text blocks.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with keys 'Summary' and 'Synopsis'. 'Summary' is the stripped
        text of the first element with class "ipc-html-content-inner-div",
        'Synopsis' the stripped text of the second one; each is None when
        the page has no such element at that position.
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
    response = requests.get(url, headers={"User-Agent": user_agent})
    page = BeautifulSoup(response.text, 'html.parser')
    blocks = page.find_all(class_="ipc-html-content-inner-div")

    summary = None
    synopsis = None
    if blocks:
        summary = blocks[0].get_text(strip=True)
    # NOTE(review): the second content block is taken to be the synopsis
    # purely by position - confirm against the page layout.
    if len(blocks) > 1:
        synopsis = blocks[1].get_text(strip=True)

    return {'Summary': summary, 'Synopsis': synopsis}

def scrape(urls, output_file='output_summary.csv'):
    """Scrape summary/synopsis for every URL and write the rows to a CSV.

    Args:
        urls: Iterable of page addresses, each passed to
            scrape_imdb_summary_and_synopsis.
        output_file: Path of the CSV file to write (no index column).
    """
    rows = [scrape_imdb_summary_and_synopsis(page_url) for page_url in urls]
    pd.DataFrame(rows).to_csv(output_file, index=False)

# Plot-summary pages to scrape; one output row is produced per entry, in this order.
urls = ["https://www.imdb.com/title/tt0119174/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2224026/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0116483/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2091935/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0108255/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt12412888/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt3794354/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0418279/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0092106/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt5090568/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1399103/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2109248/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt3371366/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1055369/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt4701182/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2293640/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt5113044/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0267913/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0331632/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt4131800/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0100758/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1291150/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt8589698/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt3949660/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0088885/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1046173/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt8404256/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1583421/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1979388/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0317705/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2096673/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0198781/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0266543/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2277860/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt3606756/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0382932/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1049413/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0114709/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0435761/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1217209/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1979376/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0120363/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2380307/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt3521164/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2294629/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt4520988/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0101414/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt2771200/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0110357/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt6105098/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0114148/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0120762/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0780521/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0398286/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt5028340/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0325980/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt1014759/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0082406/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0499448/plotsummary/?ref_=tt_stry_pl",
        "https://www.imdb.com/title/tt0363771/plotsummary/?ref_=tt_stry_pl"
]

# Scrape all pages above and write the Summary/Synopsis rows to the named CSV
# (overrides scrape's default output_file).
scrape(urls, output_file='output_summary_00.csv')

In [None]:
# Movies Awards

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_awards_data(urls):
    """Scrape award wins and nominations from a list of awards pages.

    Args:
        urls: Iterable of page addresses to download.

    Returns:
        A pandas DataFrame with one row per URL and the columns 'URL',
        'Awards' and 'Nominations'. Each award entry has the form
        "<summary text> : <category text>"; entries containing "Winner" go
        to 'Awards', the remaining ones containing "Nominee" go to
        'Nominations'. Entries within a cell are joined with ' ; '.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
    records = []

    for url in urls:
        response = requests.get(url, headers=request_headers)
        page = BeautifulSoup(response.text, 'html.parser')

        titles = page.find_all(class_='ipc-metadata-list-summary-item__t')
        categories = page.find_all(class_='ipc-metadata-list-summary-item__li awardCategoryName')

        # Pair the two result lists positionally; zip stops at the shorter.
        entries = [
            f"{title.text.strip()} : {category.text.strip()}"
            for title, category in zip(titles, categories)
        ]
        # "Winner" takes precedence when an entry contains both markers.
        won = [entry for entry in entries if "Winner" in entry]
        nominated = [
            entry for entry in entries
            if "Winner" not in entry and "Nominee" in entry
        ]

        records.append({
            'URL': url,
            'Awards': " ; ".join(won),
            'Nominations': " ; ".join(nominated),
        })

    return pd.DataFrame(records)

# Awards pages to scrape; one output row is produced per entry, in this order.
urls_to_scrape = [
    "https://www.imdb.com/title/tt0184761/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt10101702/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt0092723/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt0090799/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt0284364/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt1447981/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt2948372/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt0120855/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt5109280/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt0058331/awards/?ref_=tt_awd",
    "https://www.imdb.com/title/tt1014759/awards/?ref_=tt_awd",
]

# scrape_awards_data already loops over its argument and returns a single
# DataFrame with one row per URL, so pass the whole list in one call rather
# than wrapping each URL in a list and concatenating frame by frame.
combined_df = scrape_awards_data(urls_to_scrape)

print(combined_df)
combined_df.to_csv('awards_output_07.csv', index=False)

                                                  URL  \
0   https://www.imdb.com/title/tt0184761/awards/?r...   
1   https://www.imdb.com/title/tt10101702/awards/?...   
2   https://www.imdb.com/title/tt0092723/awards/?r...   
3   https://www.imdb.com/title/tt0090799/awards/?r...   
4   https://www.imdb.com/title/tt0284364/awards/?r...   
5   https://www.imdb.com/title/tt1447981/awards/?r...   
6   https://www.imdb.com/title/tt2948372/awards/?r...   
7   https://www.imdb.com/title/tt0120855/awards/?r...   
8   https://www.imdb.com/title/tt5109280/awards/?r...   
9   https://www.imdb.com/title/tt0058331/awards/?r...   
10  https://www.imdb.com/title/tt1014759/awards/?r...   

                                               Awards  \
0                                                       
1                                                       
2                                                       
3                                                       
4                  1991 Winner

In [None]:
# Bare expression: evaluates `combined_df` so the cell shows its value.
# NOTE(review): the captured output below has the general-information columns
# (plus an "Unnamed: 0" column), so this cell was presumably run while
# `combined_df` held that table rather than the awards table - confirm.
combined_df

Unnamed: 0,Genre,Director(s),Writer(s),Stars,IMDB Rating,Metascore,Link to official Trailer,Production Companies,Budget,Opening weekend
0,,David Fincher,John Brancato ; Michael Ferris,,,,https://www.imdb.com//video/vi3301942297/?ref_...,Polygram Filmed Entertainment ; Propaganda Fil...,"$50,000,000 (estimated)","$14,337,029 ; Sep 14, 1997"
1,,Tim Johnson,Tom J. Astle ; Matt Ember ; Adam Rex,,,,https://www.imdb.com//video/vi2222961433/?ref_...,DreamWorks Animation,"$135,000,000 (estimated)","$52,107,731 ; Mar 29, 2015"
2,,Dennis Dugan,Tim Herlihy ; Adam Sandler,,,,https://www.imdb.com//video/vi3306209817/?ref_...,Universal Pictures ; Brillstein-Grey Entertain...,"$12,000,000 (estimated)","$8,514,125 ; Feb 18, 1996"
3,,Gore Verbinski,Ted Elliott ; Terry Rossio ; Stuart Beattie,,,,https://www.imdb.com//video/vi2529559321/?ref_...,Walt Disney Pictures ; Jerry Bruckheimer Films,"$140,000,000 (estimated)","$46,630,690 ; Jul 13, 2003"
4,,Tim Burton,Linda Woolverton ; Lewis Carroll,,,,https://www.imdb.com//video/vi975438873/?ref_=...,Walt Disney Pictures ; Roth Films ; Team Todd,"$200,000,000 (estimated)","$116,101,023 ; Mar 7, 2010"
5,,Ted Berman ; Richard Rich ; Art Stevens,Daniel P. Mannix ; Larry Clemmons ; Ted Berman,,,,https://www.imdb.com//video/vi319208985/?ref_=...,Walt Disney Animation Studios ; Walt Disney Pr...,"$12,000,000 (estimated)","$4,819,215 ; Mar 27, 1988"
6,,Andrew Adamson,Andrew Adamson ; Christopher Markus ; Stephen ...,,,,https://www.imdb.com//video/vi1793261849/?ref_...,Walt Disney Pictures ; Walden Media ; Ozumi Films,"$225,000,000 (estimated)","$55,034,805 ; May 18, 2008"
7,,Andrew Adamson,Ann Peacock ; Andrew Adamson ; Christopher Markus,,,,https://www.imdb.com//video/vi3727140377/?ref_...,Walt Disney Pictures ; Walden Media,"$180,000,000 (estimated)","$65,556,312 ; Dec 11, 2005"
