# MOVIE ANALYSIS
In starting this independent research project, my aim is to retrieve Rotten Tomato scores for movies and shows on streaming platforms.  

#### TABLE OF CONTENTS
[Part 1: Library/Module Import](#Part-1:-Library/Module-Import) 

[Part 2: Extract & Load Data](#Part-2:-Extract-&-Load-Data)

[Part 3: Quick Data Inspection](#Part-3:-Quick-Data-Inspection)

[Part 4: Clean data](#Part-4:-Clean-data)

[Part 5: EDA](#Part-5:-EDA)

[Part 6: Regression](#Part-6:-Regression)

#### Sources: 
1. Steps for data extraction: https://medium.com/geekculture/how-to-collect-data-from-imdb-explore-it-b669f56b7dfb
2. IMDb: 

# Part 1: Library/Module Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm 

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from bs4 import BeautifulSoup
from requests import get

# Part 2: Extract & Load Data

In [2]:
#grabbing all the data in the site
rotten_tomato_response = get('https://www.rottentomatoes.com/browse/movies_at_home/?page=5')

#passing the site's HTML to beautifulsoup to parse the data so scraping is easy
soup = BeautifulSoup(rotten_tomato_response.text, 'html.parser')

#initializing lists to store the parsed values from the loop below
titles = []
dates = []
audience_scores = []
tomato_meters = []

#finding all movie containers from the site
#wrong code: movies = soup.find_all('div', class_ = 'movie_info')
movies = soup.find_all('a', {'data-track': 'scores'})


#iterating through items of interest to get titles, dates, and scores
for movie in movies:
    title = movie.find('span', {'data-qa': 'discovery-media-list-item-title'}).text.strip()
    date = movie.find('span', {'data-qa': 'discovery-media-list-item-start-date'}).text.strip()
    
    scores = movie.find('score-pairs-deprecated')
    audience_score = scores['audiencescore']
    tomato_meter = scores['criticsscore']
    
    titles.append(title)
    dates.append(date)
    audience_scores.append(audience_score)
    tomato_meters.append(tomato_meter)

    
#creating a dataframe with info from above
data = {'title': titles, 'stream_release_date': dates, 'audience_score': audience_scores, 'tomato_meter': tomato_meters}
rotten_tomato_df = pd.DataFrame(data)

rotten_tomato_df

Unnamed: 0,title,stream_release_date,audience_score,tomato_meter
0,Poor Things,"Streaming Feb 27, 2024",79,92
1,Monster,"Streaming Feb 27, 2024",92,96
2,Lisa Frankenstein,"Streaming Feb 27, 2024",82,51
3,Out of Darkness,"Streaming Feb 27, 2024",54,85
4,"What the Hell Happened to Blood, Sweat & Tears?","Streaming Feb 27, 2024",,82
...,...,...,...,...
106,Lift,"Streaming Jan 12, 2024",31,29
107,Strays,"Streaming Sep 5, 2023",69,53
108,Blade Runner 2049,"Streaming Dec 26, 2017",88,88
109,DarkGame,"Streaming Feb 20, 2024",80,


In [3]:
rotten_tomato_df['title'].unique()

array(['Poor Things', 'Monster', 'Lisa Frankenstein', 'Out of Darkness',
       'What the Hell Happened to Blood, Sweat & Tears?',
       'As We Speak: Rap Music on Trial',
       'Pathological: The Lies of Joran Van Der Sloot',
       'The Irritable Heart', 'Runaway Radio', 'Code 8: Part II',
       'Butchers Book Two: Raghorn', 'Spaceman',
       'Megamind vs. the Doom Syndicate', "Amelia's Children",
       'Sex-Positive', 'Scrambled', 'Dune', 'The Zone of Interest',
       'All of Us Strangers', 'Mea Culpa', 'Anyone But You',
       'Double Blind', 'American Fiction', 'Anatomy of a Fall',
       'The Last Airbender', 'The Beekeeper', 'The Holdovers',
       'Aquaman and the Lost Kingdom', 'Oppenheimer', 'The Promised Land',
       'Wonka', 'The Iron Claw', 'This Is Me... Now: A Love Story',
       'Killers of the Flower Moon', 'Mean Girls', 'Priscilla',
       'Past Lives', 'Thanksgiving', 'Lover, Stalker, Killer', 'Saltburn',
       'Memory', 'The Marvels', 'Red Right Hand', 'Upgr

In [43]:
def make_url_ending(df):
    url_endings = []
    for i, row in df.iterrows():
        title = row['title'].lower().replace(' ', '_').replace(':', '_').replace('-', '_').replace(
            '?', '').replace('...', '_').replace('__', '_').replace("'", '').replace('.', '').replace(
            ',', '').replace('&', 'and')
        url_endings.append(title)
    return url_endings
        
make_url_ending(rotten_tomato_df)

#scrape_movie_page(url)

['poor_things',
 'monster',
 'lisa_frankenstein',
 'out_of_darkness',
 'what_the_hell_happened_to_blood_sweat_and_tears',
 'as_we_speak_rap_music_on_trial',
 'pathological_the_lies_of_joran_van_der_sloot',
 'the_irritable_heart',
 'runaway_radio',
 'code_8_part_ii',
 'butchers_book_two_raghorn',
 'spaceman',
 'megamind_vs_the_doom_syndicate',
 'amelias_children',
 'sex_positive',
 'scrambled',
 'dune',
 'the_zone_of_interest',
 'all_of_us_strangers',
 'mea_culpa',
 'anyone_but_you',
 'double_blind',
 'american_fiction',
 'anatomy_of_a_fall',
 'the_last_airbender',
 'the_beekeeper',
 'the_holdovers',
 'aquaman_and_the_lost_kingdom',
 'oppenheimer',
 'the_promised_land',
 'wonka',
 'the_iron_claw',
 'this_is_me_now_a_love_story',
 'killers_of_the_flower_moon',
 'mean_girls',
 'priscilla',
 'past_lives',
 'thanksgiving',
 'lover_stalker_killer',
 'saltburn',
 'memory',
 'the_marvels',
 'red_right_hand',
 'upgraded',
 'code_8',
 'napoleon',
 'migration',
 'leave_the_world_behind',
 'ferrar

In [44]:
def make_scraping_url(df):
    urls = []
    url_endings = make_url_ending(df)
    for url_ending in url_endings: 
        url = 'https://www.rottentomatoes.com/m/' + url_ending
        urls.append(url)
    return urls

urls = make_scraping_url(rotten_tomato_df)
urls

['https://www.rottentomatoes.com/m/poor_things',
 'https://www.rottentomatoes.com/m/monster',
 'https://www.rottentomatoes.com/m/lisa_frankenstein',
 'https://www.rottentomatoes.com/m/out_of_darkness',
 'https://www.rottentomatoes.com/m/what_the_hell_happened_to_blood_sweat_and_tears',
 'https://www.rottentomatoes.com/m/as_we_speak_rap_music_on_trial',
 'https://www.rottentomatoes.com/m/pathological_the_lies_of_joran_van_der_sloot',
 'https://www.rottentomatoes.com/m/the_irritable_heart',
 'https://www.rottentomatoes.com/m/runaway_radio',
 'https://www.rottentomatoes.com/m/code_8_part_ii',
 'https://www.rottentomatoes.com/m/butchers_book_two_raghorn',
 'https://www.rottentomatoes.com/m/spaceman',
 'https://www.rottentomatoes.com/m/megamind_vs_the_doom_syndicate',
 'https://www.rottentomatoes.com/m/amelias_children',
 'https://www.rottentomatoes.com/m/sex_positive',
 'https://www.rottentomatoes.com/m/scrambled',
 'https://www.rottentomatoes.com/m/dune',
 'https://www.rottentomatoes.com/

In [45]:
#grabbing all the data in the site
#movie_response = get('https://www.rottentomatoes.com/m/poor_things')

#passing the site's HTML to beautifulsoup to parse the data so scraping is easy
#soup = BeautifulSoup(movie_response.text, 'html.parser')

def scrape_movie_page(urls):
    movie_info_dfs = []
#initializing lists to store the parsed values
    titles = []
    descriptions = []
    genres = []
    ratings = []
    original_languages = []
    directors = []
    producers = []
    writers = []
    theater_releases = []
    us_box_offices = []
    runtimes = []
    distributors = []
    prod_companies = []
    sound_mixes = []
    aspect_ratios = []
  #looping through the list of URL's  
    for url in urls:
        response = get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        #title
        title_label = soup.find('h1', {'data-qa': 'score-panel-title'})
        if title_label: 
            title_value = title_label.text.strip()
        else:
            title_value = None
        titles.append(title_value)
    
        #description
        find_description = soup.find('p', {'data-qa': 'movie-info-synopsis'})
        if find_description: 
            description = find_description.text.strip()
        else:
            description = None
        descriptions.append(description)

        #genre
        genre_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Genre:')
        genre_value = None
        if genre_label: 
            genre_value = genre_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        genres.append(genre_value)

        #rating 
        rating_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Rating:')
        rating_value = None
        if rating_label: 
            rating_value = rating_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        ratings.append(rating_value)

        #original language
        language_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Original Language:')
        language_value = None
        if language_label: 
            language_value = language_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        original_languages.append(language_value)

        #director
        director_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Director:')
        director_value = None
        if director_label: 
            director_value = director_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        directors.append(director_value)

        #producer    
        producer_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Producer:')
        producer_value = None
        if producer_label: 
            producer_value = producer_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        producers.append(producer_value)
    
        #writer
        writer_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Writer:')
        writer_value = None
        if writer_label: 
            writer_value = writer_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip() 
        writers.append(writer_value)  
    
        #theater release  
        theater_release_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Release Date (Theaters):')
        theater_release_value = None
        if theater_release_label: 
            theater_release_value = theater_release_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        theater_releases.append(theater_release_value)
    
        #box office  
        us_box_office_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Box Office (Gross USA):')
        us_box_office_value = None
        if us_box_office_label: 
            us_box_office_value = us_box_office_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        us_box_offices.append(us_box_office_value)
    
        #trt
        runtime_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Runtime:')
        runtime_value = None
        if runtime_label: 
            runtime_value = runtime_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        runtimes.append(runtime_value)
    
        #distributors
        distributor_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Distributor:')
        distributor_value = None
        if distributor_label: 
            distributor_value = distributor_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        distributors.append(distributor_value) 

        #production comapnies
        prod_company_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Production Co:')
        prod_company_value = None
        if prod_company_label: 
            prod_company_value = prod_company_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        prod_companies.append(prod_company_value) 

        #sound mixing tech
        sound_mix_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Sound Mix:')
        sound_mix_value = None
        if sound_mix_label: 
            sound_mix_value = sound_mix_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        sound_mixes.append(sound_mix_value)
    
        #ratio 
        aspect_ratio_label = soup.find('b', {'data-qa': 'movie-info-item-label'}, string='Aspect Ratio:')
        aspect_ratio_value = None
        if aspect_ratio_label: 
            aspect_ratio_value = aspect_ratio_label.find_next_sibling('span', {'data-qa': 'movie-info-item-value'}).text.strip()
        aspect_ratios.append(aspect_ratio_value)
    
        #creating a dataframe with info from above
    info_data = {'title': titles, 'description': descriptions, 'genres': genres, 'mpa_ratings': ratings, 
        'og_language': original_languages, 'director(s)': directors, 'producer(s)': producers,
       'writer(s)': writers, 'theater_release_date': theater_releases, 'domestic_box': us_box_offices,
       'runtime': runtimes, 'distributing_co': distributors, 'production_co': prod_companies,
       'sound_mix': sound_mixes, 'aspect_ratio': aspect_ratios}
        
    movie_info_df = pd.DataFrame(info_data)
    return movie_info_df

In [46]:
movie_info = scrape_movie_page(urls)
movie_info

Unnamed: 0,title,description,genres,mpa_ratings,og_language,director(s),producer(s),writer(s),theater_release_date,domestic_box,runtime,distributing_co,production_co,sound_mix,aspect_ratio
0,Poor Things,From filmmaker Yorgos Lanthimos and producer E...,"Comedy, \n \n ...",R (Gore|Disturbing Material|Graphic Nudity|Lan...,English,Yorgos Lanthimos,"Yorgos Lanthimos, \n ...",Tony McNamara,"Dec 8, 2023\n limited",$32.9M,2h 21m,Searchlight Pictures,Element Pictures,Dolby Digital,Flat (1.66:1)
1,Monster,"Shortly after moving to Florida, longtime pros...","Biography, \n \n ...",R,English,Patty Jenkins,"Mark Damon, \n ...",Patty Jenkins,"Jan 30, 2004\n wide",$34.5M,1h 49m,Newmarket Film Group,"K/W Productions, \n ...","Dolby SR, DTS, Surround",Flat (1.85:1)
2,Lisa Frankenstein,A coming of RAGE love story from acclaimed wri...,"Horror, \n \n ...",PG-13 (Violent Content|Sexual Assault|Language...,English,Zelda Williams,"Mason Novick, \n ...",Diablo Cody,"Feb 9, 2024\n wide",$9.4M,1h 41m,Focus Features,MXN Entertainment,Dolby Digital,Flat (1.85:1)
3,Out of Darkness,Dr. Albert Schweitzer and his wife face a vari...,Drama,PG,English,Gray Hofmeyr,,,,,1h 29m,,,Surround,
4,"What the Hell Happened to Blood, Sweat & Tears?","Blood, Sweat & Tears, known for hits such as ""...","Documentary, \n \n ...",,English,John Scheinfeld,"Dave Harding, \n ...",John Scheinfeld,"Mar 24, 2023\n limited",,1h 52m,Abramorama,Jesse James Films,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,Lift,A routine working day turns unusual for Guru a...,"Mystery & thriller, \n ...",,Tamil,Vineeth,,,,,2h 14m,,,,
107,Strays,A criminal (Vin Diesel) tries to change his wa...,Drama,R (Drug Material|Strong Language|Some Sexual C...,English,Vin Diesel,,Vin Diesel,,,1h 45m,,One Race Productions,,
108,Blade Runner 2049,"Officer K (Ryan Gosling), a new blade runner f...","Sci-fi, \n \n ...",R (Nudity|Language|Some Sexuality|Violence),English,Denis Villeneuve,"Andrew A. Kosove, \n ...","Hampton Fancher, \n ...","Oct 6, 2017\n wide",$91.5M,2h 44m,Warner Bros. Pictures,"Torridon Films, \n ...",Dolby Atmos,Scope (2.35:1)
109,DarkGame,A police detective races to save the captives ...,"Horror, \n \n ...",,English (United Kingdom),Howard J. Ford,Tom George,"Gary Grant, \n ...",,,1h 41m,Gravitas Ventures,"Happy Hour Films, \n ...",,


In [47]:
movie_info.head()

Unnamed: 0,title,description,genres,mpa_ratings,og_language,director(s),producer(s),writer(s),theater_release_date,domestic_box,runtime,distributing_co,production_co,sound_mix,aspect_ratio
0,Poor Things,From filmmaker Yorgos Lanthimos and producer E...,"Comedy, \n \n ...",R (Gore|Disturbing Material|Graphic Nudity|Lan...,English,Yorgos Lanthimos,"Yorgos Lanthimos, \n ...",Tony McNamara,"Dec 8, 2023\n limited",$32.9M,2h 21m,Searchlight Pictures,Element Pictures,Dolby Digital,Flat (1.66:1)
1,Monster,"Shortly after moving to Florida, longtime pros...","Biography, \n \n ...",R,English,Patty Jenkins,"Mark Damon, \n ...",Patty Jenkins,"Jan 30, 2004\n wide",$34.5M,1h 49m,Newmarket Film Group,"K/W Productions, \n ...","Dolby SR, DTS, Surround",Flat (1.85:1)
2,Lisa Frankenstein,A coming of RAGE love story from acclaimed wri...,"Horror, \n \n ...",PG-13 (Violent Content|Sexual Assault|Language...,English,Zelda Williams,"Mason Novick, \n ...",Diablo Cody,"Feb 9, 2024\n wide",$9.4M,1h 41m,Focus Features,MXN Entertainment,Dolby Digital,Flat (1.85:1)
3,Out of Darkness,Dr. Albert Schweitzer and his wife face a vari...,Drama,PG,English,Gray Hofmeyr,,,,,1h 29m,,,Surround,
4,"What the Hell Happened to Blood, Sweat & Tears?","Blood, Sweat & Tears, known for hits such as ""...","Documentary, \n \n ...",,English,John Scheinfeld,"Dave Harding, \n ...",John Scheinfeld,"Mar 24, 2023\n limited",,1h 52m,Abramorama,Jesse James Films,,


In [39]:
#ensure no null titles are present
movie_info[movie_info['title'] == 'None']

Unnamed: 0,title,description,genres,mpa_ratings,og_language,director(s),producer(s),writer(s),theater_release_date,domestic_box,runtime,distributing_co,production_co,sound_mix,aspect_ratio


In [50]:
def scrape_movie_page(url):
    # Initialize lists to store parsed values
    descriptions = []
    genres = []
    ratings = []
    original_languages = []
    directors = []
    producers = []
    writers = []
    theater_releases = []
    us_box_offices = []
    runtimes = []
    distributors = []
    prod_companies = []
    sound_mixes = []
    aspect_ratios = []

    # Fetch the HTML content of the movie page
    movie_response = get(url)
    soup = BeautifulSoup(movie_response.text, 'html.parser')

    # Extract information from the HTML
    description = soup.find('p', {'data-qa': 'movie-info-synopsis'}).text.strip()
    descriptions.append(description)

    # Extract other attributes similarly (use try-except to handle missing values)

    # Create DataFrame
    data = {
        'Description': descriptions,
        'Rating': ratings,
        'Genre': genres,
        'Original Language': original_languages,
        'Director': directors,
        'Producer': producers,
        'Writer': writers,
        'Theater Release Date': theater_releases,
        'US Box Office': us_box_offices,
        'Runtime': runtimes,
        'Distributor': distributors,
        'Production Company': prod_companies,
        'Sound Mix': sound_mixes,
        'Aspect Ratio': aspect_ratios
    }
    return pd.DataFrame(data)

def scrape_rotten_tomatoes(url):
    # Initialize lists to store parsed values
    titles = []
    dates = []
    audience_scores = []
    tomato_meters = []

    # Fetch the HTML content of the streaming page
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    movies = soup.find_all('a', {'data-track': 'scores'})

    # Iterate through items of interest to get titles, dates, and scores
    for movie in movies:
        title = movie.find('span', {'data-qa': 'discovery-media-list-item-title'}).text.strip()
        date = movie.find('span', {'data-qa': 'discovery-media-list-item-start-date'}).text.strip()

        scores = movie.find('score-pairs-deprecated')
        audience_score = scores['audiencescore']
        tomato_meter = scores['criticsscore']

        titles.append(title)
        dates.append(date)
        audience_scores.append(audience_score)
        tomato_meters.append(tomato_meter)

    # Create DataFrame
    data = {
        'Title': titles,
        'Stream Release Date': dates,
        'Audience Score': audience_scores,
        'Tomato Meter': tomato_meters
    }
    rotten_tomato_df = pd.DataFrame(data)

    # Scrape individual movie pages and concatenate the results
    movie_info_dfs = []
    for index, row in rotten_tomato_df.iterrows():
        movie_url = f"https://www.rottentomatoes.com/m/{row['Title'].replace(' ', '_')}"
        movie_info_df = scrape_movie_page(movie_url)
        movie_info_dfs.append(movie_info_df)
    
    return pd.concat(movie_info_dfs, ignore_index=True)

# Scrape Rotten Tomatoes streaming page and get movie information
url = 'https://www.rottentomatoes.com/browse/movies_at_home/?page=5'
rotten_tomato_df = scrape_rotten_tomatoes(url)
print(rotten_tomato_df)

ValueError: All arrays must be of the same length

# Part 3: Quick Data Inspection

# Part 4: Clean data

# Part 5: EDA

# Part 6: Regression