In [1]:

# Import libraries


# Data manipulation and analysis
import pandas as pd  # pandas is used for handling and processing data in DataFrame structures
import numpy as np  # numpy is useful for numerical computations and handling arrays
import gzip  # gzip is for handling compressed files

# Data visualization
import matplotlib.pyplot as plt  # matplotlib is used for creating static, interactive, and animated visualizations
import seaborn as sns  # seaborn provides a high-level interface for drawing attractive statistical graphics

# Database interaction
import sqlite3  # sqlite3 is used to connect to SQLite databases
import nbconvert  # nbconvert is used to convert Jupyter Notebooks into various formats
import os
import re

# Set visualization style
sns.set_theme(style="whitegrid")



## 1. **Box Office Mojo Data**

**Overview**: This dataset provides box office revenue and studio-related details.

- **Shape**: 3,387 rows and 5 columns.

### Columns:
- **title**: Movie title (non-null).
- **studio**: Studio responsible for the movie (5 missing values).
- **domestic_gross**: Domestic gross earnings (28 missing values).
- **foreign_gross**: Foreign gross earnings (1,350 missing values, stored as strings).
- **year**: Release year (non-null).

### Key Issues:
- `foreign_gross` is stored as strings, requiring conversion to numeric format.
- Missing values in the `studio` and revenue columns.

In [2]:
# Define the path to your raw zipped data
file_path = 'C:/Users/USER/Desktop/Movie-Project/data/raw/zippedData/bom.movie_gross.csv.gz'

# Load the gzipped CSV directly
bom_gross = pd.read_csv(file_path, compression='gzip')

# Display the first few rows of the data
display(bom_gross.head())
bom_gross.dtypes

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


title              object
studio             object
domestic_gross    float64
foreign_gross      object
year                int64
dtype: object

## 2. **The Numbers Data**

**Overview**: Focuses on production budgets, domestic, and worldwide gross revenue.

- **Shape**: 5,782 rows and 6 columns.

### Columns:
- **id**: Unique identifier for each movie (non-null).
- **release_date**: Movie release date (non-null).
- **movie**: Movie title (non-null).
- **production_budget**: Production budget (stored as strings with commas, requires conversion).
- **domestic_gross** and **worldwide_gross**: Revenue columns stored as strings with commas.

### Key Issues:
- All revenue columns and budgets are in string format, requiring numeric conversion.
- No missing values, but the format needs cleaning for analysis.

---

In [3]:
# Load The Numbers (movie budgets) dataset
tn_budgets = pd.read_csv('C:/Users/USER/Desktop/Movie-Project/data/raw/zippedData/tn.movie_budgets.csv.gz', compression='gzip') 
print("The Numbers Data:")
print(tn_budgets.info())  # Get an overview of the dataset
display(tn_budgets.head())  # Display the first few rows

The Numbers Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB
None


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


## 3. **Rotten Tomatoes Reviews Data**

**Overview**: Contains reviews, ratings, and publisher information for movies.

- **Shape**: 54,432 rows and 8 columns.

### Columns:
- **id**: Unique identifier for movies (non-null).
- **review**: Textual review (5,563 missing values).
- **rating**: Rating given by critics (13,517 missing values).
- **fresh**: Whether the review is "fresh" or "rotten" (non-null).
- **critic**: Name of the critic (2,722 missing values).
- **top_critic**: Binary flag for top critics (non-null).
- **publisher**: Publisher of the review (309 missing values).
- **date**: Date of the review (non-null).

### Key Issues:
- Missing values in `review`, `rating`, and `critic` columns.
- Some columns may not directly impact the analysis depending on objectives.

---

In [4]:
# Load Rotten Tomatoes Reviews dataset
rt_reviews = pd.read_csv('C:/Users/USER/Desktop/Movie-Project/data/raw/zippedData/rt.reviews.tsv.gz', compression='gzip', sep='\t', encoding='latin-1') 
print("Rotten Tomatoes Reviews Data:")
print(rt_reviews.info())  # Get an overview of the dataset
display(rt_reviews.head(), "\n")  # Display the first few rows

Rotten Tomatoes Reviews Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB
None


Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


'\n'

## 4. **Rotten Tomatoes Movie Info Data**

**Overview**: Provides additional metadata such as genres, directors, runtime, and box office data.

- **Shape**: 1,560 rows and 12 columns.

### Columns:
- **id**: Unique identifier (non-null).
- **synopsis**: Movie synopsis (62 missing values).
- **rating**: MPAA rating (3 missing values).
- **genre**: Movie genre (8 missing values).
- **director**: Director name (199 missing values).
- **writer**: Writer name (449 missing values).
- **theater_date**: Theater release date (359 missing values).
- **dvd_date**: DVD release date (359 missing values).
- **currency** and **box_office**: Currency type and box office earnings (non-null values are very sparse).
- **runtime**: Runtime of the movie (30 missing values).
- **studio**: Studio responsible (sparse).

### Key Issues:
- High number of missing values in `studio`, `currency`, and `box_office`.
- Sparse data may limit the usability of certain columns in the analysis.

---

In [5]:
# Load Rotten Tomatoes Movie Info dataset
rt_info = pd.read_csv('C:/Users/USER/Desktop/Movie-Project/data/raw/zippedData/rt.movie_info.tsv.gz', compression='gzip', sep='\t') 
print("Rotten Tomatoes Movie Info Data:")
print(rt_info.info())  # Get an overview of the dataset
display(rt_info.head(), "\n")  # Display the first few rows

Rotten Tomatoes Movie Info Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB
None


Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


'\n'

### . **TMDB Dataset**

tmdb.movies.csv.gz
Columns: Unnamed: 0, genre_ids, id, original_language, original_title, popularity, release_date, title, vote_average, vote_count
Issues:
Unnamed: 0 appears to be an unnecessary index column.
Check if release_date is properly formatted (likely needs conversion to datetime)

In [6]:
# Load TMDB dataset
tmdb_movies = pd.read_csv('C:/Users/USER/Desktop/Movie-Project/data/raw/zippedData/tmdb.movies.csv.gz', compression='gzip') 
print("TheMovieDB Data:")
print(tmdb_movies.info())  # Get an overview of the dataset
display(tmdb_movies.head(), "\n")  # Display the first few rows

TheMovieDB Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26517 non-null  int64  
 1   genre_ids          26517 non-null  object 
 2   id                 26517 non-null  int64  
 3   original_language  26517 non-null  object 
 4   original_title     26517 non-null  object 
 5   popularity         26517 non-null  float64
 6   release_date       26517 non-null  object 
 7   title              26517 non-null  object 
 8   vote_average       26517 non-null  float64
 9   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB
None


Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


'\n'

## **DATAA CLEANING**

### 1. **Box Office Mojo Data**

In [7]:
# Replace missing 'studio' with 'Unknown'
bom_gross['studio'] = bom_gross['studio'].fillna('Unknown')

# Convert 'foreign_gross' to numeric by removing non-numeric characters
bom_gross['foreign_gross'] = bom_gross['foreign_gross'].replace('[^0-9]', '', regex=True).astype(float)

# Drop rows with missing `foreign_gross` and `rating_score`
bom_gross.dropna(subset=['foreign_gross'], inplace=True)



In [8]:
# Convert 'year' column to integer type, handling non-convertible values
bom_gross['year'] = pd.to_numeric(bom_gross['year'], errors='coerce').astype('Int64')

# Drop duplicate rows
bom_gross = bom_gross.drop_duplicates()



In [9]:
# Display the first few rows of the data
display(bom_gross.head())
# Check for missing values
print(bom_gross.isnull().sum())

bom_gross.info()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000.0,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000.0,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000.0,2010
3,Inception,WB,292600000.0,535700000.0,2010
4,Shrek Forever After,P/DW,238700000.0,513900000.0,2010


title              0
studio             0
domestic_gross    28
foreign_gross      0
year               0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 2037 entries, 0 to 3353
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           2037 non-null   object 
 1   studio          2037 non-null   object 
 2   domestic_gross  2009 non-null   float64
 3   foreign_gross   2037 non-null   float64
 4   year            2037 non-null   Int64  
dtypes: Int64(1), float64(2), object(2)
memory usage: 97.5+ KB


### 2. **The Numbers Data**

In [10]:
# Remove '$' and ',' from financial columns and convert them to numeric
for col in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    tn_budgets[col] = tn_budgets[col].replace(r'[\$,]', '', regex=True).astype(float)

# Convert 'release_date' to datetime
tn_budgets['release_date'] = pd.to_datetime(tn_budgets['release_date'], errors='coerce')


In [11]:
tn_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   movie              5782 non-null   object        
 3   production_budget  5782 non-null   float64       
 4   domestic_gross     5782 non-null   float64       
 5   worldwide_gross    5782 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 271.2+ KB


### 3. **TMDB Dataset**

In [12]:
# Drop the unnecessary 'Unnamed: 0' column
tmdb_movies.drop(columns=['Unnamed: 0'], inplace=True)

# Convert 'release_date' to datetime
tmdb_movies['release_date'] = pd.to_datetime(tmdb_movies['release_date'], errors='coerce')


In [13]:
tmdb_movies.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


### 4. **Rotten Tomatoes Reviews Data**

In [14]:
# Drop rows where `review` or `rating` is missing
rt_reviews.dropna(subset=['review', 'rating'], inplace=True)

# Fill missing `critic` and `publisher` with "Unknown"
rt_reviews['critic'] = rt_reviews['critic'].fillna('Unknown')
rt_reviews['publisher'] = rt_reviews['publisher'].fillna('Unknown')


# Convert `date` column to datetime
rt_reviews['date'] = pd.to_datetime(rt_reviews['date'], errors='coerce')

In [15]:
# Convert Data Types
# Parse `rating` to extract numeric scores (e.g., '3/5' -> 3.0)
def parse_rating(rating):
    try:
        return float(rating.split('/')[0]) if '/' in rating else None
    except:
        return None

rt_reviews['rating'] = rt_reviews['rating'].apply(parse_rating)

In [16]:
# Simplify `fresh` to Binary
rt_reviews['fresh'] = rt_reviews['fresh'].apply(lambda x: 1 if x == 'fresh' else 0)


In [17]:
# Drop rows with missing  `rating_score`
rt_reviews.dropna(subset=['rating'], inplace=True)

In [18]:
# Remove Duplicates
rt_reviews.drop_duplicates(inplace=True)

In [19]:
# Rename Columns to snake_case
rt_reviews.rename(columns={
    'id': 'review_id',
    'review': 'review_text',
    'rating': 'rating_score',
    'fresh': 'is_fresh',
    'critic': 'critic_name',
    'top_critic': 'is_top_critic',
    'publisher': 'publisher_name',
    'date': 'review_date'
}, inplace=True)

In [20]:
print("Rotten Tomatoes Reviews Data:")
print(rt_reviews.info())  # Get an overview of the dataset
display(rt_reviews.head(), "\n")  # Display the first few rows

Rotten Tomatoes Reviews Data:
<class 'pandas.core.frame.DataFrame'>
Index: 28760 entries, 0 to 54424
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   review_id       28760 non-null  int64         
 1   review_text     28760 non-null  object        
 2   rating_score    28760 non-null  float64       
 3   is_fresh        28760 non-null  int64         
 4   critic_name     28760 non-null  object        
 5   is_top_critic   28760 non-null  int64         
 6   publisher_name  28760 non-null  object        
 7   review_date     28760 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3), object(3)
memory usage: 2.0+ MB
None


Unnamed: 0,review_id,review_text,rating_score,is_fresh,critic_name,is_top_critic,publisher_name,review_date
0,3,A distinctly gallows take on contemporary fina...,3.0,1,PJ Nabarro,0,Patrick Nabarro,2018-11-10
7,3,Cronenberg is not a director to be daunted by ...,2.0,0,Matt Kelemen,0,Las Vegas CityLife,2013-04-21
12,3,Robert Pattinson works mighty hard to make Cos...,2.0,0,Christian Toto,0,Big Hollywood,2013-01-15
14,3,For those who like their Cronenberg thick and ...,3.0,1,Marty Mapes,0,Movie Habit,2012-10-20
15,3,For better or worse - often both - Cosmopolis ...,3.0,1,Adam Ross,0,The Aristocrat,2012-09-27


'\n'

### 5. **Rotten Tomatoes Movie Info Data**

In [21]:
# Replace all values in the 'currency' column with "USD"
rt_info['currency'] = 'USD'

# Handle missing values
rt_info['synopsis'].fillna('No synopsis available', inplace=True)
rt_info['rating'].fillna('Unknown', inplace=True)
rt_info['genre'].fillna('Unknown', inplace=True)
rt_info['director'].fillna('Unknown', inplace=True)
rt_info['writer'].fillna('Unknown', inplace=True)
rt_info['studio'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rt_info['synopsis'].fillna('No synopsis available', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rt_info['rating'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [22]:

# Convert to datetime and coerce invalid dates to NaT
rt_info['theater_date'] = pd.to_datetime(rt_info['theater_date'], errors='coerce')
rt_info['dvd_date'] = pd.to_datetime(rt_info['dvd_date'], errors='coerce')

# Clean 'runtime' to extract numerical values
rt_info['runtime'] = rt_info['runtime'].str.extract('(\d+)').astype(float)


  rt_info['runtime'] = rt_info['runtime'].str.extract('(\d+)').astype(float)


In [23]:
# Clean 'box_office' to extract numerical values
rt_info['box_office'] = rt_info['box_office'].replace('[\$,]', '', regex=True).astype(float)

# Drop rows where 'box_office' or 'runtime' is still NaN if they are crucial for analysis
rt_info.dropna(subset=['box_office', 'runtime'], inplace=True)


  rt_info['box_office'] = rt_info['box_office'].replace('[\$,]', '', regex=True).astype(float)


In [24]:
# Drop rows where 'theater_date' or 'dvd_date' is NaT 
rt_info.dropna(subset=['theater_date', 'dvd_date'], inplace=True)

In [25]:
# Reset the index after dropping rows
rt_info.reset_index(drop=True, inplace=True)


# Display cleaned data info
print("Cleaned Rotten Tomatoes Movie Info Data:")
print(rt_info.info())
display(rt_info.head())

Cleaned Rotten Tomatoes Movie Info Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            332 non-null    int64         
 1   synopsis      332 non-null    object        
 2   rating        332 non-null    object        
 3   genre         332 non-null    object        
 4   director      332 non-null    object        
 5   writer        332 non-null    object        
 6   theater_date  332 non-null    datetime64[ns]
 7   dvd_date      332 non-null    datetime64[ns]
 8   currency      332 non-null    object        
 9   box_office    332 non-null    float64       
 10  runtime       332 non-null    float64       
 11  studio        332 non-null    object        
dtypes: datetime64[ns](2), float64(2), int64(1), object(7)
memory usage: 31.3+ KB
None


Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,2013-01-01,USD,600000.0,108.0,Entertainment One
1,10,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,2002-01-11,2002-06-18,USD,41032915.0,82.0,Paramount Pictures
2,13,"Stewart Kane, an Irishman living in the Austra...",R,Drama,Ray Lawrence,Raymond Carver|Beatrix Christian,2006-04-27,2007-10-02,USD,224114.0,123.0,Sony Pictures Classics
3,14,"""Love Ranch"" is a bittersweet love story that ...",R,Drama,Taylor Hackford,Mark Jacobson,2010-06-30,2010-11-09,USD,134904.0,117.0,Unknown
4,22,Two-time Academy Award Winner Kevin Spacey giv...,R,Comedy|Drama|Mystery and Suspense,George Hickenlooper,Norman Snider,2010-12-17,2011-04-05,USD,1039869.0,108.0,ATO Pictures


In [26]:
# Function to normalize titles
def normalize_title(title):
    if isinstance(title, str):
        title = title.strip().lower()  # Remove whitespace and convert to lowercase
        title = re.sub(r'[^a-z0-9\s]', '', title)  # Remove special characters
    return title

# Apply normalization to title columns
bom_gross['title_normalized'] = bom_gross['title'].apply(normalize_title)
tn_budgets['title_normalized'] = tn_budgets['movie'].apply(normalize_title)
tmdb_movies['title_normalized'] = tmdb_movies['title'].apply(normalize_title)


In [27]:
#Check Overlap Between bom_gross and tmdb_movies:
matched_titles = bom_gross['title_normalized'].isin(tmdb_movies['title_normalized']).sum()
print(f"Number of matched titles between bom_gross and tmdb_movies: {matched_titles}")

Number of matched titles between bom_gross and tmdb_movies: 1574


In [28]:
#Check Overlap Between tn_budgets and tmdb_movies:
matched_titles = tn_budgets['title_normalized'].isin(tmdb_movies['title_normalized']).sum()
print(f"Number of matched titles between tn_budgets and tmdb_movies: {matched_titles}")

Number of matched titles between tn_budgets and tmdb_movies: 2084


In [29]:
#Check for duplicates in title_normalized:

display("Duplicates in bom_gross:", bom_gross[bom_gross['title_normalized'].duplicated()])
display("Duplicates in tn_budgets:", tn_budgets[tn_budgets['title_normalized'].duplicated()])
display("Duplicates in tmdb_movies:", tmdb_movies[tmdb_movies['title_normalized'].duplicated()])

'Duplicates in bom_gross:'

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,title_normalized


'Duplicates in tn_budgets:'

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,title_normalized
273,74,1998-05-19,Godzilla,125000000.0,136314294.0,376000000.0,godzilla
408,9,2018-11-21,Robin Hood,99000000.0,30824628.0,84747441.0,robin hood
484,85,2005-07-08,Fantastic Four,87500000.0,154696080.0,333132750.0,fantastic four
543,44,1999-05-07,The Mummy,80000000.0,155385488.0,416385488.0,the mummy
707,8,1997-06-13,Hercules,70000000.0,99112101.0,250700000.0,hercules
...,...,...,...,...,...,...,...
5668,69,1942-11-16,Cat People,134000.0,4000000.0,8000000.0,cat people
5676,77,1968-10-01,Night of the Living Dead,114000.0,12087064.0,30087064.0,night of the living dead
5677,78,1915-02-08,The Birth of a Nation,110000.0,10000000.0,11000000.0,the birth of a nation
5699,100,1972-08-30,The Last House on the Left,87000.0,3100000.0,3100000.0,the last house on the left


'Duplicates in tmdb_movies:'

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,title_normalized
781,"[18, 28, 53]",51462,en,Brotherhood,2.235,2010-01-03,Brotherhood,6.2,31,brotherhood
1037,"[35, 18, 10749]",44369,tl,Boy,1.504,2009-06-01,Boy,7.5,2,boy
1230,[],371702,en,All That Glitters,1.241,2010-09-25,All That Glitters,10.0,1,all that glitters
1354,[18],155711,en,After-Life,0.994,2010-01-01,After-Life,5.5,20,afterlife
1501,"[28, 18]",36410,en,Zero,0.840,2010-02-06,Zero,5.8,6,zero
...,...,...,...,...,...,...,...,...,...,...
26495,[],556601,en,Recursion,0.600,2018-08-28,Recursion,2.0,1,recursion
26504,"[27, 35, 27]",534282,en,Head,0.600,2015-03-28,Head,1.0,1,head
26506,[],561861,en,Eden,0.600,2018-11-25,Eden,0.0,1,eden
26510,[99],495045,en,Fail State,0.600,2018-10-19,Fail State,0.0,1,fail state


In [30]:
#Remove duplicates:
bom_gross.drop_duplicates(subset='title_normalized', inplace=True)
tn_budgets.drop_duplicates(subset='title_normalized', inplace=True)
tmdb_movies.drop_duplicates(subset='title_normalized', inplace=True)

### Joins

In [31]:
#Join bom_gross and tmdb_movies
bom_tmdb_merged = pd.merge(bom_gross, tmdb_movies, on='title_normalized', how='inner', suffixes=('_bom', '_tmdb'))
print(f"bom_tmdb_merged shape: {bom_tmdb_merged.shape}")


bom_tmdb_merged shape: (1574, 15)


In [32]:
bom_tmdb_merged.isna().sum()

title_bom             0
studio                0
domestic_gross       17
foreign_gross         0
year                  0
title_normalized      0
genre_ids             0
id                    0
original_language     0
original_title        0
popularity            0
release_date          0
title_tmdb            0
vote_average          0
vote_count            0
dtype: int64

In [33]:
#Inspect Missing domestic_gross Data
missing_domestic = bom_tmdb_merged[bom_tmdb_merged['domestic_gross'].isna()]
display(missing_domestic[['title_bom', 'domestic_gross', 'title_tmdb']])


Unnamed: 0,title_bom,domestic_gross,title_tmdb
176,It's a Wonderful Afterlife,,It's a Wonderful Afterlife
210,Celine: Through the Eyes of the World,,Celine: Through the Eyes of the World
386,Force,,Force
446,Empire of Silver,,Empire of Silver
602,The Tall Man,,The Tall Man
627,Dark Tide,,Dark Tide
633,The Green Wave,,The Green Wave
758,22 Bullets,,22 Bullets
786,Matru Ki Bijlee Ka Mandola,,Matru Ki Bijlee Ka Mandola
797,The Snitch Cartel,,The Snitch Cartel


In [34]:
#Drop  Missing domestic_gross
bom_tmdb_merged = bom_tmdb_merged[bom_tmdb_merged['domestic_gross'].notna()]
print(f"Dataset after dropping missing values: {bom_tmdb_merged.shape}")


Dataset after dropping missing values: (1557, 15)


In [35]:
bom_tmdb_merged.head()

Unnamed: 0,title_bom,studio,domestic_gross,foreign_gross,year,title_normalized,genre_ids,id,original_language,original_title,popularity,release_date,title_tmdb,vote_average,vote_count
0,Toy Story 3,BV,415000000.0,652000000.0,2010,toy story 3,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,Toy Story 3,7.7,8340
1,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000.0,2010,harry potter and the deathly hallows part 1,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
2,Inception,WB,292600000.0,535700000.0,2010,inception,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186
3,Shrek Forever After,P/DW,238700000.0,513900000.0,2010,shrek forever after,"[35, 12, 14, 16, 10751]",10192,en,Shrek Forever After,15.041,2010-05-16,Shrek Forever After,6.1,3843
4,The Twilight Saga: Eclipse,Sum.,300500000.0,398000000.0,2010,the twilight saga eclipse,"[12, 14, 18, 10749]",24021,en,The Twilight Saga: Eclipse,20.34,2010-06-23,The Twilight Saga: Eclipse,6.0,4909


In [36]:
#Join tn_budgets and tmdb_movies
budgets_tmdb_merged = pd.merge(tn_budgets, tmdb_movies, on='title_normalized', how='inner', suffixes=('_budget', '_tmdb'))
print(f"budgets_tmdb_merged shape: {budgets_tmdb_merged.shape}")


budgets_tmdb_merged shape: (2030, 16)


In [37]:
budgets_tmdb_merged.isna().sum()

id_budget              0
release_date_budget    0
movie                  0
production_budget      0
domestic_gross         0
worldwide_gross        0
title_normalized       0
genre_ids              0
id_tmdb                0
original_language      0
original_title         0
popularity             0
release_date_tmdb      0
title                  0
vote_average           0
vote_count             0
dtype: int64

In [38]:
budgets_tmdb_merged.isna().sum()

id_budget              0
release_date_budget    0
movie                  0
production_budget      0
domestic_gross         0
worldwide_gross        0
title_normalized       0
genre_ids              0
id_tmdb                0
original_language      0
original_title         0
popularity             0
release_date_tmdb      0
title                  0
vote_average           0
vote_count             0
dtype: int64

In [39]:
#Combine all three datasets
final_merged = pd.merge(bom_tmdb_merged, budgets_tmdb_merged, on='title_normalized', how='inner')
print(f"final_merged shape: {final_merged.shape}")


final_merged shape: (1104, 30)


In [40]:
print(final_merged.columns)


Index(['title_bom', 'studio', 'domestic_gross_x', 'foreign_gross', 'year',
       'title_normalized', 'genre_ids_x', 'id', 'original_language_x',
       'original_title_x', 'popularity_x', 'release_date', 'title_tmdb',
       'vote_average_x', 'vote_count_x', 'id_budget', 'release_date_budget',
       'movie', 'production_budget', 'domestic_gross_y', 'worldwide_gross',
       'genre_ids_y', 'id_tmdb', 'original_language_y', 'original_title_y',
       'popularity_y', 'release_date_tmdb', 'title', 'vote_average_y',
       'vote_count_y'],
      dtype='object')


In [41]:
# Identify duplicates and keep unique columns
final_merged = final_merged.loc[:, ~final_merged.columns.duplicated()]

In [42]:
print(final_merged.head())


                    title_bom studio  domestic_gross_x  foreign_gross  year  \
0                 Toy Story 3     BV       415000000.0    652000000.0  2010   
1                   Inception     WB       292600000.0    535700000.0  2010   
2         Shrek Forever After   P/DW       238700000.0    513900000.0  2010   
3  The Twilight Saga: Eclipse   Sum.       300500000.0    398000000.0  2010   
4                  Iron Man 2   Par.       312400000.0    311500000.0  2010   

            title_normalized              genre_ids_x     id  \
0                toy story 3          [16, 10751, 35]  10193   
1                  inception            [28, 878, 12]  27205   
2        shrek forever after  [35, 12, 14, 16, 10751]  10192   
3  the twilight saga eclipse      [12, 14, 18, 10749]  24021   
4                 iron man 2            [12, 28, 878]  10138   

  original_language_x            original_title_x  ...  worldwide_gross  \
0                  en                 Toy Story 3  ...     1.0688

In [48]:
# Dropping duplicate columns
columns_to_drop = ['domestic_gross_y','worldwide_gross','title', 'genre_ids_y', 'original_language_y', 'original_title_y','title_bom','genre_ids_x', 
                   'popularity_y', 'release_date_tmdb', 'vote_average_y', 'vote_count_y','id_tmdb','id_budget','title_tmdb','id','movie']
cleaned_df = final_merged.drop(columns=columns_to_drop)

# Renaming columns to have cleaner names if necessary
cleaned_df.rename(columns={
    'domestic_gross_x': 'domestic_gross',
    'original_language_x': 'original_language',
    'original_title_x': 'original_title',
    'popularity_x': 'popularity',
    'release_date': 'release_date_final',
    'vote_average_x': 'vote_average',
    'vote_count_x': 'vote_count',
    'title_normalized':'movie'

}, inplace=True)

# Display cleaned DataFrame
display(cleaned_df.head())


Unnamed: 0,studio,domestic_gross,foreign_gross,year,movie,original_language,original_title,popularity,release_date_final,vote_average,vote_count,release_date_budget,production_budget
0,BV,415000000.0,652000000.0,2010,toy story 3,en,Toy Story 3,24.445,2010-06-17,7.7,8340,2010-06-18,200000000.0
1,WB,292600000.0,535700000.0,2010,inception,en,Inception,27.92,2010-07-16,8.3,22186,2010-07-16,160000000.0
2,P/DW,238700000.0,513900000.0,2010,shrek forever after,en,Shrek Forever After,15.041,2010-05-16,6.1,3843,2010-05-21,165000000.0
3,Sum.,300500000.0,398000000.0,2010,the twilight saga eclipse,en,The Twilight Saga: Eclipse,20.34,2010-06-23,6.0,4909,2010-06-30,68000000.0
4,Par.,312400000.0,311500000.0,2010,iron man 2,en,Iron Man 2,28.515,2010-05-07,6.8,12368,2010-05-07,170000000.0


In [None]:
# Save the cleaned DataFrame to a CSV file
cleaned_df.to_csv('cleaned_data.csv', index=False)

print("The cleaned DataFrame has been saved as 'cleaned_data.csv'.")


In [50]:
# Recalculate worldwide gross
cleaned_df['worldwide_gross'] = cleaned_df['domestic_gross'] + cleaned_df['foreign_gross']

# Add a profit margin column
cleaned_df['profit_margin'] = ((cleaned_df['worldwide_gross'] - cleaned_df['production_budget']) 
                               / cleaned_df['production_budget']) * 100

# Display the updated DataFrame
display(cleaned_df.head())


Unnamed: 0,studio,domestic_gross,foreign_gross,year,movie,original_language,original_title,popularity,release_date_final,vote_average,vote_count,release_date_budget,production_budget,worldwide_gross,profit_margin
0,BV,415000000.0,652000000.0,2010,toy story 3,en,Toy Story 3,24.445,2010-06-17,7.7,8340,2010-06-18,200000000.0,1067000000.0,433.5
1,WB,292600000.0,535700000.0,2010,inception,en,Inception,27.92,2010-07-16,8.3,22186,2010-07-16,160000000.0,828300000.0,417.6875
2,P/DW,238700000.0,513900000.0,2010,shrek forever after,en,Shrek Forever After,15.041,2010-05-16,6.1,3843,2010-05-21,165000000.0,752600000.0,356.121212
3,Sum.,300500000.0,398000000.0,2010,the twilight saga eclipse,en,The Twilight Saga: Eclipse,20.34,2010-06-23,6.0,4909,2010-06-30,68000000.0,698500000.0,927.205882
4,Par.,312400000.0,311500000.0,2010,iron man 2,en,Iron Man 2,28.515,2010-05-07,6.8,12368,2010-05-07,170000000.0,623900000.0,267.0


In [None]:
# Save the cleaned DataFrame to a CSV file
cleaned_df.to_csv('cleaned_movie_data.csv', index=False)

print("The cleaned DataFrame has been saved as 'cleaned_data.csv'.")
