In [None]:
# Import The Relevant Libraries 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Phase 1 Project Description
## Project Overview
For this project, you will use exploratory data analysis to generate insights for a business stakeholder.

## Business Problem
Microsoft sees all the big companies creating original video content and they want to get in on the fun. They have decided to create a new movie studio, but they don’t know anything about creating movies. You are charged with exploring what types of films are currently doing the best at the box office. You must then translate those findings into actionable insights that the head of Microsoft's new movie studio can use to help decide what type of films to create.

## The Data
- `data/bom.movie_gross.csv`
- `data/rt.movie_info.tsv`
- `data/rt.reviews.tsv`
- `data/tmdb.movies.csv`
- `data/tn.movie_budgets.csv`

I opted to use (data/bom.movie_gross.csv) and (data/tn.movie_budgets.csv).
Because it was collected from various locations, the different files have different formats. Some are compressed CSV (comma-separated values) or TSV (tab-separated values) files that can be opened using spreadsheet software or pd.read_csv, while the data from IMDB is located in a SQLite database.

In [2]:
# Read data/bom.movie_gross.csv into a DataFrame.
bom_movie_gross_df = pd.read_csv('data/bom.movie_gross.csv')

# Preview the leading rows to check which columns exist and how their
# values are formatted before any cleaning is done.
bom_movie_gross_df.head()


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [3]:
# Read data/tn.movie_budgets.csv into a DataFrame.
tn_movie_budgets_df = pd.read_csv('data/tn.movie_budgets.csv')

# Preview the leading rows to check which columns exist and how their
# values are formatted before any cleaning is done.
tn_movie_budgets_df.head()


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


# Step 2: Clean the Data

In [None]:
# Normalise the column labels of both frames so they follow one convention:
# all lower-case, with spaces replaced by underscores.
bom_movie_gross_df.columns = bom_movie_gross_df.columns.map(
    lambda label: label.lower().replace(' ', '_')
)
tn_movie_budgets_df.columns = tn_movie_budgets_df.columns.map(
    lambda label: label.lower().replace(' ', '_')
)

In [None]:

# Convert the two gross columns of the bom frame to numeric dtype.
# Values that still cannot be parsed become NaN (errors='coerce').
bom_movie_gross_df['domestic_gross'] = pd.to_numeric(
    bom_movie_gross_df['domestic_gross'], errors='coerce'
)

# Strip thousands separators before converting foreign_gross. With
# errors='coerce', a comma-formatted string (e.g. "1,234.5") would otherwise be
# turned into NaN without any warning and the figure would be lost.
# NOTE(review): foreign_gross appears to be read as text (the preview shows it
# without a decimal point, unlike domestic_gross) - confirm which raw formats
# occur, and their units, before relying on the converted values.
# The regex replace only touches string entries, so re-running this cell on
# already-numeric data is a no-op.
bom_movie_gross_df['foreign_gross'] = pd.to_numeric(
    bom_movie_gross_df['foreign_gross'].replace(',', '', regex=True),
    errors='coerce',
)


In [None]:
# Convert the currency-formatted columns of the tn frame (strings such as
# "$425,000,000") to floats by removing the dollar sign and comma separators.
currency_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']

# The pattern is a raw string: in a normal string literal "\$" is an invalid
# escape sequence, which Python reports as a DeprecationWarning/SyntaxWarning.
# The regex replace only touches string entries, so the cell stays re-runnable
# after the columns have already been converted.
tn_movie_budgets_df[currency_columns] = (
    tn_movie_budgets_df[currency_columns]
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)
