# Project 1 - Group 6
###### <font color='gray'>Authors: Aisha Baitemoriva-Othman, Adonis McQueen, Angela Kim, Deja Prade, James Kang</font>

In [None]:
## Import libraries ##
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
## Import datasets ##
# Every source file is read once here, straight from the gzip-compressed files in data/.

# Aisha's datasets: Rotten Tomatoes (tab-separated; the reviews file is read as latin1)
df_rt_movie_info = pd.read_table('data/rt.movie_info.tsv.gz')
df_rt_reviews = pd.read_table('data/rt.reviews.tsv.gz', encoding='latin1')

# Adonis's dataset: Box Office Mojo
df_gross = pd.read_csv("data/bom.movie_gross.csv.gz")

# Angela's dataset: The Numbers
mb = pd.read_csv('data/tn.movie_budgets.csv.gz')

# Deja's dataset: TMDB
movie_type = pd.read_csv('data/tmdb.movies.csv.gz')

# James's datasets: IMDB
title_ratings = pd.read_csv('data/imdb.title.ratings.csv.gz')
title_basics = pd.read_csv('data/imdb.title.basics.csv.gz')
title_akas = pd.read_csv('data/imdb.title.akas.csv.gz')

## Questions
<ul>
<li>Which movies have the highest gross values?</li>
<li>What are the domestic and worldwide profits?</li>
<li>What is the trend of movie genres over time?</li>
<li>How has average movie runtime changed over time?</li>
</ul>

----

## Angela's Data Analysis:

In [None]:
## Preliminary analysis
# Preview the first 10 rows of the budgets table (`mb`) to see its columns and raw values.
mb.head(10)

In [None]:
## Check datatypes and look for any missing values
# info() prints each column's dtype and non-null count.
mb.info()

**Observations and Insights:**
<ul>
<li>There are 5782 total entries with 0 missing values.
<li>The release date column is the object datatype and should be changed to the datetime datatype.
<li>The production budget, domestic gross, and worldwide gross columns are also object datatypes and should be changed to integer.
</ul>

In [None]:
## Change release_date to datetime
mb['release_date'] = pd.to_datetime(mb['release_date'])

## Change production_budget, domestic_gross, and worldwide_gross to numeric
# One loop replaces three copy-pasted stanzas.
# regex=False makes ',' and '$' literal characters: read as a regular expression,
# '$' is the end-of-string anchor, so stripping it depended on the pandas version's
# default for `regex`.
# astype(str) keeps the cell safe to re-run once the columns are already numeric
# (the .str accessor raises on non-string columns).
for money_col in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    money_text = (
        mb[money_col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
    )
    mb[money_col] = pd.to_numeric(money_text)


## Check that all changes have been correctly made
mb.info()

In [None]:
## Remove rows with release_date before 2010 to focus on analyzing data from the most recent decade
# .copy() detaches the filtered rows from the frame they were selected from, so
# columns added to `mb` afterwards are written to an independent frame and do not
# trigger pandas' SettingWithCopyWarning.
mb = mb[mb['release_date'] >= '2010-01-01'].copy()

mb.head(30)

In [None]:
# Inspect the last 30 rows of the filtered budgets table.
mb.tail(30)

**Observations & Insights:**
<ul>
<li>There are a lot of gross values that are 0. They also tend to correlate with movies with very small production budgets.
<li>Upon investigation, these films are usually independently made and don't apply to a large corporation like Microsoft.
</ul>

In [None]:
## Create two new columns for domestic profit and worldwide profit
## (profit = gross - production budget). No rows are removed here: films with a
## negative profit stay in the table.

# assign() builds a new frame instead of writing into a possibly filtered slice,
# which avoids pandas' SettingWithCopyWarning.
mb = mb.assign(
    domestic_profit=mb['domestic_gross'] - mb['production_budget'],
    worldwide_profit=mb['worldwide_gross'] - mb['production_budget'],
)
mb.head(30)

**Observations & Insights:**
<ul>
<li>There are films that are box office bombs, such as Dark Phoenix, The Lone Ranger, and Pirates of the Caribbean: On Stranger Tides. Their production budgets exceeded their box office revenues.
</ul>

In [None]:
## Rank the movies by domestic profit, highest first
# reset_index() renumbers the rows from 0 and keeps the old labels in an 'index' column.
mb_dp = mb.sort_values('domestic_profit', ascending=False).reset_index()

## Rank the movies by worldwide profit, highest first
mb_wp = mb.sort_values('worldwide_profit', ascending=False).reset_index()

In [None]:
## Bar chart of Top 20 Movies with Highest Domestic Profits

fig, ax = plt.subplots(figsize=(12,7))

# mb_dp is sorted by domestic profit (highest first), so the first 20 rows are the top 20.
top20_domestic = mb_dp.head(20)
x1 = top20_domestic['movie']
# Scale to millions so the plotted numbers match the axis label.
y1 = top20_domestic['domestic_profit'] / 1e6

# Draw on the axes created above (ax=ax) rather than on whichever axes is "current".
sns.barplot(x=y1, y=x1, palette='flare_r', ax=ax)
ax.set(xlabel='Domestic Profits (in millions)', ylabel='Movies', title='Top 20 Movies with Highest Domestic Profits');

In [None]:
## Bar chart of Top 20 Movies with Highest Worldwide Profits

fig, ax = plt.subplots(figsize=(12,7))

# mb_wp is sorted by worldwide profit (highest first), so the first 20 rows are the top 20.
top20_worldwide = mb_wp.head(20)
x2 = top20_worldwide['movie']
# Scale to billions so the plotted numbers match the axis label.
y2 = top20_worldwide['worldwide_profit'] / 1e9

# Draw on the axes created above (ax=ax) rather than on whichever axes is "current".
sns.barplot(x=y2, y=x2, palette='flare_r', ax=ax)
ax.set(xlabel='Worldwide Profits (in billions)', ylabel='Movies', title='Top 20 Movies with Highest Worldwide Profits');

**Final Observations & Insights:**
<ul>
    <li>The 10 movies with highest domestic profits from 2010-mid2019 are <b><i>Star Wars Ep. VII: The Force Awakens, Black Panther, Jurassic World, Incredibles 2, The Avengers, Avengers: Infinity War, Beauty and the Beast, Rogue One: A Star Wars Story, The Hunger Games, and Jumanji: Welcome to the Jungle.</i></b>
    <li>The 10 movies with highest worldwide profits from 2010-mid2019 are <b><i>Avengers: Infinity War, Star Wars Ep. VII: The Force Awakens, Jurassic World, Furious 7, The Avengers, Harry Potter and the Deathly Hallows: Part II, Black Panther, Jurassic World: Fallen Kingdom, Frozen, and Beauty and the Beast.</i></b>
    <li>The most frequently occurring genres of the top 20 films are <b><i>Sci-fi, Action, Adventure, Animation, and Superhero.</i></b>
    <li>The majority of these films are part of the <b><i>Marvel</i></b> or <b><i>Star Wars</i></b> franchises.
    <li>There has also been a huge focus on revitalizing or reimagining classics such as <b><i>Jurassic Park, James Bond, and Jumanji.</i></b>
    <li>Many financially successful films are sequels such as <b><i>Star Wars Ep. VII, Avengers: Infinity War, Furious 7, Harry Potter and the Deathly Hallows: Parts I & II, and Incredibles 2.</i></b>
</ul>

----

## Aisha's Data Analysis:

In [None]:
# Preview the first 3 rows of the Rotten Tomatoes movie-info table.
df_rt_movie_info.head(3)

In [None]:
# (rows, columns) of the movie-info table.
df_rt_movie_info.shape

In [None]:
# Keep only the last four characters of theater_date in a new column
# (assumes each date string ends with its four-digit year — TODO confirm)
df_rt_movie_info['new_theater_date'] = df_rt_movie_info['theater_date'].str.slice(start=-4)

In [None]:
# Turn the year strings in 'new_theater_date' into numbers (whole column at once)
df_rt_movie_info['new_theater_date'] = pd.to_numeric(df_rt_movie_info['new_theater_date'])

In [None]:
# Show the first 3 rows again to check the table after the changes above.
df_rt_movie_info.head(3)

**Observations & Insights**
<ul>
<li>There are 494 out of 1560 rows that have a missing studio value.</li>
</ul>

In [None]:
# Count rows with (True) and without (False) a missing studio value.
df_rt_movie_info['studio'].isna().value_counts()

In [None]:
# The five most frequent studios and how many movies each one has
df_rt_movie_info['studio'].value_counts().head(5)

In [None]:
# Preview the first 10 rows of the Rotten Tomatoes reviews table.
df_rt_reviews.head(10)

In [None]:
# Show the reviews whose 'rating' value is missing.
df_rt_reviews[df_rt_reviews['rating'].isna()]

In [None]:
# Split "x/y" ratings into a numerator column ('fn') and a denominator column ('sn').
# str.extract always returns exactly two columns (one per capture group), so the
# assignment cannot fail when a rating has no '/' or more than one. Ratings that are
# not of the form number/number (for example letter grades) become NaN in both columns.
df_rt_reviews[['fn','sn']] = df_rt_reviews['rating'].str.extract(
    r'^\s*(\d*\.?\d+)\s*/\s*(\d*\.?\d+)\s*$'
)

In [None]:
# Preview the first rows of the reviews table.
df_rt_reviews.head()

In [None]:
# Convert the 'fn' and 'sn' columns to numbers, one whole column at a time.
# errors='coerce' turns anything that is not a number (for example a letter grade
# left in 'fn') into NaN instead of raising a ValueError.
df_rt_reviews['fn'] = pd.to_numeric(df_rt_reviews['fn'], errors='coerce')
df_rt_reviews['sn'] = pd.to_numeric(df_rt_reviews['sn'], errors='coerce')

In [None]:
# Normalized rating = numerator / denominator, computed on whole columns.
# Column-wise division is much faster than a row-by-row apply, and a zero
# denominator produces inf/NaN here rather than an exception.
df_rt_reviews['new_rating'] = df_rt_reviews['fn'] / df_rt_reviews['sn']

In [None]:
# Keep only the reviews whose new_rating is at most 1. Rows with a missing
# new_rating fail the comparison and are dropped as well.
rating_in_range = df_rt_reviews['new_rating'].le(1)
df_rt_reviews = df_rt_reviews[rating_in_range]

In [None]:
# Preview the first rows of the filtered reviews table.
df_rt_reviews.head()

In [None]:
# Build a two-column table (review id, normalized rating) from 'df_rt_reviews',
# ready to be merged with the movie-info data later. The columns are labelled 0 and 1
# at this point.
review_ids = df_rt_reviews['id']
review_scores = df_rt_reviews['new_rating']
new_rt_reviews = pd.DataFrame(list(zip(review_ids, review_scores)))

In [None]:
# Give the positional columns (0, 1) readable names
new_rt_reviews = new_rt_reviews.rename(columns={0: 'id', 1: 'rating'})

In [None]:
# Preview the first rows of the condensed reviews table.
new_rt_reviews.head()

In [None]:
# Preview the first rows of the movie-info table.
df_rt_movie_info.head()

In [None]:
# Build a three-column table (id, genre, runtime) from the movie-info data.
# The columns are labelled 0, 1 and 2 at this point.
movie_rows = zip(
    df_rt_movie_info['id'],
    df_rt_movie_info['genre'],
    df_rt_movie_info['runtime'],
)
new_rt_movie_info = pd.DataFrame(list(movie_rows))

In [None]:
# Preview the first rows of the condensed movie-info table.
new_rt_movie_info.head()

In [None]:
# Give the positional columns (0, 1, 2) readable names
new_rt_movie_info = new_rt_movie_info.rename(columns={0: 'id', 1: 'genres', 2: 'runtime'})

In [None]:
# Preview the first rows again to check the column names.
new_rt_movie_info.head()

In [None]:
# Build `merged_datasets`, the combined movie-info + reviews table used by the rest
# of this section. Without this step the name is undefined on a fresh kernel.
# NOTE(review): the inner join on 'id' and the '|' genre delimiter are reconstructed
# from how later cells use this frame (one genre name per row) — confirm against the
# intended analysis. If a genre value contains no '|', the split/explode leaves it as is.
merged_datasets = new_rt_movie_info.merge(new_rt_reviews, on='id', how='inner')
merged_datasets['genres'] = merged_datasets['genres'].str.split('|')
merged_datasets = merged_datasets.explode('genres').reset_index(drop=True)

# Convert the 'runtime' column into floats. The leading number is extracted first so
# that values carrying a unit suffix (e.g. "104 minutes") convert too; values
# without any number become NaN.
runtime_number = merged_datasets['runtime'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
merged_datasets['runtime'] = pd.to_numeric(runtime_number, errors='coerce')

In [None]:
# Check column dtypes and non-null counts of the merged table.
merged_datasets.info()

In [None]:
# Value count of movies by genre
# (counts rows of merged_datasets per distinct 'genres' value, most frequent first)
merged_datasets['genres'].value_counts()

**Observations & Insights:**
<ul>
<li>The least frequent genre categories have too few rows compared to the other movie genres.</li>
<li>We think it is best to remove them from our dataset.</li>
</ul>

In [None]:
# Genres to exclude from the per-genre analysis
not_significant_genres_list = [
    'Anime and Manga',
    'Gay and Lesbian',
    'Cult Movies',
    'Faith and Spirituality',
    'Television',
    'Sports and Fitness',
    'Documentary',
    'Western',
    'Special Interest',
]

In [None]:
# Build a new dataset without the excluded genres
is_excluded_genre = merged_datasets['genres'].isin(not_significant_genres_list)
new_merged_datasets = merged_datasets[~is_excluded_genre]

In [None]:
# Mean runtime for each genre, as a two-column table (genres, runtime)
average_runtime_per_genre = (
    new_merged_datasets
    .groupby('genres')['runtime']
    .mean()
    .reset_index()
)
average_runtime_per_genre

In [None]:
# Mean rating for each genre, as a two-column table (genres, rating)
average_rating_per_genre = (
    new_merged_datasets
    .groupby('genres')['rating']
    .mean()
    .reset_index()
)
average_rating_per_genre

In [None]:
# Bar chart of the average rating for each genre
fig, ax11 = plt.subplots(figsize = (20,8))
x11 = average_rating_per_genre['genres']
# The per-genre mean is stored under the column name 'rating'.
y11 = average_rating_per_genre['rating']
# Plot this chart's own series (x11, y11), not variables left over from other cells.
ax11.bar(x11, y11)
ax11.set_title('average rating per genre')
ax11.set_xlabel('genres')
ax11.set_ylabel('average rating')
# Trailing ';' hides the text representation of the tick objects.
plt.xticks(rotation = 45);

**Observations & Insights:**
The 4 genres that have the highest average ratings:
<ul>
        <li> Romance </li>
        <li> Art House and International </li>
        <li> Drama </li>
        <li> Mystery and Suspense </li>
</ul>

In [None]:
# The four genres with the highest average rating
four_genres_list = [
    'Romance',
    'Art House and International',
    'Drama',
    'Mystery and Suspense',
]

In [None]:
# Keep only the rows that belong to one of the four highest-rated genres
in_top_genres = merged_datasets['genres'].isin(four_genres_list)
highest_rating_genres_dataset = merged_datasets[in_top_genres]

In [None]:
# Number of rows for each of the four selected genres.
highest_rating_genres_dataset['genres'].value_counts()

In [None]:
# The mean of runtime for the four highest rated genres
# (a single number across all rows of the four genres combined)
highest_rating_genres_dataset['runtime'].mean()

In [None]:
# Standard deviation from the mean for the four highest rated genres
# (a single number across all rows of the four genres combined)
highest_rating_genres_dataset['runtime'].std()

In [None]:
# Mean runtime for each of the four highest-rated genres (columns: genres, runtime)
avg_runtime_high_rating_genres = (
    highest_rating_genres_dataset
    .groupby('genres')['runtime']
    .mean()
    .reset_index()
)

In [None]:
# Label the averaged column 'mean_runtime' instead of 'runtime'
avg_runtime_high_rating_genres = avg_runtime_high_rating_genres.rename(columns={'runtime': 'mean_runtime'})

In [None]:
# Display the per-genre mean runtime table.
avg_runtime_high_rating_genres

In [None]:
# Bar chart of the average runtime for each of the four highest-rated genres
fig, ax22 = plt.subplots(figsize = (12,6))
x22 = avg_runtime_high_rating_genres['genres']
y22 = avg_runtime_high_rating_genres['mean_runtime']
# Plot this chart's own series (x22, y22), not variables left over from other cells.
ax22.bar(x22, y22)
ax22.set_title('average runtime per genre')
ax22.set_xlabel('genres')
# The bars show runtimes, so the y-axis is labelled accordingly.
ax22.set_ylabel('average runtime')
plt.xticks(rotation = 45);

----

## Adonis's Data Analysis

In [None]:
# Preview the first rows of the Box Office Mojo gross table.
df_gross.head()

In [None]:
# Count missing (True) vs present (False) values in domestic_gross
df_gross['domestic_gross'].isna().value_counts()

In [None]:
# Count missing (True) vs present (False) values in foreign_gross
df_gross['foreign_gross'].isna().value_counts()

In [None]:
# Check column dtypes and non-null counts.
df_gross.info()

In [None]:
# Remove commas from values & convert to numeric
# astype(str) lets the cell be re-run after the column is already numeric (the .str
# accessor only works on strings); missing values become the text 'nan', which
# to_numeric reads back as NaN. regex=False treats ',' as a plain character.
foreign_gross_text = df_gross['foreign_gross'].astype(str).str.replace(',', '', regex=False)
df_gross['foreign_gross'] = pd.to_numeric(foreign_gross_text)
df_gross['foreign_gross'].head()

In [None]:
# Drop every row that is missing the studio or either gross figure, then report
# the remaining (rows, columns)
required_columns = ['studio', 'domestic_gross', 'foreign_gross']
df_gross.dropna(subset=required_columns, inplace=True)
df_gross.shape

In [None]:
# Verification of null value deletion
# (the non-null counts printed by info() should now be equal for the three columns)
df_gross.info()

**Observations & Insights:**
<ul>
<li>Do the studios that produce the most movies have the highest gross values?</li></ul>

In [None]:
# The 10 studios with the most movies (value_counts is sorted, most frequent first)
df_gross['studio'].value_counts().head(10)

In [None]:
# Names of the 10 studios with the most movies, as a plain list
x = df_gross['studio'].value_counts().head(10).index.tolist()
x

In [None]:
# Total domestic gross per studio, largest first
dfm2 = (
    df_gross
    .groupby('studio')['domestic_gross']
    .sum()
    .reset_index()
    .sort_values(by='domestic_gross', ascending=False)
)
# Restrict the totals to the studios listed in `x` (the 10 with the most movies)
d = dfm2[dfm2['studio'].isin(x)]

**Observations & Insights:**
<ul>
<li>Data analysis will focus on domestic and foreign gross for titles and studios.</li></ul>

In [None]:
# Total gross per film = domestic gross + foreign gross
df_gross['total_gross'] = df_gross['domestic_gross'].add(df_gross['foreign_gross'])
df_gross.head()

In [None]:
# Films ordered by domestic gross, highest first
f = df_gross.sort_values('domestic_gross', ascending=False)

f.head()

**Observations & Insights:**
<ul>
<li>Some foreign gross values seem wrong for blockbuster films. Are there more?</li></ul>

In [None]:
# Films ordered by foreign gross, highest first; show the top 20
g = df_gross.sort_values('foreign_gross', ascending=False)
g.head(20)

In [None]:
# Check for more misreported foreign gross
# (`g` is sorted by foreign gross, highest first, so tail() shows the 20 lowest values)
g.tail(20)

**Observations & Insights:**
<ul>
<li>Foreign gross values for some blockbusters are obviously misreported.</li>
<li>Analysis will focus on domestic markets, as foreign gross values cannot be verified/trusted.</li></ul>

In [None]:
# Three side-by-side horizontal bar charts of domestic gross
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(20,8))

# Panel 1: domestic gross per title. `f` is sorted by domestic gross, highest
# first, so its first 10 rows are the 10 highest-grossing titles.
x1 = f['title'][:10]
y1 = f['domestic_gross'][:10]
ax1.barh(x1, y1)
ax1.set_xlabel("Domestic Gross (hundreds of millions)")
ax1.set_ylabel("Movie Title")
ax1.set_title("Domestic Gross per Movie")


# Panel 2: total domestic gross per studio, 10 largest totals
dfm3 = df_gross.groupby(['studio']).domestic_gross.sum().reset_index().sort_values(by='domestic_gross', ascending=False)


x2 = dfm3.studio[:10]
y2 = dfm3.domestic_gross[:10]
ax2.barh(x2, y2)
ax2.set_xlabel('Domestic Gross (tens of billions)')
ax2.set_ylabel('Studio')
ax2.set_title('Top 10 Highest Domestic Grossing Studios ')

# Panel 3: domestic gross for the studios with the most films. `d` was built in an
# earlier cell from the studio list `x`, so no studio names are hard-coded here.
x3 = d['studio']
y3 = d['domestic_gross']
ax3.barh(x3, y3)
ax3.set_xlabel("Domestic Gross (tens of billions)")
ax3.set_ylabel("Studio")
ax3.set_title('Total Domestic Gross for Studio with Most Films')

# barh draws the first row at the bottom; flip each y-axis so the largest bar
# (the first row of each sorted series) appears at the top.
for panel in (ax1, ax2, ax3):
    panel.invert_yaxis()

fig.tight_layout()

**Final Observations & Insights**
<ul>
    <li> From the data we can see that outside of the largest studios (BV, Uni, WB, Fox, Sony), the number of films doesn't correlate with highest domestic gross. As such, we recommend that Microsoft focuses on quality content in Drama, Action and Comedy.
    <li> The highest grossing films are all franchises (Marvel, Jurassic Park, Star Wars, etc). It is highly recommended that any action films made be based upon Microsoft franchises to avoid licensure/copyright fees(this caused issues for Microsoft in the past with potential Halo films).
    <li> Smaller studios such as Weinstein, SPC and IFC could potentially be acquired to accelerate content production.
</ul>

----

## Deja's Data Analysis

In [None]:
# Display dataframe
# Preview the first rows instead of rendering the whole table.
display(movie_type.head())
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would additionally render "None".
movie_type.info()

**Observations & Insights:**
<ul>
<li>There are 26517 total entries with 0 missing values.</li>
<li>The vote count needs to be reduced to a more relevant number, discarding outliers such as movies with a single vote.</li>
<li>The genre ids need to be changed to actual genres.</li>
<li>Reduce to movies in English.</li>
<li>Reduce the number of duplicate titles.</li></ul>

In [None]:
# List the movies in descending order of release_date
movie_type.sort_values('release_date', ascending=False)

In [None]:
# Keep only the rows whose original_language is "en"
is_english = movie_type['original_language'].eq("en")
movie_type = movie_type.loc[is_english]
movie_type

In [None]:
# Condense data to get most voted on movies (vote_count of at least 10000)
# .copy() makes the result an independent frame, so later cells that modify
# `movie_type_data` do not write into a slice of `movie_type`
# (which triggers pandas' SettingWithCopyWarning).
movie_type_data = movie_type.loc[movie_type['vote_count'] >= 10000].copy()
movie_type_data

In [None]:
# Look at the first genre_ids value to see how the ids are stored.
movie_type_data["genre_ids"].iloc[0]

In [None]:
# Separate genre ids with multiple values
# Each genre_ids string is parsed into the Python object it spells out.
# ast.literal_eval accepts only Python literals, unlike eval(), which would execute
# any code found in the data file. Values that are no longer strings are left as
# they are, so the cell can be re-run safely. assign() builds a new frame instead of
# setting an attribute on a possibly filtered slice.
movie_type_data = movie_type_data.assign(
    genre_ids=movie_type_data['genre_ids'].map(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )
)

In [None]:
# Give every genre id its own row: a movie with several ids is repeated once per id
movie_type_data = movie_type_data.explode(column="genre_ids")

In [None]:
# Keep only the rows whose genre id is at most 878.
# NOTE(review): the stated goal is to drop genres with few votes, but the condition
# tests the id's numeric value — confirm this cut-off is intended.
has_low_genre_id = movie_type_data['genre_ids'] <= 878
movie_type_data = movie_type_data.loc[has_low_genre_id]
movie_type_data

In [None]:
# Replace the numeric genre ids with readable genre names.
genre_names = {12: "adventure", 14: "fantasy", 16: "animated", 18: "drama", 27: "horror", 28: "action", 35: "comedy", 36: "history", 37: "western", 53: "thriller", 80: "crime", 878: "sci fi"}
# The result is stored back explicitly: an in-place replace() on a column selection
# operates on a temporary object and is not guaranteed to change the frame itself.
movie_type_data = movie_type_data.assign(
    genre_ids=movie_type_data["genre_ids"].replace(genre_names)
)

In [None]:
#Create a violin plot to display the relationship between vote averages and genres
sns.set_theme(style="ticks", color_codes=True)
# Use one explicit category order for both layers. Without it the violins follow
# the category order while the swarm points follow the order in which genres first
# appear in the data, so points could be drawn over a different genre's violin.
genre_order = list(movie_type_data["genre_ids"].astype("category").cat.categories)
g = sns.catplot(x="genre_ids", y="vote_average", kind="violin", inner=None, order=genre_order, data=movie_type_data)
# Overlay the individual movies on the same axes as the violins.
sns.swarmplot(x="genre_ids", y="vote_average", color="k", size=3, order=genre_order, data=movie_type_data, ax=g.ax)
g.set_xticklabels(rotation=45);

## Conclusion
I believe resources should be focused on action, adventure, fantasy, sci-fi, comedy, and animated movies. While animated movies have the highest rating, the other five categories have higher volumes of consumer interaction.