# Imports

In [None]:
#basic libraries for linear algebra and data procesing
import numpy as np
import pandas as pd

#visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

#wordcloud
from wordcloud import WordCloud, STOPWORDS
from collections import Counter

#model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict

#time and warnings
import time
import warnings

#settings
warnings.filterwarnings("ignore")
%matplotlib inline

# Data

In [None]:
#loading data into a pandas dataframe
movies = pd.read_csv('../input/movie-data/movie_data.csv')

In [None]:
#inspecting the first five rows
movies.head()

# Basic Data Exploration and Preparation

- **Number of entries** : There are a total of 5043 entries. 
- **Number of features and their types**: There are a total of 26 features with types: float64(12), int64(3), object(11)
- **Scale**: Scales differ greatly, namely *gross* and *budget* in comparison to other features.
- **Missing values:** total of 2685 missing values, highest % of missing values are the target value '*gross*' 17.52%, followed by '*budget*' 9.75%.
- **Duplicate rows**: There are total of 247 duplicated rows.
- **Categorical features**: color, director_name, actor_1_name, actor_2_name, actor_3_name, genres, movie_title, plot_keywords, language, country, content_rating
- **Numerical features**: num_critic_reviews, duration, director_facebook_likes, actor_1_facebook_likes, actor_2_facebook_likes, actor_3_facebook_likes, gross, num_users_voted, cast_total_facebook_likes, num_user_reviews, budget, title_year, movie_score, aspect_ratio, movie_facebook_likes

In [None]:
#fetching basic information on the dataset
movies.info()

**Missing values**

- There are total of 2685 missing values, highest % of missing values are the target value '*gross*' 17.52%, followed by '*budget*' 9.75%.
- The heatmap visualizes the nullity correlation between the columns in the dataset. A negative correlation would mean that when the value of one variable is present, the value of the other tends to be absent; the heatmap shows no such negative correlations. A few pairs show no correlation, meaning that the presence or absence of one variable's value has no effect on the other. The majority of correlations are positive, which indicates that if the value of one variable is present, the value of the other is likely to be present as well. Positive correlations are highest among director_name, title_year, actor_2_name, actor_3_facebook_likes etc. Budget and gross show a low to medium positive correlation with all other columns.

**Action taken**

As the number of missing values in the target value (gross), as well as in the potentially highly correlated independent variable budget, is high, and missing values are present across all columns, we will remove all rows containing missing values from the dataset. The time needed to fill in the missing data manually exceeds this project's goals.

In [None]:
#exploring the number of missing values per feature in percentage
print('Number of missing values: ', movies.isnull().values.sum())
print('Percent of missing values per feature: ') 
movies.isnull().sum() * 100 / len(movies)

In [None]:
#matrix map of missing values
msno.matrix(movies)

In [None]:
#heatmap of missing values
msno.heatmap(movies)

In [None]:
#removing missing values
movies.dropna(inplace=True)

In [None]:
#resetting the index
movies.reset_index(drop=True, inplace=True)
movies.head()

In [None]:
print('Number of missing values: ', movies.isnull().values.sum())
print('Data lenght: ', len(movies))

**Duplicate values**

There are total of 247 duplicated rows.

**Action taken**

Duplicate rows will be removed on the movie_title column.

In [None]:
#removing duplicate rows based on the 'movie_title' column, keeping the first occurrence of each title
#(keep=False would discard every copy of a duplicated title, removing those movies from the analysis entirely)
movies.drop_duplicates(subset=['movie_title'], keep='first', inplace=True)

In [None]:
print('Data length: ', len(movies))

In [None]:
#resetting the index
movies.reset_index(drop=True, inplace=True)
movies.head()

**Exploring numerical features**

In [None]:
#separating numerical features from the categorical
num_movies = movies.select_dtypes(include=['float64', 'int64'])

In [None]:
%%time

#visualizing numerical features using pairplot
sns.pairplot(num_movies)

**Converting budget and gross to a more interpretable value expressed in millions**

As gross and budget are both expressed in 6 and more digits, for the sake of interpretability and visualization, we will convert the values to million, and create two new features.

In [None]:
#expressing budget and gross in millions of dollars, rounded to two decimals
for raw_column, millions_column in (('budget', 'budget ($) millions'),
                                    ('gross', 'gross ($) millions')):
    movies[millions_column] = movies[raw_column].astype(float).div(1000000).round(2)
movies.head(1)

In [None]:
#converting the number of user votes to millions, rounded to two decimals
movies['num_voted_mill'] = movies['num_users_voted'].astype(float).div(1000000).round(2)
movies.head(1)

**Identifying anomalies**

An obvious outlier in the budget feature was detected, which after further inspection was identified as false.

**Action taken** : The false value was replaced with the appropriate value taken from the IMDB website.

------------------------------------------------------------------------------------------------------------------

Initial exploration identified huge outliers in the budget value, which turned out to be budgets expressed in the currency of the country of origin. All USA movies appear to have the same currency: US dollar. All non-USA English speaking countries (New Zealand, UK, Australia etc.) appear to have their budgets expressed in US dollars.

**Action taken** : removed all foreign-language movies, and kept only the English-language ones.

In [None]:
#zooming in on a potential anomaly that was detected in the previous plot concerning the budget feature
#histplot replaces the deprecated distplot; stat='density' together with kde=True keeps the
#normalised histogram with a density curve that distplot used to draw
f, ax = plt.subplots(figsize=(12,7))
sns.histplot(movies['budget ($) millions'], kde=True, stat='density', ax=ax)

In [None]:
#locating the incorrect value
movies[movies['budget'] == 390000000.0]

In [None]:
#removing the invalid budgetvalue and replacing it with the proper one
movies['budget'] = movies['budget'].replace(390000000.0, 60000000.0)

In [None]:
# updating the budget in millions column
movies['budget ($) millions'] = (movies['budget'].astype(float)/1000000).round(2)

In [None]:
#listing the movies with a budget above 250 (in millions) next to their country and language,
#to check whether the currency of gross and budget matches
budget_over_250 = movies.loc[movies['budget ($) millions'] > 250]
outlier_columns = ['country', 'language', 'movie_title', 'budget ($) millions', 'gross ($) millions']
budget_over_250[outlier_columns].sort_values(by='budget ($) millions', ascending=False)

In [None]:
#keeping only the English-language movies
#.copy() makes the filtered frame independent of the original dataframe, so the columns that are
#added to it later (profit, roi, ...) are not assigned to a slice of another dataframe
movies = movies[movies['language'] == 'English'].copy()

In [None]:
#resetting the index
movies.reset_index(drop=True, inplace=True)
movies.head()

In [None]:
movies['language'].value_counts()

**Exploring categorical features**

In [None]:
movies['genres'].value_counts()

In [None]:
movies['plot_keywords'].value_counts()

In [None]:
movies['country'].value_counts()

In [None]:
movies['color'].value_counts()

In [None]:
movies['language'].value_counts()

In [None]:
movies['content_rating'].value_counts()

# Feature engineering

**Profit**

For the sake of exploration and clarity, two new features will be added to the dataset: profit and roi.

Profit is calculated by subtracting the production cost (in this case budget) from the total amount earned (gross).

In [None]:
movies['profit'] = movies['gross ($) millions'] - movies['budget ($) millions']

**ROI**

ROI is calculated by dividing the net profit by the budget, and multiplying the result by 100. The resulting amount is in %.

In [None]:
movies['roi'] = (movies['profit']/movies['budget ($) millions'] * 100).round(2)

**Profit Margin**

- The profit margin measures what percentage of revenue (in this case gross) a movie keeps after paying for all expenses. The profit margin lets us know how much profit a movie has generated for each dollar of sale. For example, a 40% profit margin means you have a net income of $0.40 for each dollar of sales. It shows the overall ability to turn income into profit.

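- The formula for calculating the profit margin is: Profit Margin = (Profit / Gross) X 100. In this notebook the value is kept as a fraction (Profit / Gross, without multiplying by 100), so a profit margin of 0.6 corresponds to 60%.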

In [None]:
movies['profit_margin'] = (movies['profit'] / movies['gross ($) millions']).round(2)

**VAR (Value Above Replacement)**

- Value Above Replacement (VAR) represents a value of number of times a figure (actor/actress or a director) has exceeded an average amount. In terms of profit, VAR measures the X times an actor/actress or a director appeared in movies with an above average profit.

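- VAR is calculated by taking the average (mean) profit of the movies an actor/actress or a director appeared in, and dividing it by the mean of those per-person average profits across all qualifying actors/actresses or directors. A VAR of 2.0 therefore means that the person's movies make, on average, twice the profit of a typical qualifying person's movies.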

- For the sake of this project, we will consider both actors/actresses and directors, with the condition that an actor/actress has appeared in at least 5 movies, and that a director has directed at least 5 movies.

In [None]:
#VAR for lead actors/actresses (actor_1_name)

#number of movies per lead actor/actress
actor1_counts = movies['actor_1_name'].value_counts()

#names of the actors/actresses credited as lead in 5 or more movies
actor1_list = actor1_counts.loc[actor1_counts >= 5].index.tolist()

#movies whose lead actor/actress is on that list
actors1 = movies.loc[movies['actor_1_name'].isin(actor1_list)]

#mean profit per actor/actress, most profitable first
actor1_total = (
    actors1
    .groupby(['actor_1_name'], as_index=False)['profit']
    .mean()
    .sort_values(by='profit', ascending=False)
)

#VAR = the actor's mean profit relative to the mean of all listed actors' mean profits
actor1_total['VAR'] = actor1_total['profit'] / actor1_total['profit'].mean()

In [None]:
#VAR for directors

#number of movies per director
director_counts = movies['director_name'].value_counts()

#names of the directors who directed 5 or more movies
director_list = director_counts.loc[director_counts >= 5].index.tolist()

#movies directed by someone on that list
director = movies.loc[movies['director_name'].isin(director_list)]

#mean profit per director, most profitable first
director_total = (
    director
    .groupby(['director_name'], as_index=False)['profit']
    .mean()
    .sort_values(by='profit', ascending=False)
)

#VAR = the director's mean profit relative to the mean of all listed directors' mean profits
director_total['VAR'] = director_total['profit'] / director_total['profit'].mean()

**Removing irrelevant features**

- *Color* - there is no evidence this feature has any relevance to the target feature.
- *Aspect Ratio* - there is no evidence this feature has any relevance to the target feature.
- *Actor 3* - I assume additional cast doesn't have any relevance for future recommendations, so I kept the main two actors, and removed the third one.
- *Actor 3 Facebook Likes* - the reasoning for the removal of this feature matches the one described for removing *Actor 3*

In [None]:
#removing irrelevant features
movies.drop(columns=['color', 'aspect_ratio', 'actor_3_name', 'actor_3_facebook_likes'], inplace=True)

In [None]:
#resetting the index
movies.reset_index(drop=True, inplace=True)
movies.head()

# **Exploratory Data Analysis**

Questions:

- What is the correlation between budget and profit? What would be an optimal budget value for making a Box Office success?
- Do actors/actresses and directors play a role in a movie's success?
- How does a movie's rating impact profit?
- Is duration of the movie relevant to profit?
- How do genres play in with profit and profit margin?
- How does the trend of profit, revenue, profit margin, and other attributes change over years?
- How do number of users voted, and number of critical reviews impact profit?
- Is there a pattern in common plot keywords with successful movies?

# ** FUN VISUALIZATIONS**

- 4 plots showing top 10 movies with the highest PROFIT, GROSS, BUDGET, ROI

In [None]:
#ranking tables behind the four bar charts
top_10_profit = movies[['movie_title', 'profit']].sort_values(by = 'profit', ascending = False).head(10)
top_10_gross = movies[['movie_title', 'gross ($) millions']].sort_values(by = 'gross ($) millions', ascending = False).head(10)
top_10_budget = movies[['movie_title', 'budget ($) millions']].sort_values(by = 'budget ($) millions', ascending = False).head(10)
#NOTE: twelve rows are kept here and only the last ten of them are plotted below, so the two
#highest-ROI movies are left out of the chart - presumably because they would dwarf the other bars
top_10_roi = movies[['movie_title', 'roi']].sort_values(by = 'roi', ascending = False).head(12)

#one panel per ranking: profit, gross, budget, roi

plt.rc('ytick', labelsize = 15)
plt.rc('xtick', labelsize = 15)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols = 2, nrows = 2, figsize = (20, 18), constrained_layout = True)

#(axis, data, value column, palette, x-axis label) for each panel
top_10_panels = [
    (ax1, top_10_profit, 'profit', 'Accent', 'Profit in USD Millions'),
    (ax2, top_10_gross, 'gross ($) millions', 'Dark2', 'Gross in USD Millions'),
    (ax3, top_10_budget, 'budget ($) millions', 'tab20', 'Budget in USD Millions'),
    (ax4, top_10_roi.tail(10), 'roi', 'Dark2', 'ROI in %'),
]

for panel_ax, panel_data, value_column, palette_name, x_label in top_10_panels:
    sns.barplot(data = panel_data, x = value_column, y = 'movie_title', palette = palette_name, ax = panel_ax)
    panel_ax.tick_params(axis = 'y', labelrotation = 45)
    panel_ax.set_xlabel(x_label, fontsize = 18)
    #the y label is drawn at font size 1, which makes it practically invisible
    panel_ax.set_ylabel('movie title', fontsize = 1)

# **What is the correlation between budget and profit? Which budget ranges should be considered for making a Box Office success?**

In order to answer this question we will need to examine several key things:

- What is the relationship between profit (profit margin) and budget?

Budget and profit show a clear positively linear relationship, which means that the higher the budget, the higher the profit of a movie.

The trend line in the plot of profit margin against budget shows a negative trend, which suggests that spending too much money on making a movie can potentially decrease the profit margin.

- What is the average and median value of budget for the top 30 movies? and of total?
- What is the average and median value of profit margin for the top 30 movies? and of total?

Both averages and median values differ, so in order to adjust for obvious outliers such as Avatar, Jurassic World, and Titanic, we will consider the median values. 

**Conclusion**: The optimal budget value for making a successful movie should not be less than 40 Million US dollars, and on average we recommend it to be somewhere around 75 Million US dollars. This recommendation was made based on a profit margin greater than 0.6. There is evidence that higher budget movies risk a smaller profit margin, as shown in the plot, therefore we cannot recommend very large budgets to be a certain indicator of a Box Office success.

**What is the relationship between profit (profit margin) and budget?**

- in order to analyse this relationship we need to create a new dataframe containing movies with profit greater than 0

In [None]:
#profitable movies only (profit > 0), ordered from the most to the least profitable
movies_profitable = movies[movies['profit'] > 0]
#reset_index() keeps the old index as an 'index' column and numbers the rows 0..n-1 in profit order
movies_profitable_sorted = (
    movies_profitable
    .sort_values(by = 'profit', ascending = False)
    .reset_index()
)
movies_profitable_sorted.head()

In [None]:
#visualizing the trend between profit vs budget

sns.lmplot(x = 'budget ($) millions', y = 'profit', data = movies_profitable_sorted, height = 10, aspect = 2)
plt.ylabel('Profit in ($) Millions', fontsize = 18)
plt.xlabel('Budget in ($) Millions', fontsize = 18)
plt.title('Relationship between Profit and Budget', fontsize = 20)
# plt.xticks(rotation = 'vertical')
plt.show();

In [None]:
#visualizing the trend between profit margin vs budget

sns.lmplot(x = 'budget ($) millions', y = 'profit_margin', data = movies_profitable_sorted, height = 10, aspect = 2)
plt.ylabel('Profit Margin', fontsize = 18)
plt.xlabel('Budget in ($) Millions', fontsize = 18)
plt.title('Relationship between Profit Margin and Budget', fontsize = 20)
# plt.xticks(rotation = 'vertical')
plt.show();

**What is the average and median value of budget for the top 30 movies? and of total?**

In [None]:
#profit and budget of the 30 most profitable movies, drawn as overlaid horizontal bars
plt.figure(figsize=(15,12))

#.loc slices include the end label, so 0:29 selects exactly 30 rows (0:30 would plot 31 movies);
#movies_profitable_sorted is ordered by profit and indexed 0..n-1
sns.barplot(x = movies_profitable_sorted.loc[0:29, 'profit'],
            y = movies_profitable_sorted.loc[0:29, 'movie_title'],
            color = 'lawngreen',
            label = 'Profit',
            ci = None)

#the budget bars are drawn on the same axes, on top of the profit bars
sns.barplot(x = movies_profitable_sorted.loc[0:29, 'budget ($) millions'],
            y = movies_profitable_sorted.loc[0:29, 'movie_title'],
            color = 'firebrick',
            label = 'Budget',
            ci = None)

plt.xlabel('Profit in ($) Millions', fontsize = 16)
plt.title('Profit and Budget for the Top 30 Profitable Movies', fontsize = 16)
plt.ylabel('Movie Title', fontsize = 16)
plt.xticks(rotation='horizontal', fontsize = 12)
plt.legend(fontsize = 16);

In [None]:
#budget of the 30 most profitable movies (rows 0..29) versus all profitable movies
top30_budget = movies_profitable_sorted.loc[0:29, 'budget ($) millions']
all_profitable_budget = movies_profitable_sorted['budget ($) millions']

print('Average Budget value of the Top 30 successful Movies - ', round(top30_budget.mean(), 2))
print('Median Budget value of the Top 30 successful Movies - ', top30_budget.median())
print('Average Budget value of all movies with Profit greater than 0 - ', round(all_profitable_budget.mean(), 2))
print('Median Budget value of all movies with Profit greater than 0 - ', all_profitable_budget.median())

**What is the average and median value of profit margin for the top 30 movies? and of total?**

In [None]:
#profit margin of the 30 most profitable movies (rows 0..29) versus all profitable movies
top30_margin = movies_profitable_sorted.loc[0:29, 'profit_margin']
all_profitable_margin = movies_profitable_sorted['profit_margin']

print('Average Profit Margin of the Top 30 successful Movies - ', round(top30_margin.mean(), 2))
print('Median Profit Margin of the Top 30 successful Movies - ', round(top30_margin.median(), 2))
print('Average Profit Margin of all movies with Profit greater than 0 - ', round(all_profitable_margin.mean(), 2))
#the overall median is printed unrounded
print('Median Profit Margin of all movies with Profit greater than 0 - ', all_profitable_margin.median())

In [None]:
#movies with a profit margin above 0.6 and a budget above 23 (in millions)
#NOTE(review): 23 is hard-coded - presumably the median budget printed above; confirm

high_margin = movies['profit_margin'].gt(0.6)
above_budget_floor = movies['budget ($) millions'].gt(23)
margin_filter = movies.loc[high_margin & above_budget_floor]
margin_filter[['movie_title','profit_margin','budget ($) millions','profit']].describe()

# **Do actors/actresses and directors play a role in a movie's success?**

We will be working with actor1_total dataframe and director_total dataframe.

- It appears that movies starring Robert Pattinson make almost 9 times the amount of the average movie. 
- Directors play a significant role, movies directed by George Lucas make little over 12 times the amount of the average movie. 
- Average mean VAR of actor/actress -  3.51
- Average mean VAR of directors -  3.36
- Average median VAR of actor/actress -  3.03
- Average median VAR of directors -  2.53

**Conclusion**: With great certainty we can recommend that the studio takes into account the VAR score of an actor/actress, and even more so the VAR score of the person who will direct the movie. 

- For actors/actresses we recommend a minimum VAR value between 1.0 and 3.0
- For directors we recommend a minimum VAR value between 1.0 and 2.53

In [None]:
#bar chart of the VAR value of the 30 actors/actresses with the highest mean profit

actor1_top30 = actor1_total.head(30)

plt.figure(figsize=(15,9))
actor_var_ax = sns.barplot(data = actor1_top30, x = 'VAR', y = 'actor_1_name', palette = 'Set2')
actor_var_ax.set_title('VAR values for top 30 actors/actresses', fontsize = 16)
actor_var_ax.set_xlabel('VAR', fontsize = 16)
actor_var_ax.set_ylabel('Actor / Actress', fontsize = 16);

In [None]:
#bar chart of the VAR value of the 30 directors with the highest mean profit

director_top30 = director_total.head(30)

plt.figure(figsize=(15,9))
director_var_ax = sns.barplot(data = director_top30, x = 'VAR', y = 'director_name', palette = 'Set3')
director_var_ax.set_title('VAR values for top 30 directors', fontsize = 16)
director_var_ax.set_xlabel('VAR', fontsize = 16)
director_var_ax.set_ylabel('Director', fontsize = 16);

In [None]:
#mean and median VAR among the top 30 actors/actresses and the top 30 directors
actor_var = actor1_top30['VAR']
director_var = director_top30['VAR']

print('Average mean VAR of actor/actress - ', round(actor_var.mean(), 2))
print('Average mean VAR of directors - ', round(director_var.mean(), 2))
print('Average median VAR of actor/actress - ', round(actor_var.median(), 2))
print('Average median VAR of directors - ', round(director_var.median(), 2))

# **How does a movie's score impact profit?**

- What is the distribution of ratings?

Approximately normally distributed, slightly skewed to the left (the mean value, 6.42, is below the median value, 6.5).

- Are ratings correlated with profit? 

Correlation with profit is positive but weak: 0.28, and correlation with profit_margin is close to zero: 0.03.

- What is the mean and median value of profit and profit margin for movies with ratings greater than 7?

We filtered out movies with profit greater than 0, and profit margin greater than 0.

- Checking the average IMDB rating of directors with at least 5 movies

**Conclusion:** We recommend taking into account the average movie score (not less than 7.0) of a director, when hiring one. We consider it will have a positive impact on profit. Another recommendation regarding movie scores will be in relation to movie's genre, and will be detailed later in the report.


In [None]:
movies['movie_score'].describe()

In [None]:
#visualizing the distribution of movie's scores

sns.displot(movies['movie_score'], kde = True, height = 6, aspect = 2, color = 'darkslateblue')
plt.title('Movie Scores Distribution', fontsize = 16)
plt.xlabel('Movie Score', fontsize = 16)
plt.ylabel('Count', fontsize = 16);

In [None]:
#correlation matrix of movie score, profit and profit margin

score_profit = movies.loc[:, ['movie_score', 'profit', 'profit_margin']]
score_profit_corr = score_profit.corr()
score_profit_corr

**What is the mean and median value of profit and profit margin for movies with ratings greater than 7?**

In [None]:
#movies scored 7 or higher whose profit and profit margin are both non-negative
score_7plus = movies.loc[
    movies['movie_score'].ge(7)
    & movies['profit'].ge(0)
    & movies['profit_margin'].ge(0)
]

In [None]:
score_7plus.describe()

In [None]:
#mean profit (bars) with its confidence interval for each movie score of 7 or higher

plt.figure(figsize=(15,9))
score_profit_ax = sns.barplot(x = 'movie_score', y = 'profit', data = score_7plus, palette = 'Set3')
score_profit_ax.set_title('Movie Scores Distribution with Profit', fontsize = 16)
score_profit_ax.set_xlabel('Movie Score', fontsize = 16)
score_profit_ax.set_ylabel('Profit in USD Million', fontsize = 16);

In [None]:
#mean profit margin (bars) with its confidence interval for each movie score of 7 or higher

plt.figure(figsize=(15,9))
score_margin_ax = sns.barplot(x = 'movie_score', y = 'profit_margin', data = score_7plus, palette = 'Set2')
score_margin_ax.set_title('Movie Scores Distribution with Profit Margin', fontsize = 16)
score_margin_ax.set_xlabel('Movie Score', fontsize = 16)
score_margin_ax.set_ylabel('Profit Margin', fontsize = 16);

In [None]:
#mean movie score per director, highest first; `director` only holds directors with at least 5 movies
director_rating = (
    director
    .groupby('director_name')['movie_score']
    .mean()
    .reset_index()
    .sort_values(by = 'movie_score', ascending = False)
)

In [None]:
#the 20 directors with the highest mean movie score
#the title says "at least 5" because the director list is built with a `>= 5` movie-count filter
plt.figure(figsize=(15,9))
sns.barplot(data = director_rating[:20], x = 'movie_score', y = 'director_name', palette = 'bwr')
plt.title('Mean Movie Score per Director with at least 5 Movies', fontsize = 16)
plt.xlabel('Movie Score', fontsize = 16)
plt.ylabel('Director Name', fontsize = 16);

# **How does the trend of profit, revenue, profit margin, and other attributes change over years?**

In [None]:
movies['title_year'].describe()

In [None]:
#gross, budget, profit and profit margin per release year, one panel each

fig, ax = plt.subplots(2, 2, figsize = (15, 8))

#(column, line colour) in panel order: top-left, top-right, bottom-left, bottom-right
yearly_metrics = [
    ('gross ($) millions', 'blue'),
    ('budget ($) millions', 'red'),
    ('profit', 'green'),
    ('profit_margin', 'yellow'),
]
for (metric_column, line_color), panel_ax in zip(yearly_metrics, ax.flat):
    sns.lineplot(x = "title_year", y = metric_column, color = line_color, data = movies, ax = panel_ax)

In [None]:
#visualizing budget and profit per year on shared axes
plt.figure(figsize=(13,7))
lineplot_budget = sns.lineplot(data = movies, x = 'title_year', y = 'budget ($) millions', color = 'red', label = 'budget')
lineplot_profit = sns.lineplot(data = movies, x = 'title_year', y = 'profit', color = 'green', label = 'profit')
plt.xlabel("Release year",fontsize = 15)
#both plotted columns are expressed in millions of dollars, so the axis label states that unit
plt.ylabel("Amount ($ millions)",fontsize = 15)
plt.title("Comparison average movies Budget and Profit per year",fontsize = 17)
plt.legend(fontsize = 'x-large')
plt.xlim([1929, 2016])
plt.show();

Something happened just before the 1940s; let's see what it was:

It appears that there were 3 movies at the time with a huge revenue:

- Snow White and the Seven Dwarfs 184.93
- Gone with the Wind 198.66
- Pinocchio 84.30

In [None]:
#movies released from 1930 to 1950 (both years included), newest first
year_40s_anomaly = movies.loc[movies['title_year'].between(1930, 1950)]

anomaly_columns = ['movie_title', 'title_year', 'gross ($) millions', 'budget ($) millions',
                   'profit', 'profit_margin', 'roi']
year_40s_anomaly[anomaly_columns].sort_values(by = 'title_year', ascending = False)

In [None]:
#number of movies released per year, as a line chart
movie_count_per_year = movies.groupby('title_year')['movie_title'].count()

movie_count_per_year.plot(figsize = (13,5))
plt.xlim([1929, 2016])
plt.title('Number of movies per year', fontsize = 17)
plt.xlabel('Year', fontsize = 15)
plt.ylabel('Movie count', fontsize = 15)
plt.show();

In [None]:
sns.lineplot(x = "title_year", y = 'num_voted_mill', color = 'blue', data = movies)

# **How do genres play in with profit and profit margin?**

- Comedy, Action, Drama, and Adventure are the most common genre in the dataset.
- Genres with fewer than 40 movies are excluded from the analysis.
- **Conclusion:** We recommend investing in the Animation genre, in the above mentioned budget range of 40 Million US Dollars to 75 Million US Dollars, as well as Family and Adventure genres, as they show a desirable ROI, and are not as expensive. They can be on the lower end of the budget recommendations.

In [None]:
movies['genres']

In [None]:
#Reshapes `movies` into one row per (movie, genre) pair, using at most the first three genres of each movie.

#splitting the pipe-separated genres string into one column per genre position (columns 0, 1, 2, ...)
movies_df1 = pd.DataFrame(movies.genres.str.split('|').to_list())

#attaching the genre columns to the original dataframe by row position
#(index-on-index merge: assumes `movies` carries a default 0..n-1 index, as after the earlier reset_index)
movies_df2 = pd.merge(movies, movies_df1, right_index = True, left_index = True)
movies_df3 = movies_df2.drop(['genres'], axis = 1)

#stacking the first three genre columns into rows: one (movie_title, title_year, X, value) row per genre slot
genre_movies_df = movies_df3.melt(id_vars=['movie_title', 'title_year'], value_vars=[0, 1, 2] ,var_name = ['X'])

#bringing back the remaining movie columns; with no `on` argument the merge joins on the columns
#the two frames share (here movie_title and title_year)
genre_movies_df = pd.merge(genre_movies_df, movies)
#the original pipe-separated 'genres' column and the genre-slot marker 'X' are no longer needed
genre_movies_df = genre_movies_df.drop(['genres', 'X'], axis=1)
genre_movies_df = genre_movies_df.drop_duplicates()
#the melted 'value' column holds the single genre of each row
genre_movies_df = genre_movies_df.rename(columns={'value': 'genres'})
#removes rows with any missing value, e.g. the empty genre slots of movies with fewer than three genres
genre_movies_df = genre_movies_df.dropna()

In [None]:
#counting movies per genre
#rename_axis + reset_index(name=...) yields the columns ['genres', 'count'] on every pandas version;
#renaming the reset_index() output by hand breaks on pandas >= 2.0, where value_counts() already names
#its result 'count' and its index 'genres' (the old rename then produced two 'count' columns)
genres_count = genre_movies_df['genres'].value_counts()
genres_count = genres_count.rename_axis('genres').reset_index(name = 'count')

#plotting genres count
plt.figure(figsize = (13,5))
sns.barplot(data = genres_count, x = 'count', y = 'genres', palette = 'tab20b')
plt.title('Number of movies per genre', fontsize = 17)
plt.xlabel('Movie Count', fontsize = 15)
plt.ylabel('Genre', fontsize = 15);

In [None]:
#the seven least frequent genres (the tail of genres_count, which is ordered by count) have too small
#a sample and are left out of the analysis; despite its name, this list holds the genres to be dropped
valid_sample_genre_list = genres_count['genres'].iloc[-7:].tolist()

#indexing the rows by genre so that the unwanted genres can be dropped by label
genre_movies_test = genre_movies_df.set_index('genres')
genre_movies_test = genre_movies_test.drop(index = valid_sample_genre_list)

In [None]:
#resetting the index: turns the 'genres' index back into a regular column
genre_movies_test.reset_index(inplace=True)
#NOTE(review): the index was named 'genres', so no 'index' column appears to exist after the reset and
#this rename leaves the column names unchanged; genres_final is then a renamed copy of genre_movies_test
genres_final = genre_movies_test.rename(columns = {'index':'genre'})

In [None]:
#median profit, ROI, budget and profit margin per genre, each sorted from highest to lowest
def _median_per_genre(column):
    """Return the per-genre median of `column` from genres_final, sorted in descending order."""
    return (
        genres_final
        .groupby('genres', as_index = False)[column]
        .median()
        .sort_values(by = column, ascending = False)
    )

genres_profit = _median_per_genre('profit')
genres_roi = _median_per_genre('roi')
genres_budget = _median_per_genre('budget ($) millions')
genres_profit_margin = _median_per_genre('profit_margin')

In [None]:
#per-genre medians as horizontal bar charts: profit, ROI, budget and profit margin
fig, ax = plt.subplots(2, 2, figsize = (15, 8), constrained_layout = True)

#(data, value column, palette, keep positive values only) in panel order:
#top-left, top-right, bottom-left, bottom-right
genre_panels = [
    (genres_profit, 'profit', 'Greens_r', True),
    (genres_roi, 'roi', 'Blues_r', True),
    (genres_budget, 'budget ($) millions', 'Reds', False),
    (genres_profit_margin, 'profit_margin', 'YlOrRd', True),
]
for (panel_data, value_column, palette_name, positive_only), panel_ax in zip(genre_panels, ax.flat):
    #genres with a non-positive median are hidden in every panel except the budget one
    shown = panel_data[panel_data[value_column] > 0] if positive_only else panel_data
    sns.barplot(data = shown, x = value_column, y = 'genres', palette = palette_name, ax = panel_ax)

In [None]:
#visualizing the genres and their average movie score, highest first

genres_score = genres_final.groupby('genres', as_index = False)['movie_score'].mean().sort_values(by='movie_score', ascending = False)
#the previous row labels are kept as an 'index' column; the chart does not use it
genres_score.reset_index(inplace = True)

plt.figure(figsize = (10,4))
sns.barplot(data = genres_score, x = 'movie_score', y = 'genres', palette = 'Dark2')
plt.xlabel('Movie Score', fontsize = 15)
#the y axis lists the genres, so it is labelled accordingly
plt.ylabel('Genre', fontsize = 15)
plt.show();

# **Is there a pattern in common plot keywords with successful movies?**

- No relevant conclusion was made

In [None]:
#creating a wordcloud function
def plot_wordcloud(wordcloud, title='Most common words:'):
    """
    Draw the given wordcloud as an image on a new 12x8 figure, without axes.

    Parameters
    ----------
    wordcloud : image-like object accepted by plt.imshow
        The wordcloud to display (the notebook passes generated WordCloud objects).
    title : str, optional
        Title shown above the image. Defaults to 'Most common words:', the title
        used before this parameter existed.
    """
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud)
    plt.title(title)
    plt.axis('off')

In [None]:
#plot keywords of the movies with a profit margin above 0.6 and a positive profit
high_margin_profitable = movies['profit_margin'].gt(0.6) & movies['profit'].gt(0)
plotwords_profit_margin = movies.loc[high_margin_profitable, 'plot_keywords']

In [None]:
#wordcloud of the 50 most frequent plot-keyword words among the high profit margin movies
profit_margin_text = ' '.join(plotwords_profit_margin)
wordcloud_profit_margin = WordCloud(
    width = 1500,
    height = 500,
    random_state = 42,
    background_color = 'white',
    colormap = 'Accent',
    collocations = False,
    max_words = 50,
    stopwords = STOPWORDS,
).generate(profit_margin_text)
plot_wordcloud(wordcloud_profit_margin)

In [None]:
#plot keywords of the movies that belong to at least one of the top 4 genres: Comedy, Action, Drama, Adventure
#'genres' holds pipe-separated strings (it is split on '|' in the genre analysis), so each string is
#split before matching; isin() on the raw column only matched movies whose whole genre string was
#exactly one of the four names
top_4_genres = ['Action', 'Comedy', 'Adventure', 'Drama']
in_top_4_genres = movies['genres'].str.split('|').apply(
    lambda movie_genres: not set(top_4_genres).isdisjoint(movie_genres)
)
plotwords_genres = movies.loc[in_top_4_genres, 'plot_keywords']

In [None]:
#wordcloud of the 50 most frequent plot-keyword words among the top-genre movies
top_genres_text = ' '.join(plotwords_genres)
wordcloud_top_genres = WordCloud(
    width = 1500,
    height = 500,
    random_state = 42,
    background_color = 'white',
    colormap = 'Dark2',
    collocations = False,
    max_words = 50,
    stopwords = STOPWORDS,
).generate(top_genres_text)
plot_wordcloud(wordcloud_top_genres)

In [None]:
#plot keywords of the 30 most profitable movies (movies_profitable_sorted is ordered by profit)
plotwords_most_profitable = movies_profitable_sorted['plot_keywords'].head(30)

In [None]:
#wordcloud of the 50 most frequent plot-keyword words among the 30 most profitable movies
most_profitable_text = ' '.join(plotwords_most_profitable)
wordcloud_top_most_profitable = WordCloud(
    width = 1500,
    height = 500,
    random_state = 42,
    background_color = 'white',
    colormap = 'Set2',
    collocations = False,
    max_words = 50,
    stopwords = STOPWORDS,
).generate(most_profitable_text)
plot_wordcloud(wordcloud_top_most_profitable)

In [None]:
#creating a list of the top 30 actors and collecting the plot keywords of their movies
list_top_30_actors = actor1_top30['actor_1_name'].tolist()
plotwords_top30_actor = movies.loc[movies['actor_1_name'].isin(list_top_30_actors), 'plot_keywords']

In [None]:
#wordcloud (at most 50 words) built from the space-joined keywords of the top 30 actors' movies
wordcloud_top30_actor = (
    WordCloud(
        width=1500,
        height=500,
        random_state=42,
        background_color='white',
        colormap='tab20',
        collocations=False,
        max_words=50,
        stopwords=STOPWORDS,
    )
    .generate(' '.join(plotwords_top30_actor))
)
plot_wordcloud(wordcloud_top30_actor)

In [None]:
#selecting the plot keywords of movies whose genre is exactly 'Horror'
plotwords_test = movies.loc[movies['genres'] == 'Horror', 'plot_keywords']

In [None]:
#wordcloud (at most 50 words) built from the space-joined keywords of horror movies
wordcloud_test = (
    WordCloud(
        width=1500,
        height=500,
        random_state=42,
        background_color='white',
        colormap='Set1',
        collocations=False,
        max_words=50,
        stopwords=STOPWORDS,
    )
    .generate(' '.join(plotwords_test))
)
plot_wordcloud(wordcloud_test)

# **Are duration and content rating of the movie relevant to profit?**

**Conclusion**: Focus on PG-13 movies, as the most common profitable genres (Animation, Adventure, Family) are in this group.

**Duration**

In [None]:
#visualizing the distribution of movie durations
#sns.displot is a figure-level function: it creates its own figure, sized through
#height/aspect, so a preceding plt.figure() call would only leave an empty extra figure
sns.displot(movies['duration'], kde = True, height = 6, aspect = 2, color = 'darkcyan', bins = 40)
plt.title("Movie's Duration Distribution", fontsize = 16)
plt.xlabel("Movie's Duration in Minutes", fontsize = 16)
plt.ylabel('Number of movies', fontsize = 16);

In [None]:
#line plot of movie score against duration
plt.figure(figsize=(17, 5))
sns.lineplot(x='duration', y='movie_score', data=movies, color='orchid')
plt.title('Movie Scores Compared to Duration', fontsize=17)
plt.xlabel('Duration', fontsize=15)
plt.ylabel('Movie Score', fontsize=15);

In [None]:
#pairwise correlation between duration, profit, profit margin, budget and movie score
duration_corr = movies.loc[:, ['duration', 'profit', 'profit_margin', 'budget', 'movie_score']]
duration_corr.corr()

**Content rating**

In [None]:
#counting movies per content rating and plotting the four most frequent ratings
rating_count = (
    movies.groupby('content_rating', as_index=False)['movie_title']
    .count()
    .sort_values(by='movie_title', ascending=False)
    .iloc[:4]
)

plt.figure(figsize=(17, 5))
sns.barplot(data=rating_count, x='movie_title', y='content_rating', palette='Accent')
plt.xlabel('Movie Count', fontsize=15)
plt.ylabel('Content Rating', fontsize=15)
plt.title('Number of movies per Content Rating', fontsize=17);

In [None]:
#counting movies per content rating (most frequent first) and showing the rating labels as an array
rating_sep = (
    movies.groupby('content_rating', as_index=False)['movie_title']
    .count()
    .sort_values(by='movie_title', ascending=False)
)
rating_sep['content_rating'].values

In [None]:
#the 8 least frequent content ratings (last 8 rows of rating_sep, which is sorted by movie count, descending)
#NOTE: despite its name, this list holds the ratings that are excluded from the analysis because
#their sample size is too small
valid_sample_rating_list = rating_sep['content_rating'].iloc[-8:].tolist()

#indexing the movies by content rating so the excluded ratings can be dropped by label
rating_movies_test = movies.set_index('content_rating').drop(valid_sample_rating_list)

In [None]:
#moving the index back into a regular column
rating_movies_test = rating_movies_test.reset_index()
#the rename only has an effect if the restored column came back named 'index' - presumably a safeguard
rating_final = rating_movies_test.rename(columns={'index': 'content_rating'})

In [None]:
#bar plot of movie score per remaining content rating
plt.figure(figsize=(10, 4))
sns.barplot(x='content_rating', y='movie_score', data=rating_final, palette='Accent')
plt.title('Movie Scores Compared to Top 4 Content Ratings', fontsize=15)
plt.xlabel('Content Rating', fontsize=12)
plt.ylabel('Movie Score', fontsize=12);

In [None]:
#median profit, profit margin, budget and score per content rating, highest median profit first
rating_profit = (
    rating_final
    .groupby('content_rating', as_index=False)[
        ['profit', 'profit_margin', 'budget ($) millions', 'movie_score']
    ]
    .median()
    .sort_values(by='profit', ascending=False)
)
rating_profit

# **How do the number of users who voted and the number of critic reviews impact profit?**

**Conclusion:** No significant strategy can be recommended without specific demographic information behind the user votes, the user reviews, and the critic reviews.

In [None]:
#summary statistics (including mean and median) for critic reviews, votes and user reviews
movies.loc[:, ['num_critic_reviews', 'num_voted_mill', 'num_users_voted', 'num_user_reviews']].describe()

In [None]:
#top 10 movies with most vote counts
#the columns are selected with a list: picking several groupby columns with a bare tuple
#(['profit', 'num_voted_mill']) is deprecated and raises an error in pandas >= 2.0
top10_votes = (
    movies.groupby('movie_title', as_index = False)[['profit', 'num_voted_mill']]
    .sum()
    .sort_values(by = 'num_voted_mill', ascending = False)[:10]
)

top10_votes

In [None]:
#bar plot of the top 10 voted movies, each bar labelled with the movie's profit
plt.figure(figsize=(15, 7))
ax = sns.barplot(x='movie_title', y='num_voted_mill', data=top10_votes)

#pairing the bars with the profit column in order and writing the value centred above each bar
for bar, profit_value in zip(ax.patches, top10_votes['profit']):
    ax.annotate(
        f"{profit_value:.2f}",
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
        ha='center',
        va='bottom',
    )

plt.xticks(rotation=90, fontsize=13)
plt.xlabel('Movie Title', fontsize=12)
plt.ylabel('Number of Votes in Millions', fontsize=12)

plt.title('Top 10 Movies with the highest number of votes and their profit values', fontsize=15);
# plt.savefig('top10_votes_profit.png');

In [None]:
#pairwise correlation of critic reviews, users voted and user reviews with profit and profit margin
reviews_cotes_profit_corr = movies.loc[
    :,
    ['num_critic_reviews', 'num_users_voted', 'num_user_reviews', 'profit', 'profit_margin'],
]
reviews_cotes_profit_corr.corr()

**Genres and number of users voted, number of user reviews**

In [None]:
#total users voted, user reviews and critic reviews per genre, most voted genre first
genre_reviews = (
    genres_final
    .groupby('genres', as_index=False)[
        ['num_users_voted', 'num_user_reviews', 'num_critic_reviews']
    ]
    .sum()
    .sort_values(by='num_users_voted', ascending=False)
)

In [None]:
#user reviews and critic reviews per genre, drawn as two bar series on the same axes
plt.figure(figsize=(15, 6))

#user reviews are drawn first, critic reviews second on top of them
sns.barplot(x='genres', y='num_user_reviews', data=genre_reviews,
            color='cyan', label='User Reviews', ci=None)
sns.barplot(x='genres', y='num_critic_reviews', data=genre_reviews,
            color='yellow', label='Critic reviews', ci=None)

plt.title('Genres and Number users reviewed and critic reviews', fontsize=16)
plt.xlabel('Genres', fontsize=16)
plt.ylabel('User reviews and Critic reviews', fontsize=16)
plt.xticks(rotation=45, fontsize=12)
plt.legend(fontsize=16);
# plt.savefig('reviews_and_critic_genres.png')

In [None]:
#bar plot of the total number of users voted per genre
plt.figure(figsize=(15, 6))
sns.barplot(x='genres', y='num_users_voted', data=genre_reviews,
            color='green', label='Users Voted', ci=None)
plt.title('Number of users voted', fontsize=16)
plt.xlabel('Genres', fontsize=16)
plt.ylabel('Number of votes per genre', fontsize=16)
plt.xticks(rotation=45, fontsize=12);

**Profit and number of users voted, number of user reviews**

In [None]:
#scatter plot with regression line: profit vs. number of users voted, profitable movies only
plt.figure(figsize=(17, 5))
sns.regplot(x='profit', y='num_voted_mill',
            data=movies[movies['profit'] > 0], color='green')
plt.ylabel('Number of users voted in Millions', fontsize=15)
plt.xlabel('Profit in USD Millions', fontsize=15)
plt.title('Profit and number of users voted', fontsize=17)
# plt.savefig('reviews_and_critic_genres.png');