In [2]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Import necessary libraries
import numpy as np
import pandas as pd

try:
    # Load the CSV file into a DataFrame
    movies = pd.read_csv('movie_data.csv')
    # Display the DataFrame
    print(movies)
except FileNotFoundError:
    print("File not found. Please check the file path.")
except Exception as e:
    print("An error occurred:", e)


      color      director_name  num_critic_for_reviews  duration  \
0     Color      James Cameron                   723.0     178.0   
1     Color     Gore Verbinski                   302.0     169.0   
2     Color         Sam Mendes                   602.0     148.0   
3     Color  Christopher Nolan                   813.0     164.0   
4       NaN        Doug Walker                     NaN       NaN   
...     ...                ...                     ...       ...   
5038  Color        Scott Smith                     1.0      87.0   
5039  Color                NaN                    43.0      43.0   
5040  Color   Benjamin Roberds                    13.0      76.0   
5041  Color        Daniel Hsia                    14.0     100.0   
5042  Color           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                         0.0                   855.0  Joel David Moore   
1                       563.0    

In [3]:
# 1.2: Inspect the dataframe

# Import necessary libraries
import numpy as np
import pandas as pd

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

try:
    # Load the CSV file into a DataFrame
    movies = pd.read_csv('movie_data.csv')
    
    # Inspect the dataframe
    print("Statistical data:")
    print(movies.describe())  # Observes statistical data like percentile, min, max, count, mean, and std of the numerical values of the Series or DataFrame
    
    print("\nColumn names:")
    print(movies.columns)     # Prints the columns' names
    
    print("\nDataframe type:")
    print(type(movies))       # Prints the datatype of the dataframe
    
    print("\nDataframe shape:")
    print(movies.shape)       # Prints the shape (row*column) of the dataframe
    
    print("\nDataframe info:")
    print(movies.info())      # Prints a concise summary of the DataFrame
    
except FileNotFoundError:
    print("File not found. Please check the file path.")
except Exception as e:
    print("An error occurred:", e)

Statistical data:
       num_critic_for_reviews     duration  director_facebook_likes  \
count             4993.000000  5028.000000              4939.000000   
mean               140.194272   107.201074               686.509212   
std                121.601675    25.197441              2813.328607   
min                  1.000000     7.000000                 0.000000   
25%                 50.000000    93.000000                 7.000000   
50%                110.000000   103.000000                49.000000   
75%                195.000000   118.000000               194.500000   
max                813.000000   511.000000             23000.000000   

       actor_3_facebook_likes  actor_1_facebook_likes         gross  \
count             5020.000000             5036.000000  4.159000e+03   
mean               645.009761             6560.047061  4.846841e+07   
std               1665.041728            15020.759120  6.845299e+07   
min                  0.000000                0.000000  1.6

In [4]:
# 2.1: Inspect Null values

# Find out the number of Null values in all the columns and rows. 
# Also, find the percentage of Null values in each column. 
# Round off the percentages up to two decimal places.

# Column-wise null count
null_counts = movies.isnull().sum()                  # Finding the number of null values in each column
null_percentages = round((null_counts / len(movies)) * 100, 2)   # Calculating the percentage of null values in each column
null_summary = pd.DataFrame({'Null Count': null_counts, 'Null Percentage': null_percentages})
print(null_summary)


                           Null Count  Null Percentage
color                              19             0.38
director_name                     104             2.06
num_critic_for_reviews             50             0.99
duration                           15             0.30
director_facebook_likes           104             2.06
actor_3_facebook_likes             23             0.46
actor_2_name                       13             0.26
actor_1_facebook_likes              7             0.14
gross                             884            17.53
genres                              0             0.00
actor_1_name                        7             0.14
movie_title                         0             0.00
num_voted_users                     0             0.00
cast_total_facebook_likes           0             0.00
actor_3_name                       23             0.46
facenumber_in_poster               13             0.26
plot_keywords                     153             3.03
movie_imdb

In [5]:
# 2.1: Inspect Null values

# Row-wise null count here
null_counts_row = movies.isnull().sum(axis=1)   # Finding the number of null values in each row
null_counts_row


0        0
1        0
2        0
3        0
4       14
        ..
5038     4
5039     5
5040     4
5041     2
5042     0
Length: 5043, dtype: int64

In [9]:
# 2.1: Inspect Null values

# Column-wise null percentages here
null_col = movies.isnull().sum(axis=0)                   # Finding the number of null values in each column
percent_col = (null_col / len(movies)) * 100             # Calculating the percentage of null values in each column
percent_col = round(percent_col, 2)                      # Rounding off the percentages to two decimal places
percent_col

color                         0.38
director_name                 2.06
num_critic_for_reviews        0.99
duration                      0.30
director_facebook_likes       2.06
actor_3_facebook_likes        0.46
actor_2_name                  0.26
actor_1_facebook_likes        0.14
gross                        17.53
genres                        0.00
actor_1_name                  0.14
movie_title                   0.00
num_voted_users               0.00
cast_total_facebook_likes     0.00
actor_3_name                  0.46
facenumber_in_poster          0.26
plot_keywords                 3.03
movie_imdb_link               0.00
num_user_for_reviews          0.42
language                      0.28
country                       0.10
content_rating                6.01
budget                        9.76
title_year                    2.14
actor_2_facebook_likes        0.26
imdb_score                    0.00
aspect_ratio                  6.52
movie_facebook_likes          0.00
dtype: float64

In [6]:
# 2.2: Drop unnecessary columns

# List of columns to be dropped
columns_to_drop = ['color', 'director_facebook_likes', 'actor_1_facebook_likes', 'actor_2_facebook_likes',
                   'actor_3_facebook_likes', 'actor_2_name', 'cast_total_facebook_likes', 'actor_3_name',
                   'duration', 'facenumber_in_poster', 'content_rating', 'country', 'movie_imdb_link',
                   'aspect_ratio', 'plot_keywords']

# Dropping the columns
movies.drop(columns=columns_to_drop, inplace=True)

# Inspecting the dataframe after dropping columns
movies.head()


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes
0,James Cameron,723.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237000000.0,2009.0,7.9,33000
1,Gore Verbinski,302.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300000000.0,2007.0,7.1,0
2,Sam Mendes,602.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245000000.0,2015.0,6.8,85000
3,Christopher Nolan,813.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250000000.0,2012.0,8.5,164000
4,Doug Walker,,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,,,,,7.1,0


In [11]:
# 2.3: Drop rows with null values in columns with high null percentages

# Dropping rows with null values in 'gross' and 'budget' columns
movies.dropna(subset=['gross', 'budget'], inplace=True)

# Inspecting the dataframe after dropping rows
movies.head()


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes
0,James Cameron,723.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237000000.0,2009.0,7.9,33000
1,Gore Verbinski,302.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300000000.0,2007.0,7.1,0
2,Sam Mendes,602.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245000000.0,2015.0,6.8,85000
3,Christopher Nolan,813.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250000000.0,2012.0,8.5,164000
5,Andrew Stanton,462.0,73058679.0,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263700000.0,2012.0,6.6,24000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5033,Shane Carruth,143.0,424760.0,Drama|Sci-Fi|Thriller,Shane Carruth,Primer,72639,371.0,English,7000.0,2004.0,7.0,19000
5034,Neill Dela Llana,35.0,70071.0,Thriller,Ian Gamazon,Cavite,589,35.0,English,7000.0,2005.0,6.3,74
5035,Robert Rodriguez,56.0,2040920.0,Action|Crime|Drama|Romance|Thriller,Carlos Gallardo,El Mariachi,52055,130.0,Spanish,7000.0,1992.0,6.9,0
5037,Edward Burns,14.0,4584.0,Comedy|Drama,Kerry Bishé,Newlyweds,1338,14.0,English,9000.0,2011.0,6.4,413


In [7]:
# 2.4: Drop rows with greater than five NaN values

# Counting the number of NaN values in each row
null_counts_row = movies.isnull().sum(axis=1)

# Dropping rows with more than five NaN values
rows_to_drop = null_counts_row[null_counts_row > 5].index
movies.drop(index=rows_to_drop, inplace=True)

# Inspecting the dataframe after dropping rows
movies.head()


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes
0,James Cameron,723.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237000000.0,2009.0,7.9,33000
1,Gore Verbinski,302.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300000000.0,2007.0,7.1,0
2,Sam Mendes,602.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245000000.0,2015.0,6.8,85000
3,Christopher Nolan,813.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250000000.0,2012.0,8.5,164000
5,Andrew Stanton,462.0,73058679.0,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263700000.0,2012.0,6.6,24000


In [8]:
# 2.5: Fill NaN values in the 'language' column with 'English'

# Filling NaN values in the 'language' column with 'English'
movies['language'].fillna('English', inplace=True)

# Inspecting the dataframe after filling NaN values
movies.head()

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes
0,James Cameron,723.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237000000.0,2009.0,7.9,33000
1,Gore Verbinski,302.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300000000.0,2007.0,7.1,0
2,Sam Mendes,602.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245000000.0,2015.0,6.8,85000
3,Christopher Nolan,813.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250000000.0,2012.0,8.5,164000
5,Andrew Stanton,462.0,73058679.0,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263700000.0,2012.0,6.6,24000


In [9]:
# 2.6: Check the number of retained rows

# Printing the number of retained rows
print("Number of retained rows:", len(movies))

# Printing the percentage of retained rows
retained_percentage = (len(movies) / 5043) * 100
print("Percentage of retained rows:", retained_percentage)


Number of retained rows: 5039
Percentage of retained rows: 99.9206821336506


In [10]:
# 3.1: Change the unit of columns

# Converting the unit of 'budget' and 'gross' columns from $ to million $
movies['budget'] = movies['budget'] / 1000000
movies['gross'] = movies['gross'] / 1000000

# Displaying the dataframe after unit conversion
movies.head()


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0,2009.0,7.9,33000
1,Gore Verbinski,302.0,309.404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300.0,2007.0,7.1,0
2,Sam Mendes,602.0,200.074175,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245.0,2015.0,6.8,85000
3,Christopher Nolan,813.0,448.130642,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250.0,2012.0,8.5,164000
5,Andrew Stanton,462.0,73.058679,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263.7,2012.0,6.6,24000


In [11]:
# 3.2: Find the movies with the highest profit

# Creating a new column 'profit' which contains the difference of 'gross' and 'budget'
movies['profit'] = movies['gross'] - movies['budget']

# Sorting the dataframe by 'profit' column in descending order
movies_sorted = movies.sort_values(by='profit', ascending=False)

# Extracting the top ten profiting movies
top10 = movies_sorted.head(10)

# Displaying the top ten profiting movies
top10


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0,2009.0,7.9,33000,523.505847
29,Colin Trevorrow,644.0,652.177271,Action|Adventure|Sci-Fi|Thriller,Bryce Dallas Howard,Jurassic World,418214,1290.0,English,150.0,2015.0,7.0,150000,502.177271
26,James Cameron,315.0,658.672302,Drama|Romance,Leonardo DiCaprio,Titanic,793059,2528.0,English,200.0,1997.0,7.7,26000,458.672302
3024,George Lucas,282.0,460.935665,Action|Adventure|Fantasy|Sci-Fi,Harrison Ford,Star Wars: Episode IV - A New Hope,911097,1470.0,English,11.0,1977.0,8.7,33000,449.935665
3080,Steven Spielberg,215.0,434.949459,Family|Sci-Fi,Henry Thomas,E.T. the Extra-Terrestrial,281842,515.0,English,10.5,1982.0,7.9,34000,424.449459
794,Joss Whedon,703.0,623.279547,Action|Adventure|Sci-Fi,Chris Hemsworth,The Avengers,995415,1722.0,English,220.0,2012.0,8.1,123000,403.279547
17,Joss Whedon,703.0,623.279547,Action|Adventure|Sci-Fi,Chris Hemsworth,The Avengers,995415,1722.0,English,220.0,2012.0,8.1,123000,403.279547
509,Roger Allers,186.0,422.783777,Adventure|Animation|Drama|Family|Musical,Matthew Broderick,The Lion King,644348,656.0,English,45.0,1994.0,8.5,17000,377.783777
240,George Lucas,320.0,474.544677,Action|Adventure|Fantasy|Sci-Fi,Natalie Portman,Star Wars: Episode I - The Phantom Menace,534658,3597.0,English,115.0,1999.0,6.5,13000,359.544677
66,Christopher Nolan,645.0,533.316061,Action|Crime|Drama|Thriller,Christian Bale,The Dark Knight,1676169,4667.0,English,185.0,2008.0,9.0,37000,348.316061


In [15]:
# Sorting the dataframe by 'profit' column in descending order
movies.sort_values(by='profit', ascending=False, inplace=True)

# Displaying the sorted dataframe
movies


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.000000,2009.0,7.9,33000,523.505847
29,Colin Trevorrow,644.0,652.177271,Action|Adventure|Sci-Fi|Thriller,Bryce Dallas Howard,Jurassic World,418214,1290.0,English,150.000000,2015.0,7.0,150000,502.177271
26,James Cameron,315.0,658.672302,Drama|Romance,Leonardo DiCaprio,Titanic,793059,2528.0,English,200.000000,1997.0,7.7,26000,458.672302
3024,George Lucas,282.0,460.935665,Action|Adventure|Fantasy|Sci-Fi,Harrison Ford,Star Wars: Episode IV - A New Hope,911097,1470.0,English,11.000000,1977.0,8.7,33000,449.935665
3080,Steven Spielberg,215.0,434.949459,Family|Sci-Fi,Henry Thomas,E.T. the Extra-Terrestrial,281842,515.0,English,10.500000,1982.0,7.9,34000,424.449459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2334,Katsuhiro Ôtomo,105.0,0.410388,Action|Adventure|Animation|Family|Sci-Fi|Thriller,William Hootkins,Steamboy,13727,79.0,Japanese,2127.519898,2004.0,6.9,973,-2127.109510
2323,Hayao Miyazaki,174.0,2.298191,Adventure|Animation|Fantasy,Minnie Driver,Princess Mononoke,221552,570.0,Japanese,2400.000000,1997.0,8.4,11000,-2397.701809
3005,Lajos Koltai,73.0,0.195888,Drama|Romance|War,Marcell Nagy,Fateless,5603,45.0,Hungarian,2500.000000,2005.0,7.1,607,-2499.804112
3859,Chan-wook Park,202.0,0.211667,Crime|Drama,Min-sik Choi,Lady Vengeance,53508,131.0,Korean,4200.000000,2005.0,7.7,4000,-4199.788333


In [12]:
# Getting the top 10 profiting movies
top10 = movies.head(10)

# Displaying the top 10 profiting movies
top10


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0,2009.0,7.9,33000,523.505847
1,Gore Verbinski,302.0,309.404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300.0,2007.0,7.1,0,9.404152
2,Sam Mendes,602.0,200.074175,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245.0,2015.0,6.8,85000,-44.925825
3,Christopher Nolan,813.0,448.130642,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250.0,2012.0,8.5,164000,198.130642
5,Andrew Stanton,462.0,73.058679,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263.7,2012.0,6.6,24000,-190.641321
6,Sam Raimi,392.0,336.530303,Action|Adventure|Romance,J.K. Simmons,Spider-Man 3,383056,1902.0,English,258.0,2007.0,6.2,0,78.530303
7,Nathan Greno,324.0,200.807262,Adventure|Animation|Comedy|Family|Fantasy|Musi...,Brad Garrett,Tangled,294810,387.0,English,260.0,2010.0,7.8,29000,-59.192738
8,Joss Whedon,635.0,458.991599,Action|Adventure|Sci-Fi,Chris Hemsworth,Avengers: Age of Ultron,462669,1117.0,English,250.0,2015.0,7.5,118000,208.991599
9,David Yates,375.0,301.95698,Adventure|Family|Fantasy|Mystery,Alan Rickman,Harry Potter and the Half-Blood Prince,321795,973.0,English,250.0,2009.0,7.5,10000,51.95698
10,Zack Snyder,673.0,330.249062,Action|Adventure|Sci-Fi,Henry Cavill,Batman v Superman: Dawn of Justice,371639,3018.0,English,250.0,2016.0,6.9,197000,80.249062


In [13]:
# 3.3: Drop duplicate values

# Dropping duplicate rows from the dataframe
movies.drop_duplicates(inplace=True)

# Displaying the dataframe after dropping duplicates
movies


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0000,2009.0,7.9,33000,523.505847
1,Gore Verbinski,302.0,309.404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300.0000,2007.0,7.1,0,9.404152
2,Sam Mendes,602.0,200.074175,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245.0000,2015.0,6.8,85000,-44.925825
3,Christopher Nolan,813.0,448.130642,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250.0000,2012.0,8.5,164000,198.130642
5,Andrew Stanton,462.0,73.058679,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263.7000,2012.0,6.6,24000,-190.641321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Scott Smith,1.0,,Comedy|Drama,Eric Mabius,Signed Sealed Delivered,629,6.0,English,,2013.0,7.7,84,
5039,,43.0,,Crime|Drama|Mystery|Thriller,Natalie Zea,The Following,73839,359.0,English,,,7.5,32000,
5040,Benjamin Roberds,13.0,,Drama|Horror|Thriller,Eva Boehnke,A Plague So Pleasant,38,3.0,English,0.0014,2013.0,6.3,16,
5041,Daniel Hsia,14.0,0.010443,Comedy|Drama|Romance,Alan Ruck,Shanghai Calling,1255,9.0,English,,2012.0,6.3,660,


In [16]:
# Add a new column 'profit' to the dataframe
movies['profit'] = movies['gross'] - movies['budget']

# Display the dataframe with the new 'profit' column
movies


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0000,2009.0,7.9,33000,523.505847
1,Gore Verbinski,302.0,309.404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300.0000,2007.0,7.1,0,9.404152
2,Sam Mendes,602.0,200.074175,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245.0000,2015.0,6.8,85000,-44.925825
3,Christopher Nolan,813.0,448.130642,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250.0000,2012.0,8.5,164000,198.130642
5,Andrew Stanton,462.0,73.058679,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263.7000,2012.0,6.6,24000,-190.641321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Scott Smith,1.0,,Comedy|Drama,Eric Mabius,Signed Sealed Delivered,629,6.0,English,,2013.0,7.7,84,
5039,,43.0,,Crime|Drama|Mystery|Thriller,Natalie Zea,The Following,73839,359.0,English,,,7.5,32000,
5040,Benjamin Roberds,13.0,,Drama|Horror|Thriller,Eva Boehnke,A Plague So Pleasant,38,3.0,English,0.0014,2013.0,6.3,16,
5041,Daniel Hsia,14.0,0.010443,Comedy|Drama|Romance,Alan Ruck,Shanghai Calling,1255,9.0,English,,2012.0,6.3,660,


In [17]:
# 3.4: Find IMDb Top 250

# Sorting the movies dataframe by 'imdb_score' in descending order
movies_sorted_by_imdb = movies.sort_values(by='imdb_score', ascending=False)

# Filtering the top 250 movies with 'num_voted_users' greater than 25,000
IMDb_Top_250 = movies_sorted_by_imdb[movies_sorted_by_imdb['num_voted_users'] > 25000].head(250)

# Adding a 'Rank' column to indicate the ranks of the movies
IMDb_Top_250['Rank'] = range(1, 251)

# Displaying the IMDb_Top_250 dataframe
IMDb_Top_250


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,Rank
1937,Frank Darabont,199.0,28.341469,Crime|Drama,Morgan Freeman,The Shawshank Redemption,1689764,4144.0,English,25.000000,1994.0,9.3,108000,3.341469,1
3466,Francis Ford Coppola,208.0,134.821952,Crime|Drama,Al Pacino,The Godfather,1155770,2238.0,English,6.000000,1972.0,9.2,43000,128.821952,2
2837,Francis Ford Coppola,149.0,57.300000,Crime|Drama,Robert De Niro,The Godfather: Part II,790926,650.0,English,13.000000,1974.0,9.0,14000,44.300000,3
3481,,54.0,,Crime|Drama|Thriller,Kirsten Dunst,Fargo,170055,173.0,English,,,9.0,61000,,4
66,Christopher Nolan,645.0,533.316061,Action|Crime|Drama|Thriller,Christian Bale,The Dark Knight,1676169,4667.0,English,185.000000,2008.0,9.0,37000,348.316061,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,Ang Lee,552.0,124.976634,Adventure|Drama|Fantasy,Suraj Sharma,Life of Pi,440084,755.0,English,120.000000,2012.0,8.0,122000,4.976634,246
3741,Edgar Wright,246.0,13.464388,Comedy|Horror,Peter Serafinowicz,Shaun of the Dead,395921,859.0,English,4.000000,2004.0,8.0,19000,9.464388,247
2944,Martin Campbell,400.0,167.007184,Action|Adventure|Thriller,Eva Green,Casino Royale,470501,2301.0,English,150.000000,2006.0,8.0,0,17.007184,248
3630,Sam Peckinpah,147.0,,Action|Adventure|Western,William Holden,The Wild Bunch,63192,287.0,English,6.244087,1969.0,8.0,0,,249


In [38]:
# Extracting the top foreign language films from the IMDb_Top_250 dataframe
Top_Foreign_Lang_Film = IMDb_Top_250[IMDb_Top_250['language'] != 'English']

# Displaying the top foreign language films
Top_Foreign_Lang_Film.head(91)

Unnamed: 0,Rank,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
4498,7,Sergio Leone,181.0,6.100000,Western,Clint Eastwood,"The Good, the Bad and the Ugly",503509,780.0,Italian,1.2,1966.0,8.9,20000,4.900000
4029,17,Fernando Meirelles,214.0,7.563397,Crime|Drama,Alice Braga,City of God,533200,749.0,Portuguese,3.3,2002.0,8.7,28000,4.263397
4747,19,Akira Kurosawa,153.0,0.269061,Action|Adventure|Drama,Takashi Shimura,Seven Samurai,229012,596.0,Japanese,2.0,1954.0,8.7,11000,-1.730939
2373,22,Hayao Miyazaki,246.0,10.049886,Adventure|Animation|Family|Fantasy,Bunta Sugawara,Spirited Away,417971,902.0,Japanese,19.0,2001.0,8.6,28000,-8.950114
4259,29,Florian Henckel von Donnersmarck,215.0,11.284657,Drama|Thriller,Sebastian Koch,The Lives of Others,259379,407.0,German,2.0,2006.0,8.5,39000,9.284657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3883,1525,Timur Bekmambetov,232.0,1.487477,Fantasy|Thriller,Konstantin Khabenskiy,Night Watch,47097,310.0,Russian,4.2,2004.0,6.5,0,-2.712523
377,1639,Sydney Pollack,227.0,72.515360,Crime|Mystery|Thriller,Curtiss Cook,The Interpreter,86152,411.0,Aboriginal,80.0,2005.0,6.4,0,-7.484640
4671,1708,Tommy Wirkola,224.0,0.041709,Comedy|Horror,Bjørn Sundquist,Dead Snow,54601,200.0,Norwegian,0.8,2009.0,6.4,23000,-0.758291
484,2090,Martin Campbell,137.0,45.356386,Action|Adventure|Western,Michael Emerson,The Legend of Zorro,71574,244.0,Spanish,75.0,2005.0,5.9,951,-29.643614


In [18]:
# 3.5: Find the best directors

# Grouping the dataframe by 'director_name' and calculating the mean of 'imdb_score' for each director
director_grouped = movies.groupby('director_name')['imdb_score'].mean()

# Sorting the directors based on the mean 'imdb_score' in descending order and selecting the top 10 directors
top10director = pd.DataFrame(director_grouped.sort_values(ascending=False).head(10))

# Displaying the top 10 directors
top10director


Unnamed: 0_level_0,imdb_score
director_name,Unnamed: 1_level_1
John Blanchard,9.5
Cary Bell,8.7
Mitchell Altieri,8.7
Sadyk Sher-Niyaz,8.7
Mike Mayhall,8.6
Charles Chaplin,8.6
Ron Fricke,8.5
Majid Majidi,8.5
Raja Menon,8.5
Damien Chazelle,8.5


In [21]:
# 3.6: Find popular genres

# Extracting the first two genres from the 'genres' column
genres_split = movies['genres'].str.split('|', n=1, expand=True)
movies['genre_1'] = genres_split[0]
movies['genre_2'] = genres_split[1].fillna(genres_split[0])

# Displaying the first two genres of each movie
print(movies['genre_1'])
print(movies['genre_2'])


0            Action
1            Action
2            Action
3            Action
5            Action
           ...     
5038         Comedy
5039          Crime
5040          Drama
5041         Comedy
5042    Documentary
Name: genre_1, Length: 4992, dtype: object
0       Adventure|Fantasy|Sci-Fi
1              Adventure|Fantasy
2             Adventure|Thriller
3                       Thriller
5               Adventure|Sci-Fi
                  ...           
5038                       Drama
5039      Drama|Mystery|Thriller
5040             Horror|Thriller
5041               Drama|Romance
5042                 Documentary
Name: genre_2, Length: 4992, dtype: object


In [23]:
# Grouping the dataframe by 'genre_1' and 'genre_2'
movies_by_segment = movies.groupby(['genre_1', 'genre_2'])

# Displaying the grouped dataframe
movies_by_segment


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021FDA78EDE0>

In [24]:
# Finding the 5 most popular combo of genres by mean gross
PopGenre = pd.DataFrame(movies_by_segment['gross'].mean().sort_values(ascending=False).head(5))

# Displaying the 5 most popular genre combinations
PopGenre


Unnamed: 0_level_0,Unnamed: 1_level_0,gross
genre_1,genre_2,Unnamed: 2_level_1
Family,Sci-Fi,434.949459
Adventure,Animation|Drama|Family|Musical,422.783777
Adventure,Animation|Comedy|Drama|Family|Fantasy,356.454367
Action,Biography|Drama|History|Thriller|War,350.123553
Action,Adventure|Fantasy|Sci-Fi,296.684758


In [25]:
# Creating the Meryl_Streep dataframe
Meryl_Streep = movies[movies['actor_1_name'] == 'Meryl Streep']

# Displaying the Meryl_Streep dataframe
Meryl_Streep


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,genre_1,genre_2
410,Nancy Meyers,187.0,112.70347,Comedy|Drama|Romance,Meryl Streep,It's Complicated,69860,214.0,English,85.0,2009.0,6.6,0,27.70347,Comedy,Drama|Romance
1106,Curtis Hanson,42.0,46.815748,Action|Adventure|Crime|Thriller,Meryl Streep,The River Wild,32544,69.0,English,45.0,1994.0,6.3,0,1.815748,Action,Adventure|Crime|Thriller
1204,Nora Ephron,252.0,94.125426,Biography|Drama|Romance,Meryl Streep,Julie & Julia,79264,277.0,English,40.0,2009.0,7.0,13000,54.125426,Biography,Drama|Romance
1408,David Frankel,208.0,124.732962,Comedy|Drama|Romance,Meryl Streep,The Devil Wears Prada,286178,631.0,English,35.0,2006.0,6.8,0,89.732962,Comedy,Drama|Romance
1483,Robert Redford,227.0,14.99807,Drama|Thriller|War,Meryl Streep,Lions for Lambs,41170,298.0,English,35.0,2007.0,6.2,0,-20.00193,Drama,Thriller|War
1575,Sydney Pollack,66.0,87.1,Biography|Drama|Romance,Meryl Streep,Out of Africa,52339,200.0,English,31.0,1985.0,7.2,0,56.1,Biography,Drama|Romance
1618,David Frankel,234.0,63.536011,Comedy|Drama|Romance,Meryl Streep,Hope Springs,34258,178.0,English,30.0,2012.0,6.3,0,33.536011,Comedy,Drama|Romance
1674,Carl Franklin,64.0,23.20944,Drama,Meryl Streep,One True Thing,9283,112.0,English,30.0,1998.0,7.0,592,-6.79056,Drama,Drama
1752,Stephen Frears,87.0,,Biography|Comedy|Drama|Music|Romance,Meryl Streep,Florence Foster Jenkins,2167,32.0,English,29.0,2016.0,7.1,0,,Biography,Comedy|Drama|Music|Romance
1925,Stephen Daldry,174.0,41.59783,Drama|Romance,Meryl Streep,The Hours,102123,660.0,English,25.0,2002.0,7.6,0,16.59783,Drama,Romance


In [26]:
# Creating the Leo_Caprio dataframe
Leo_Caprio = movies[movies['actor_1_name'] == 'Leonardo DiCaprio']

# Displaying the Leo_Caprio dataframe
Leo_Caprio


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,genre_1,genre_2
26,James Cameron,315.0,658.672302,Drama|Romance,Leonardo DiCaprio,Titanic,793059,2528.0,English,200.0,1997.0,7.7,26000,458.672302,Drama,Romance
50,Baz Luhrmann,490.0,144.812796,Drama|Romance,Leonardo DiCaprio,The Great Gatsby,362912,753.0,English,105.0,2013.0,7.3,115000,39.812796,Drama,Romance
97,Christopher Nolan,642.0,292.568851,Action|Adventure|Sci-Fi|Thriller,Leonardo DiCaprio,Inception,1468200,2803.0,English,160.0,2010.0,8.8,175000,132.568851,Action,Adventure|Sci-Fi|Thriller
179,Alejandro G. Iñárritu,556.0,183.635922,Adventure|Drama|Thriller|Western,Leonardo DiCaprio,The Revenant,406020,1188.0,English,135.0,2015.0,8.1,190000,48.635922,Adventure,Drama|Thriller|Western
257,Martin Scorsese,267.0,102.608827,Biography|Drama,Leonardo DiCaprio,The Aviator,264318,799.0,English,110.0,2004.0,7.5,0,-7.391173,Biography,Drama
296,Quentin Tarantino,765.0,162.804648,Drama|Western,Leonardo DiCaprio,Django Unchained,955174,1193.0,English,100.0,2012.0,8.5,199000,62.804648,Drama,Western
307,Edward Zwick,166.0,57.366262,Adventure|Drama|Thriller,Leonardo DiCaprio,Blood Diamond,400292,657.0,English,100.0,2006.0,8.0,14000,-42.633738,Adventure,Drama|Thriller
308,Martin Scorsese,606.0,116.866727,Biography|Comedy|Crime|Drama,Leonardo DiCaprio,The Wolf of Wall Street,780588,1138.0,English,100.0,2013.0,8.2,138000,16.866727,Biography,Comedy|Crime|Drama
326,Martin Scorsese,233.0,77.679638,Crime|Drama,Leonardo DiCaprio,Gangs of New York,314033,1166.0,English,100.0,2002.0,7.5,0,-22.320362,Crime,Drama
361,Martin Scorsese,352.0,132.373442,Crime|Drama|Thriller,Leonardo DiCaprio,The Departed,873649,2054.0,English,90.0,2006.0,8.5,29000,42.373442,Crime,Drama|Thriller


In [28]:
# Creating the Brad_Pitt dataframe
Brad_Pitt = movies[movies['actor_1_name'] == 'Brad Pitt']

# Displaying the Brad_Pitt dataframe
Brad_Pitt


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,genre_1,genre_2
101,David Fincher,362.0,127.490802,Drama|Fantasy|Romance,Brad Pitt,The Curious Case of Benjamin Button,459346,822.0,English,150.0,2008.0,7.8,23000,-22.509198,Drama,Fantasy|Romance
147,Wolfgang Petersen,220.0,133.228348,Adventure,Brad Pitt,Troy,381672,1694.0,English,175.0,2004.0,7.2,0,-41.771652,Adventure,Adventure
254,Steven Soderbergh,198.0,125.531634,Crime|Thriller,Brad Pitt,Ocean's Twelve,284852,627.0,English,110.0,2004.0,6.4,0,15.531634,Crime,Thriller
255,Doug Liman,233.0,186.336103,Action|Comedy|Crime|Romance|Thriller,Brad Pitt,Mr. & Mrs. Smith,348861,798.0,English,120.0,2005.0,6.5,0,66.336103,Action,Comedy|Crime|Romance|Thriller
382,Tony Scott,142.0,0.026871,Action|Crime|Thriller,Brad Pitt,Spy Game,121259,361.0,English,92.0,2001.0,7.0,0,-91.973129,Action,Crime|Thriller
400,Steven Soderbergh,186.0,183.405771,Crime|Thriller,Brad Pitt,Ocean's Eleven,402645,845.0,English,85.0,2001.0,7.8,0,98.405771,Crime,Thriller
470,David Ayer,406.0,85.707116,Action|Drama|War,Brad Pitt,Fury,303185,701.0,English,68.0,2014.0,7.6,82000,17.707116,Action,Drama|War
611,Jean-Jacques Annaud,76.0,37.901509,Adventure|Biography|Drama|History|War,Brad Pitt,Seven Years in Tibet,96385,119.0,English,70.0,1997.0,7.0,0,-32.098491,Adventure,Biography|Drama|History|War
683,David Fincher,315.0,37.023395,Drama,Brad Pitt,Fight Club,1347461,2968.0,English,63.0,1999.0,8.8,48000,-25.976605,Drama,Drama
792,Patrick Gilmore,98.0,26.28832,Adventure|Animation|Comedy|Drama|Family|Fantas...,Brad Pitt,Sinbad: Legend of the Seven Seas,36144,91.0,English,60.0,2003.0,6.7,880,-33.71168,Adventure,Animation|Comedy|Drama|Family|Fantasy|Romance


In [30]:
# Combining the three dataframes into one dataframe
Combined = pd.concat([Meryl_Streep, Leo_Caprio, Brad_Pitt])

# Displaying the Combined dataframe
Combined


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,genre_1,genre_2
410,Nancy Meyers,187.0,112.70347,Comedy|Drama|Romance,Meryl Streep,It's Complicated,69860,214.0,English,85.0,2009.0,6.6,0,27.70347,Comedy,Drama|Romance
1106,Curtis Hanson,42.0,46.815748,Action|Adventure|Crime|Thriller,Meryl Streep,The River Wild,32544,69.0,English,45.0,1994.0,6.3,0,1.815748,Action,Adventure|Crime|Thriller
1204,Nora Ephron,252.0,94.125426,Biography|Drama|Romance,Meryl Streep,Julie & Julia,79264,277.0,English,40.0,2009.0,7.0,13000,54.125426,Biography,Drama|Romance
1408,David Frankel,208.0,124.732962,Comedy|Drama|Romance,Meryl Streep,The Devil Wears Prada,286178,631.0,English,35.0,2006.0,6.8,0,89.732962,Comedy,Drama|Romance
1483,Robert Redford,227.0,14.99807,Drama|Thriller|War,Meryl Streep,Lions for Lambs,41170,298.0,English,35.0,2007.0,6.2,0,-20.00193,Drama,Thriller|War
1575,Sydney Pollack,66.0,87.1,Biography|Drama|Romance,Meryl Streep,Out of Africa,52339,200.0,English,31.0,1985.0,7.2,0,56.1,Biography,Drama|Romance
1618,David Frankel,234.0,63.536011,Comedy|Drama|Romance,Meryl Streep,Hope Springs,34258,178.0,English,30.0,2012.0,6.3,0,33.536011,Comedy,Drama|Romance
1674,Carl Franklin,64.0,23.20944,Drama,Meryl Streep,One True Thing,9283,112.0,English,30.0,1998.0,7.0,592,-6.79056,Drama,Drama
1752,Stephen Frears,87.0,,Biography|Comedy|Drama|Music|Romance,Meryl Streep,Florence Foster Jenkins,2167,32.0,English,29.0,2016.0,7.1,0,,Biography,Comedy|Drama|Music|Romance
1925,Stephen Daldry,174.0,41.59783,Drama|Romance,Meryl Streep,The Hours,102123,660.0,English,25.0,2002.0,7.6,0,16.59783,Drama,Romance


In [32]:
# Grouping the combined dataframe by 'actor_1_name'
combine_df = Combined.groupby('actor_1_name')

# Displaying the grouped dataframe
combine_df


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021FDA7AE1E0>

In [33]:
# Finding the mean of critic reviews and audience reviews
cri_rev = combine_df['num_critic_for_reviews'].mean().sort_values(ascending=False)
print(cri_rev)

aud_rev = combine_df['num_user_for_reviews'].mean().sort_values(ascending=False)
print(aud_rev)


actor_1_name
Leonardo DiCaprio    330.190476
Brad Pitt            231.944444
Meryl Streep         163.153846
Name: num_critic_for_reviews, dtype: float64
actor_1_name
Leonardo DiCaprio    914.476190
Brad Pitt            702.444444
Meryl Streep         257.307692
Name: num_user_for_reviews, dtype: float64
