## Importing Dependencies

In [1]:
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

import pandas as pd
import seaborn as sns
import numpy as np

## Reading in CSV

In [2]:
# Read CSV

movies_df = pd.read_csv('data/movies.csv')
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,,,90.0
7664,Dream Round,,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,,,Cactus Blue Entertainment,90.0
7665,Saving Mbango,,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,United States,58750.0,,Embi Productions,
7666,It's Just Us,,Drama,2020,"October 1, 2020 (United States)",,,James Randall,James Randall,Christina Roz,United States,15000.0,,,120.0


In [3]:
movies_df.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime'],
      dtype='object')

## Identifying and amending null values

In [4]:
# Checking for missing data

for col in movies_df.columns:
    pct_missing = np.mean(movies_df[col].isnull())
    print('{} - {}%'.format(col, pct_missing))

name - 0.0%
rating - 0.010041731872717789%
genre - 0.0%
year - 0.0%
released - 0.0002608242044861763%
score - 0.0003912363067292645%
votes - 0.0003912363067292645%
director - 0.0%
writer - 0.0003912363067292645%
star - 0.00013041210224308815%
country - 0.0003912363067292645%
budget - 0.2831246739697444%
gross - 0.02464788732394366%
company - 0.002217005738132499%
runtime - 0.0005216484089723526%


In [5]:
# Checking all columns with null values in dataFrame

print("Total Number of null values in the DataFrame: " + str(movies_df.isnull().sum().sum()))
print("Breakdown of null values by column: " + str(movies_df.isnull().sum()))

Total Number of null values in the DataFrame: 2473
Breakdown of null values by column: name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64


In [6]:
# Checking null values by a single column

print("Number of null values in column 'budget': " + str(movies_df.iloc[:, 11].isnull().sum()))
print("Number of null values in column 'gross': " + str(movies_df.iloc[:, 12].isnull().sum()))

Number of null values in column 'budget': 2171
Number of null values in column 'gross': 189


In [7]:
# Dropping rows with null values

movies_df = movies_df.dropna()

# Re-checking if NANs dropped

for col in movies_df.columns:
    pct_missing = np.mean(movies_df[col].isnull())
    print('{} - {}%'.format(col, pct_missing))

name - 0.0%
rating - 0.0%
genre - 0.0%
year - 0.0%
released - 0.0%
score - 0.0%
votes - 0.0%
director - 0.0%
writer - 0.0%
star - 0.0%
country - 0.0%
budget - 0.0%
gross - 0.0%
company - 0.0%
runtime - 0.0%


In [8]:
# Modifying data type to remove decimal place

movies_df['budget'] = movies_df['budget'].astype('int64')
movies_df['gross'] = movies_df['gross'].astype('int64')
movies_df['runtime'] = movies_df['runtime'].astype('int64')
movies_df['votes'] = movies_df['votes'].astype('int64')
movies_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes         int64
director     object
writer       object
star         object
country      object
budget        int64
gross         int64
company      object
runtime       int64
dtype: object

In [9]:
# Correcting mismatch between 'year' column and 'released' column
# Splitting 'released' column after each delimiter

movies_df = movies_df.join(movies_df['released'].str.split(expand=True))
movies_df.head(1)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,...,budget,gross,company,runtime,0,1,2,3,4,5
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,...,19000000,46998772,Warner Bros.,146,June,13,1980,(United,States),


In [10]:
movies_df["Country Released"] = movies_df[3].combine(movies_df[4], lambda a, b: ((a or "") + (b or  "")) or None, None)
movies_df["Country Released"].value_counts()

(UnitedStates)     5066
(UnitedKingdom)      86
(France)             41
(Germany)            25
(Spain)              14
(Australia)          13
(Canada)             13
(Japan)              11
(SouthKorea)         10
(Italy)              10
(Denmark)             9
(Brazil)              9
States)               7
(Argentina)           7
(Israel)              6
(China)               6
(Netherlands)         6
(Russia)              6
(Greece)              5
(Norway)              4
(Singapore)           4
(HongKong)            4
(Iceland)             4
(Ireland)             4
(SouthAfrica)         4
(India)               3
(Mexico)              3
(Taiwan)              3
(Sweden)              3
(Bulgaria)            2
(Philippines)         2
(Turkey)              2
(Croatia)             2
(Belgium)             2
(Poland)              2
(Lebanon)             2
(Portugal)            2
(Thailand)            1
(Bahamas)             1
(Austria)             1
(Hungary)             1
(NewZealand)    

In [11]:
#

movies_df["Country Released"] = movies_df["Country Released"].replace(
    ['States)', 'Kingdom)','(UnitedArab','(SouthKorea)','(SouthAfrica)','(NewZealand)','(CzechRepublic)'],
    ['(UnitedStates)','(UnitedKingdom)','(United Arab Emirates)','(South Korea)','(South Africa)','(New Zealand)','(Czech Republic)'])



In [12]:
#

movies_df["Country Released"] = movies_df["Country Released"].str.extract('.*\((.*)\).*')

In [13]:
movies_df["Country Released"] = movies_df["Country Released"].replace(['UnitedStates','UnitedKingdom','HongKong'],
                                                                      ['United States','United Kingdom','Hong Kong'])
movies_df["Country Released"].value_counts()

United States           5073
United Kingdom            87
France                    41
Germany                   25
Spain                     14
Canada                    13
Australia                 13
Japan                     11
South Korea               10
Italy                     10
Brazil                     9
Denmark                    9
Argentina                  7
Russia                     6
China                      6
Netherlands                6
Israel                     6
Greece                     5
Iceland                    4
Norway                     4
Singapore                  4
South Africa               4
Hong Kong                  4
Ireland                    4
Taiwan                     3
Mexico                     3
India                      3
Sweden                     3
Portugal                   2
Belgium                    2
Lebanon                    2
Philippines                2
Bulgaria                   2
Poland                     2
Turkey        

In [14]:
#

movies_df = movies_df.drop(columns=[0,1,3,4,5,'year','released'])
movies_df = movies_df.rename(columns={2: 'Year Released'})


movies_df.columns = movies_df.columns.str.capitalize()
movies_df.head(1)

Unnamed: 0,Name,Rating,Genre,Score,Votes,Director,Writer,Star,Country,Budget,Gross,Company,Runtime,Year released,Country released
0,The Shining,R,Drama,8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772,Warner Bros.,146,1980,United States


In [15]:
# # Sorting values to see the top 5 by 'gross'
# movies_df.sort_values(by=['gross'], inplace=False, ascending=False).head(5)