## Importing Libraries

In [5]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import figure

import pandas_profiling

## Importing the data

In [6]:
!curl --remote-name \
     -H 'Accept: application/vnd.github.v3.raw' \
     --location https://raw.githubusercontent.com/antunes-lima/Python-Movie-Industry-Analysis/main/movies.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1320k  100 1320k    0     0  10.0M      0 --:--:-- --:--:-- --:--:-- 10.0M


In [7]:
# Load the CSV fetched by the curl cell above. curl --remote-name writes into
# the current working directory, so a relative path is used instead of the
# Colab-only absolute path '/content/movies.csv'. In Colab the default working
# directory is /content, so this resolves to the same file there while also
# working in other environments.
df = pd.read_csv('movies.csv')

# Preview 10 randomly chosen rows (unseeded, so the rows differ between runs).
df.sample(10)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
386,Easy Money,R,Comedy,1983,"August 19, 1983 (United States)",6.3,8200.0,James Signorelli,Rodney Dangerfield,Rodney Dangerfield,United States,,29309766.0,Easy Money Associates,95.0
5571,All About Steve,PG-13,Comedy,2009,"September 4, 2009 (United States)",4.8,40000.0,Phil Traill,Kim Barker,Sandra Bullock,United States,15000000.0,40105542.0,Fox 2000 Pictures,99.0
2274,Alive,R,Biography,1993,"January 15, 1993 (United States)",7.1,53000.0,Frank Marshall,Piers Paul Read,Ethan Hawke,United States,32000000.0,36733909.0,Film Andes S.A.,120.0
3671,Sexy Beast,R,Crime,2000,"July 13, 2001 (United States)",7.3,57000.0,Jonathan Glazer,Louis Mellis,Ray Winstone,United Kingdom,,10045677.0,Recorded Picture Company (RPC),89.0
3959,An American Rhapsody,PG-13,Drama,2001,"February 28, 2002 (Hungary)",6.7,3900.0,Éva Gárdos,Éva Gárdos,Nastassja Kinski,United States,,970676.0,Fireworks Pictures,106.0
3463,Stuart Little,PG,Adventure,1999,"December 17, 1999 (United States)",5.9,128000.0,Rob Minkoff,E.B. White,Michael J. Fox,Germany,133000000.0,300135367.0,Columbia Pictures,84.0
1110,Hamburger Hill,R,Action,1987,"August 28, 1987 (United States)",6.7,24000.0,John Irvin,James Carabatsos,Anthony Barrile,United States,,13839404.0,RKO Pictures,110.0
2375,The Meteor Man,PG,Action,1993,"August 6, 1993 (United States)",5.2,7800.0,Robert Townsend,Robert Townsend,Robert Townsend,United States,20000000.0,8016708.0,Metro-Goldwyn-Mayer (MGM),100.0
6904,Miss Sloane,R,Drama,2016,"December 9, 2016 (United States)",7.5,68000.0,John Madden,Jonathan Perera,Jessica Chastain,France,13000000.0,9101546.0,EuropaCorp,132.0
1039,Nobody's Fool,PG-13,Comedy,1986,"November 7, 1986 (United States)",5.5,754.0,Evelyn Purcell,Beth Henley,Rosanna Arquette,United States,,563358.0,Island Pictures,107.0


In [8]:
#!pip install pandas-profiling==2.7.1

In [9]:
#from pandas_profiling import ProfileReport

#profile_df = ProfileReport(df)

#profile_df.to_notebook_iframe()

In [10]:
# Inspect the dtype of every column in df.

df.dtypes

name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object

In [11]:
# Dimensions of df as a (rows, columns) tuple.

df.shape

(7668, 15)

In [12]:
# Number of missing entries in each column.

df.isna().sum()

name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64

In [13]:
# Share of missing entries per column, as a percentage rounded to 2 decimals.

for column_name in df.columns:
    pct_missing = round(df[column_name].isnull().mean() * 100, 2)
    print(f'{column_name} - {pct_missing}%')

name - 0.0%
rating - 1.0%
genre - 0.0%
year - 0.0%
released - 0.03%
score - 0.04%
votes - 0.04%
director - 0.0%
writer - 0.04%
star - 0.01%
country - 0.04%
budget - 28.31%
gross - 2.46%
company - 0.22%
runtime - 0.05%


In [14]:
# Number of distinct values in each column.

df.nunique()

name        7512
rating        12
genre         19
year          41
released    3414
score         72
votes        936
director    2949
writer      4535
star        2814
country       59
budget       413
gross       7472
company     2385
runtime      138
dtype: int64

In [16]:
# Some films have a `year` that differs from the year in their release date.
# Split the `released` string on whitespace and keep the third token (index 2)
# as a new `year_release` column.
# NOTE: this only yields a year when `released` looks like
# "Month Day, Year (Country)". Strings without a day, such as
# "November 2011 (Australia)", leave a non-year token like "(Australia)" here.

df['year_release'] = df['released'].astype('str').str.split(n=4, expand=True)[2]

df.sample(10)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,year_release
3275,Can't Hardly Wait,PG-13,Comedy,1998,"June 12, 1998 (United States)",6.5,47000.0,Harry Elfont,Deborah Kaplan,Jennifer Love Hewitt,United States,10000000.0,25605015.0,Columbia Pictures,100.0,1998
4867,Inside Man,R,Crime,2006,"March 24, 2006 (United States)",7.6,350000.0,Spike Lee,Russell Gewirtz,Denzel Washington,United States,45000000.0,186003591.0,Universal Pictures,129.0,2006
4014,Lantana,R,Drama,2001,"March 8, 2002 (United States)",7.3,19000.0,Ray Lawrence,Andrew Bovell,Anthony LaPaglia,Australia,,15747450.0,MBP (Germany),121.0,2002
5282,21,PG-13,Crime,2008,"March 28, 2008 (United States)",6.8,235000.0,Robert Luketic,Peter Steinfeld,Jim Sturgess,United States,35000000.0,159808370.0,Columbia Pictures,123.0,2008
5284,The Mummy: Tomb of the Dragon Emperor,PG-13,Action,2008,"August 1, 2008 (United States)",5.2,154000.0,Rob Cohen,Alfred Gough,Brendan Fraser,United States,145000000.0,403449830.0,Universal Pictures,112.0,2008
2103,Damage,R,Drama,1992,"January 22, 1993 (United States)",6.8,17000.0,Louis Malle,David Hare,Jeremy Irons,United Kingdom,,7532911.0,Nouvelles Éditions de Films (NEF),111.0,1993
4144,Spy Kids 2: Island of Lost Dreams,PG,Action,2002,"August 7, 2002 (United States)",5.2,62000.0,Robert Rodriguez,Robert Rodriguez,Alexa PenaVega,United States,38000000.0,119723358.0,Dimension Films,100.0,2002
7136,All the Money in the World,R,Biography,2017,"December 25, 2017 (United States)",6.8,79000.0,Ridley Scott,David Scarpa,Michelle Williams,United States,50000000.0,56996304.0,Imperative Entertainment,132.0,2017
3381,The Players Club,R,Comedy,1998,"April 8, 1998 (United States)",5.9,6400.0,Ice Cube,Ice Cube,LisaRaye McCoy,United States,4500000.0,23261485.0,New Line Cinema,104.0,1998
2457,Interview with the Vampire: the Vampire Chroni...,R,Drama,1994,"November 11, 1994 (United States)",7.5,301000.0,Neil Jordan,Anne Rice,Brad Pitt,United States,60000000.0,223664608.0,Geffen Pictures,123.0,1994


In [17]:
# Frequency of each year_release value, with missing values counted as well.
df['year_release'].value_counts(dropna=False)

2019           227
2015           213
2007           212
2011           211
1986           208
2003           207
2001           205
2018           205
1998           205
2008           204
2017           203
1995           202
1994           202
1993           200
2005           200
2000           200
1996           199
2013           199
2016           198
1997           198
1989           197
2006           196
1999           196
2014           196
1988           193
2002           193
1987           192
2012           192
2010           192
2009           192
1991           191
1990           191
2004           191
1992           181
1985           180
1984           156
1983           143
1982           126
1981           102
1980            79
(United         45
2020            32
NaN              6
States)          6
(Australia)      2
Name: year_release, dtype: int64

In [18]:
# Rows ordered by year_release (ascending); returns a sorted copy and leaves df unchanged.
df.sort_values('year_release')

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,year_release
5958,The Human Centipede II (Full Sequence),Not Rated,Horror,2011,November 2011 (Australia),3.8,37000.0,Tom Six,Tom Six,Laurence R. Harvey,Netherlands,,170323.0,Six Entertainment Company,91.0,(Australia)
5833,Hatchet II,R,Action,2010,November 2010 (Australia),5.5,13000.0,Adam Green,Adam Green,Danielle Harris,United States,800000.0,156190.0,ArieScope Pictures,85.0,(Australia)
463,Slayground,R,Crime,1983,February 1984 (United States),4.9,360.0,Terry Bedford,Trevor Preston,Peter Coyote,United Kingdom,,108128.0,Jennie and Company,89.0,(United
2029,Liebestraum,R,Mystery,1991,November 1991 (United States),5.9,1500.0,Mike Figgis,Mike Figgis,Kevin Anderson,United States,6900000.0,133645.0,Initial Entertainment Group (IEG),112.0,(United
1735,The Comfort of Strangers,R,Drama,1990,April 1991 (United States),6.3,5000.0,Paul Schrader,Ian McEwan,Christopher Walken,United States,,1244381.0,Erre Produzioni,107.0,(United
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,Taipei Story,Not Rated,Drama,1985,1985 (Taiwan),7.7,2500.0,Edward Yang,T'ien-wen Chu,Chin Tsai,Taiwan,,35336.0,Evergreen Film Company,119.0,
1825,Strangers in Good Company,PG,Drama,1990,1990 (Canada),7.7,995.0,Cynthia Scott,Gloria Demers,Alice Diabo,Canada,,,National Film Board of Canada (NFB),101.0,
2816,The White Balloon,Unrated,Drama,1995,1995 (Iran),7.7,6900.0,Jafar Panahi,Abbas Kiarostami,Aida Mohammadkhani,Iran,150000.0,924940.0,Farabi Cinema Foundation,85.0,
5728,Saw: The Final Chapter,R,Crime,2010,,5.6,93000.0,Kevin Greutert,Patrick Melton,Tobin Bell,,,,,,


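Trying another method: the whitespace split fails for release dates that have no day (e.g. "November 2011 (Australia)"), so the year is extracted with a regular expression instead.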

In [21]:
# Take the first run of four consecutive digits found in `released`
# and store it as the release year.
df['year_release'] = df['released'].str.extract(r'(\d{4})')[0]

df.sample(10)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,year_release
4429,Luther,PG-13,Biography,2003,"September 26, 2003 (United States)",6.6,15000.0,Eric Till,Camille Thomasson,Joseph Fiennes,Germany,30000000.0,29632684.0,Eikon Film,123.0,2003
3019,Two If by Sea,R,Comedy,1996,"January 12, 1996 (United States)",5.2,5500.0,Bill Bennett,Denis Leary,Sandra Bullock,United States,,10658278.0,Morgan Creek Entertainment,96.0,1996
6718,Joy,PG-13,Biography,2015,"December 25, 2015 (United States)",6.6,131000.0,David O. Russell,David O. Russell,Jennifer Lawrence,United States,60000000.0,101134059.0,Fox 2000 Pictures,124.0,2015
5178,Death Sentence,R,Action,2007,"August 31, 2007 (United States)",6.7,70000.0,James Wan,Ian Mackenzie Jeffers,Kevin Bacon,United States,20000000.0,16974459.0,Twentieth Century Fox,105.0,2007
7572,The Prodigy,R,Fantasy,2019,"February 8, 2019 (United States)",5.9,24000.0,Nicholas McCarthy,Jeff Buhler,Taylor Schilling,United States,6000000.0,21150265.0,Orion Pictures,92.0,2019
5632,Aliens in the Attic,PG,Adventure,2009,"July 31, 2009 (United States)",5.4,21000.0,John Schultz,Mark Burton,Ashley Tisdale,United States,45000000.0,57881056.0,Twentieth Century Fox,86.0,2009
219,Sophie's Choice,R,Drama,1982,"March 4, 1983 (United States)",7.6,44000.0,Alan J. Pakula,William Styron,Meryl Streep,United Kingdom,12000000.0,30036000.0,Incorporated Television Company (ITC),150.0,1983
6478,The Amazing Spider-Man 2,PG-13,Action,2014,"May 2, 2014 (United States)",6.6,427000.0,Marc Webb,Alex Kurtzman,Andrew Garfield,United States,200000000.0,708982323.0,Marvel Enterprises,142.0,2014
6928,London Has Fallen,R,Action,2016,"March 4, 2016 (United States)",5.9,149000.0,Babak Najafi,Creighton Rothenberger,Gerard Butler,United States,60000000.0,205754447.0,Millennium Films,99.0,2016
3567,Random Hearts,R,Drama,1999,"October 8, 1999 (United States)",5.2,20000.0,Sydney Pollack,Warren Adler,Harrison Ford,United States,64000000.0,74608570.0,Columbia Pictures,133.0,1999


In [22]:
df.year_release.value_counts(dropna=False)

2019    228
2015    213
2011    212
2007    212
1986    211
2003    207
1994    206
2001    205
2018    205
1998    205
2008    204
1991    204
2017    203
1995    203
2000    200
1993    200
2013    200
2005    200
1989    200
1996    199
1988    199
2016    198
1997    198
1990    197
2014    196
1987    196
2006    196
1999    196
2002    193
2010    193
2009    192
2012    192
2004    191
1992    184
1985    183
1984    157
1983    145
1982    128
1981    103
1980     80
2020     32
NaN       2
Name: year_release, dtype: int64