In [40]:
import pandas as pd
import numpy as np

In [41]:
df = pd.read_csv('movies_final.csv', parse_dates=['release_date'], low_memory=False)

In [42]:
df.head(2)

Unnamed: 0,belongs_to_collection,budget_mio,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,cast_size,crew_size,director
0,Toy Story Collection,30.0,"Animation,Comedy,Family",862.0,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,http://image.tmdb.org/t/p/w185//rhIRbceoE9lR4v...,Pixar Animation Studios,United States of America,...,English,Released,,Toy Story,7.7,5415.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",13.0,106.0,John Lasseter
1,,65.0,"Adventure,Fantasy,Family",8844.0,en,When siblings Judy and Peter discover an encha...,17.015539,http://image.tmdb.org/t/p/w185//vzmL6fP7aPKNKP...,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,...,"English,Français",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",26.0,16.0,Joe Johnston


Now we can finally start to analyze the movie data

### The Best and Worst Movies

- Highest Revenue  --> Avatar, 10.12.2009
- Highest Budget  --> Pirates of the Caribbean: On Stranger Tides, 14.05.2011
- Highest Profit  --> Avatar, 10.12.2009
- Lowest Profit  --> The Lone Ranger, 03.07.2013
- Highest Return on Investment  --> budget more than 30 mio. USD, The Passion of the Christ, 25.02.2004
- Lowest Return on Investment --> budget more than 30 mio. USD, Foodfight!, 15.06.2012
- Highest number of Votes  --> Inception, 14.07.2010
- Highest Rating --> vote count of min 1000, The Shawshank Redemption, 23.09.1994
- Lowest Rating --> vote count of min 1000, The Boy Next Door, 23.01.2015
- Highest Popularity --> Minions, 17.06.2015

In [44]:
df.loc[df['revenue_mio']==df['revenue_mio'].max(),['title', 'release_date']]


Unnamed: 0,title,release_date
14546,Avatar,2009-12-10


In [45]:
df.loc[df['budget_mio']==df['budget_mio'].max(),['title', 'release_date']]

Unnamed: 0,title,release_date
17116,Pirates of the Caribbean: On Stranger Tides,2011-05-14


In [48]:
df['profit_mio'] = df['revenue_mio'] - df['budget_mio']

In [49]:
df.loc[df['profit_mio']==df['profit_mio'].max(),['title', 'release_date']]

Unnamed: 0,title,release_date
14546,Avatar,2009-12-10


In [51]:
df.loc[df['profit_mio']==df['profit_mio'].min(),['title', 'release_date']]

Unnamed: 0,title,release_date
21159,The Lone Ranger,2013-07-03


For the return of investment (ROI) we will set the minimum budget as a variable

In [93]:
def roi(min_budget, x):
    df1 = df.copy()
    df1 = df1.loc[df['budget_mio']>=min_budget, :]
    df1['roi'] = df1['revenue_mio']/df1['budget_mio']
    if x == 'highest':
        return df1.loc[df1['roi'] == df1['roi'].max(), ['title', 'release_date']]
    elif x== 'lowest':
        return df1.loc[df1['roi'] == df1['roi'].min(), ['title', 'release_date']]
    else:
        print('Invalid Input')

In [89]:
roi(30, 'highest')

Unnamed: 0,title,release_date
7164,The Passion of the Christ,2004-02-25


In [90]:
roi(30, 'lowest')

Unnamed: 0,title,release_date
25711,Foodfight!,2012-06-15


In [92]:
df.loc[df['vote_count'] == df['vote_count'].max(), ['title', 'release_date', 'vote_count', 'vote_average']]

Unnamed: 0,title,release_date,vote_count,vote_average
15474,Inception,2010-07-14,14075.0,8.1


In [103]:
def vote(min, x):
    df2 = df.copy()
    df2 = df2.loc[df2['vote_count'] >= min, :]
    if x == 'highest':
        return df2.loc[df2['vote_average']== df2['vote_average'].max(), ['title', 'release_date', 'vote_average']]
    elif x == 'lowest':
        return df2.loc[(df2['vote_average']== df2['vote_average'].min()), ['title', 'release_date', 'vote_average']]
    else:
        print('Ivalid Input!')

In [109]:
vote(1000, 'highest')

Unnamed: 0,title,release_date,vote_average
314,The Shawshank Redemption,1994-09-23,8.5
834,The Godfather,1972-03-14,8.5
40217,Your Name.,2016-08-26,8.5


In [110]:
vote(1000, 'lowest')

Unnamed: 0,title,release_date,vote_average
28185,The Boy Next Door,2015-01-23,4.1


In [114]:
df.loc[df['popularity'] == df['popularity'].max(), ['title', 'release_date', 'popularity']]

Unnamed: 0,title,release_date,popularity
30673,Minions,2015-06-17,547.488298


### Finding movies

- Science Fiction Action Movie with Michael Keaton (sorted from high to low Rating)
- Movies with Uma Thurman and directed by Quentin Tarantino (sorted from short to long runtime)
- Most Successful Pixar Studio Movies between 2010 and 2015 (sorted from high to low Revenue)
- Action or Thriller Movie with original language English and minimum Rating of 7.5 (most recent movies first)

In [129]:
df[df['cast'].str.contains('Michael Keaton', na=False)].sort_values(by='vote_average', ascending=False)[['title', 'vote_average', 'release_date']]

Unnamed: 0,title,vote_average,release_date
33014,Spotlight,7.8,2015-11-06
15342,Toy Story 3,7.6,2010-06-16
23541,Birdman,7.4,2014-08-27
1646,Jackie Brown,7.3,1997-12-24
492,Much Ado About Nothing,7.2,1993-05-07
43796,Touch and Go,7.1,1986-08-22
2064,Beetlejuice,7.1,1988-02-29
585,Batman,7.0,1989-06-23
41619,The Founder,7.0,2016-11-24
24501,Hawaiian Vacation,6.9,2011-06-16


In [130]:
df[(df['cast'].str.contains('Uma Thurman', na=False)) & (df['director']=='Quentin Tarantino')].sort_values(by='runtime', ascending=True)[['title', 'runtime', 'release_date']]

Unnamed: 0,title,runtime,release_date
6724,Kill Bill: Vol. 1,111.0,2003-10-10
7270,Kill Bill: Vol. 2,136.0,2004-04-16
292,Pulp Fiction,154.0,1994-09-10


In [137]:
df[(df['production_companies']=='Pixar Animation Studios') & (df['release_date']>='01-01-2010') & (df['release_date']<='12-31-2015')].sort_values(by='revenue_mio', ascending=False)[['title', 'revenue_mio', 'release_date']]

Unnamed: 0,title,revenue_mio,release_date
21908,The Blue Umbrella,,2013-02-12
22713,La luna,,2011-01-01
24501,Hawaiian Vacation,,2011-06-16
24503,Small Fry,,2011-11-23
25778,The Legend of Mor'du,,2012-01-01
25779,Toy Story That Time Forgot,,2014-12-02
32187,Lava,,2015-06-21
35000,Sanjay's Super Team,,2015-11-25
40855,Air Mater,,2011-08-04


In [138]:
df.head(2)

Unnamed: 0,belongs_to_collection,budget_mio,genres,id,original_language,overview,popularity,poster_path,production_companies,production_countries,...,status,tagline,title,vote_average,vote_count,cast,cast_size,crew_size,director,profit_mio
0,Toy Story Collection,30.0,"Animation,Comedy,Family",862.0,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,http://image.tmdb.org/t/p/w185//rhIRbceoE9lR4v...,Pixar Animation Studios,United States of America,...,Released,,Toy Story,7.7,5415.0,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",13.0,106.0,John Lasseter,343.554033
1,,65.0,"Adventure,Fantasy,Family",8844.0,en,When siblings Judy and Peter discover an encha...,17.015539,http://image.tmdb.org/t/p/w185//vzmL6fP7aPKNKP...,"TriStar Pictures,Teitler Film,Interscope Commu...",United States of America,...,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",26.0,16.0,Joe Johnston,197.797249


In [141]:
df[(df['genres'].str.contains('Action')|df['genres'].str.contains('Thriller')) & (df['original_language']=='en') & (df['vote_average']>=7.5)].sort_values(by='release_date', ascending=False)[['title', 'vote_average', 'release_date']]

Unnamed: 0,title,vote_average,release_date
45219,Descendants 2,7.5,2017-07-21
44642,Dunkirk,7.5,2017-07-19
43282,The Book of Henry,7.6,2017-06-16
26545,Guardians of the Galaxy Vol. 2,7.6,2017-04-19
44155,Revengeance,8.0,2017-04-05
...,...,...,...
8334,Scarface,7.5,1932-04-09
8321,"Steamboat Bill, Jr.",7.9,1928-02-14
2905,The General,8.0,1926-12-31
34987,Bardelys the Magnificent,8.0,1926-09-30


### Franchise movies

Franchise means that movies with 'belongs_to_collection'

- mean revenue
- median Return on Investment
- mean budget raised
- mean popularity
- mean rating
- total number of movies
- total & mean budget
- total & mean revenue

In [None]:
df['roi'] = df['revenue_mio']/df['budget_mio']

In [197]:
def franchise(col,agg):
    try:
        return df.groupby('belongs_to_collection')[col].agg(agg).nlargest(5)
    except TypeError:
        return df.groupby('belongs_to_collection')[col].agg(agg).nlargest(5, columns=agg[0])

In [199]:
franchise('revenue_mio', 'mean')

belongs_to_collection
Avatar Collection          2787.965087
The Avengers Collection    1462.480802
Frozen Collection          1274.219009
Finding Nemo Collection     984.453213
The Hobbit Collection       978.507785
Name: revenue_mio, dtype: float64

In [200]:
franchise('roi', 'median')

belongs_to_collection
Blair Witch Collection                       2071.183966
Super Size Me Collection                      439.616585
Open Water Collection                         420.522723
Bambi Collection                              311.709965
The Hills Have Eyes (Original) Collection     108.695652
Name: roi, dtype: float64

In [201]:
franchise('budget_mio', 'sum')

belongs_to_collection
James Bond Collection                  1539.65
Harry Potter Collection                1280.00
Pirates of the Caribbean Collection    1250.00
The Fast and the Furious Collection    1009.00
X-Men Collection                        983.00
Name: budget_mio, dtype: float64

In [202]:
franchise('popularity', 'mean')

belongs_to_collection
Wonder Woman Collection               294.337037
Deadpool Collection                   187.860492
Guardians of the Galaxy Collection    119.311296
John Wick Collection                  116.558939
Despicable Me Collection              106.715961
Name: popularity, dtype: float64

In [203]:
franchise('vote_average', 'mean')

belongs_to_collection
Argo Collection                        9.3
Bloodfight                             9.0
Dreileben                              9.0
Kenji Misumi's Trilogy of the Sword    9.0
Алиса в стране чудес (Коллекция)       8.7
Name: vote_average, dtype: float64

In [204]:
franchise('id', 'count')

belongs_to_collection
The Bowery Boys                  29
Totò Collection                  27
James Bond Collection            26
Zatôichi: The Blind Swordsman    26
The Carry On Collection          25
Name: id, dtype: int64

In [198]:
franchise('budget_mio', ['mean','sum'])

Unnamed: 0_level_0,mean,sum
belongs_to_collection,Unnamed: 1_level_1,Unnamed: 2_level_1
Tangled Collection,260.0,260.0
Pirates of the Caribbean Collection,250.0,1250.0
The Avengers Collection,250.0,500.0
The Hobbit Collection,250.0,750.0
Man of Steel Collection,237.5,475.0


In [205]:
franchise('revenue_mio', ['mean','sum'])

Unnamed: 0_level_0,mean,sum
belongs_to_collection,Unnamed: 1_level_1,Unnamed: 2_level_1
Avatar Collection,2787.965087,2787.965087
The Avengers Collection,1462.480802,2924.961604
Frozen Collection,1274.219009,1274.219009
Finding Nemo Collection,984.453213,1968.906425
The Hobbit Collection,978.507785,2935.523356


### Director analysis

- total number of movies
- total revenue
- mean rating

In [207]:
df.groupby('director')['id'].count().nlargest(5)

director
John Ford           63
Michael Curtiz      61
Alfred Hitchcock    52
Werner Herzog       52
Georges Méliès      51
Name: id, dtype: int64

In [208]:
df.groupby('director')['revenue_mio'].sum().nlargest(5)

director
Steven Spielberg    9227.170503
Peter Jackson       6528.244659
Michael Bay         6437.466781
James Cameron       5900.610310
David Yates         5334.563196
Name: revenue_mio, dtype: float64

In [209]:
df.groupby('director')['vote_average'].mean().nlargest(5)

director
A.W. Vidmer                       10.0
Amy Schatz                        10.0
Ana Poliak                        10.0
Andrew BowserJoseph M. Petrick    10.0
Andrew Napier                     10.0
Name: vote_average, dtype: float64