### Pandas - Reading in the data

In [1]:
%matplotlib inline
import pandas as pd

In [2]:
pd.__version__

'0.17.0'

In [3]:
#custom style HTML output

from IPython.core.display import HTML

# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running the notebook elsewhere.
csspath1 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\style-table.css'
csspath2 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\style-notebook.css'

# Read each stylesheet inside a context manager so the file handle is closed
# as soon as the content has been read, instead of being left open.
css = ''
for csspath in (csspath1, csspath2):
    with open(csspath) as stylesheet:
        css += stylesheet.read()

# Last expression of the cell: inject the combined CSS into the notebook page.
HTML('<style>{}</style>'.format(css))

In [4]:
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running the notebook elsewhere.
path1 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\data\titles.csv'
path2 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\data\cast.csv'

# DataFrame.from_csv is deprecated and no longer exists in pandas >= 1.0;
# pd.read_csv is the supported reader. index_col=None keeps the default
# integer index instead of promoting the first CSV column to the index.
titles = pd.read_csv(path1, index_col=None)
cast = pd.read_csv(path2, index_col=None)

In [5]:
cast.head(3) #head() is a method available on both DataFrame and Series objects

Unnamed: 0,title,year,name,type,character,n
0,Suuri illusioni,1985,Homo $,actor,Guests,22.0
1,Gangsta Rap: The Glockumentary,2007,Too $hort,actor,Himself,
2,Menace II Society,1993,Too $hort,actor,Lew-Loc,27.0


### Pandas - Basics

In [6]:
len(titles)

211153

In [7]:
titles.head(3)

Unnamed: 0,title,year
0,Insane,2010
1,Dreamer,2011
2,Down in San Diego,1941


In [8]:
titles.tail(3)

Unnamed: 0,title,year
211150,Mater dolorosa,1917
211151,Xi ying chun,1949
211152,The Last Great Ride,1999


In [9]:
cast.head(3)  # n is the order of billing in the movie's credits

Unnamed: 0,title,year,name,type,character,n
0,Suuri illusioni,1985,Homo $,actor,Guests,22.0
1,Gangsta Rap: The Glockumentary,2007,Too $hort,actor,Himself,
2,Menace II Society,1993,Too $hort,actor,Lew-Loc,27.0


### Pandas - Extracting series from data frames

In [10]:
h = titles.head()

In [11]:
h['title'] #lookup one column and return as a series

0                     Insane
1                    Dreamer
2          Down in San Diego
3        Da Svante forsvandt
4    The Dark Eyes of London
Name: title, dtype: object

In [12]:
h['year'] #lookup one column and return as a series

0    2010
1    2011
2    1941
3    1975
4    1939
Name: year, dtype: int64

In [13]:
h['year'] + 1000

0    3010
1    3011
2    2941
3    2975
4    2939
Name: year, dtype: int64

In [14]:
h['year'] // 10 #// does division and chops the fraction

0    201
1    201
2    194
3    197
4    193
Name: year, dtype: int64

In [15]:
h['year'] // 10 * 10

0    2010
1    2010
2    1940
3    1970
4    1930
Name: year, dtype: int64

In [16]:
h['year'] > 1985

0     True
1     True
2    False
3    False
4    False
Name: year, dtype: bool

In [17]:
h.year > 1985

0     True
1     True
2    False
3    False
4    False
Name: year, dtype: bool

### Pandas - Filtering and sorting

In [18]:
h[h.year > 1985]

Unnamed: 0,title,year
0,Insane,2010
1,Dreamer,2011


In [19]:
h[(h.year < 1990) & (h.year >= 1950)] # and

Unnamed: 0,title,year
3,Da Svante forsvandt,1975


In [20]:
h[(h.year < 1970) | (h.year >= 1980)] # or

Unnamed: 0,title,year
0,Insane,2010
1,Dreamer,2011
2,Down in San Diego,1941
4,The Dark Eyes of London,1939


In [21]:
h[(h.year < 1970) | (h.year >= 1980)].sort_values('year')

Unnamed: 0,title,year
4,The Dark Eyes of London,1939
2,Down in San Diego,1941
0,Insane,2010
1,Dreamer,2011


In [22]:
h[h.year >= 1970].sort_values(['year','title']).head(3)

Unnamed: 0,title,year
3,Da Svante forsvandt,1975
0,Insane,2010
1,Dreamer,2011


In [23]:
h.year.sort_values() #sort_values works for series too

4    1939
2    1941
3    1975
0    2010
1    2011
Name: year, dtype: int64

In [24]:
h[h.year >= 1970].notnull()

Unnamed: 0,title,year
0,True,True
1,True,True
3,True,True


In [25]:
h.year.notnull() #notnull() works for series as well

0    True
1    True
2    True
3    True
4    True
Name: year, dtype: bool

In [26]:
# Demonstration: an expression whose result is not assigned has no lasting effect.
x=10
x+20  #throwing the result away: the sum is computed but never stored, so assign results you want to keep
x

10

In [27]:
# Take the first rows of cast, then narrow them down to the unranked roles.
h = cast.head()
h = h[h.n.isnull()]  #keep only the rows whose n value is NaN
h

Unnamed: 0,title,year,name,type,character,n
1,Gangsta Rap: The Glockumentary,2007,Too $hort,actor,Himself,
4,Stop Pepper Palmer,2014,Too $hort,actor,Himself,


### How many movies are listed in the titles dataframe?

In [28]:
# len() of a boolean mask is always the total number of rows, whatever the
# mask contains, so a null check inside len() filters nothing. The number of
# listed movies is simply the number of rows in the dataframe.
len(titles)

211153

### What are the earliest two films listed in the titles dataframe?

In [29]:
titles.sort_values('year').head(2)

Unnamed: 0,title,year
113435,Miss Jerry,1894
65908,Reproduction of the Corbett and Fitzsimmons Fight,1897


### How many movies have the title "Hamlet"?

In [30]:
len(titles[titles.title == 'Hamlet'])

19

### How many movies are titled "North by Northwest"?

In [31]:
len(titles[titles.title == 'North by Northwest'])

1

### When was the first movie titled "Hamlet" made?

In [32]:
titles[titles.title == 'Hamlet'].sort_values('year').head(1)

Unnamed: 0,title,year
209812,Hamlet,1910


### List all of the "Treasure Island" movies from earliest to most recent.

In [33]:
titles[titles.title == 'Treasure Island'].sort_values('year')

Unnamed: 0,title,year
205346,Treasure Island,1918
102395,Treasure Island,1920
53039,Treasure Island,1934
210347,Treasure Island,1950
105787,Treasure Island,1972
58841,Treasure Island,1973
153850,Treasure Island,1985
40064,Treasure Island,1999


### How many movies were made in the year 1950?

In [34]:
len(titles[titles.year == 1950])

1033

### How many movies were made in the year 1960?

In [35]:
len(titles[titles.year == 1960])

1426

### How many movies were made from 1950 through 1959?

In [36]:
len(titles[(titles.year >= 1950) & (titles.year < 1960)])

12002

### In what years has a movie titled "Batman" been released?

In [37]:
titles.year[titles.title == 'Batman']

165226    1989
184020    1943
Name: year, dtype: int64

### How many roles were there in the movie "Inception"?

In [38]:
cast.character[cast.title == 'Inception'].head()

84209             LAX Passenger
128175    Fischer's Jet Captain
158851                 Browning
171039         Mombasan Gambler
225396           Bridge Sub Con
Name: character, dtype: object

In [39]:
len(cast.character[cast.title == 'Inception'])

72

### How many roles in the movie "Inception" are NOT ranked by an "n" value?

In [40]:
cast.head(3)

Unnamed: 0,title,year,name,type,character,n
0,Suuri illusioni,1985,Homo $,actor,Guests,22.0
1,Gangsta Rap: The Glockumentary,2007,Too $hort,actor,Himself,
2,Menace II Society,1993,Too $hort,actor,Lew-Loc,27.0


In [41]:
cast.character[(cast.title == 'Inception') & (cast.n.isnull())].head(3)

84209        LAX Passenger
171039    Mombasan Gambler
326847       LAX Passenger
Name: character, dtype: object

In [42]:
len(cast.character[(cast.title == 'Inception') & (cast.n.isnull())])

21

### But how many roles in the movie "Inception" did receive an "n" value?

In [43]:
len(cast.character[(cast.title == 'Inception')]) #total roles in Inception

72

In [44]:
len(cast.character[(cast.title == 'Inception') & ( cast.n.isnull() )])

21

In [45]:
len(cast.character[(cast.title == 'Inception') & ( cast.n.notnull() )]) #by using series from dataframe

51

In [46]:
len(cast[(cast.title == 'Inception') & (cast.n.notnull())]) #by dataframe

51

### Display the cast of "North by Northwest" in their correct "n"-value order, ignoring roles that did not earn a numeric "n" value.

In [47]:
# Row count of the ranked "North by Northwest" cast; sorting does not change
# the count, so no sort is needed here.
len(cast[(cast.title == 'North by Northwest') & ( cast.n.notnull() )])

16

In [48]:
cast[(cast.title == 'North by Northwest') & ( cast.n.notnull() )].sort_values('n').name

763075               Cary Grant
3041432         Eva Marie Saint
1275610             James Mason
2737972     Jessie Royce Landis
310982           Leo G. Carroll
2648251    Josephine Hutchinson
1484903             Philip Ober
1115610           Martin Landau
2138426           Adam Williams
1585678            Edward Platt
582768        Robert Ellenstein
2006005            Les Tremayne
405616          Philip Coolidge
1321396           Patrick McVey
178342             Edward Binns
1212031               Ken Lynch
Name: name, dtype: object

### Display the entire cast, in "n"-order, of the 1972 film "Sleuth".

In [49]:
cast[(cast.title == "Sleuth") & (cast.year == 1972) & (cast.n.notnull())].sort_values('n') #keep only roles that have an n value (rows with NaN n are dropped)

Unnamed: 0,title,year,name,type,character,n
1493547,Sleuth,1972,Laurence Olivier,actor,Andrew Wyke,1
284482,Sleuth,1972,Michael Caine,actor,Milo Tindle,2
326198,Sleuth,1972,Alec Cawthorne,actor,Inspector Doppler,3
1282808,Sleuth,1972,John (II) Matthews,actor,Detective Sergeant Tarrant,4
2374004,Sleuth,1972,Eve (III) Channing,actress,Marguerite Wyke,5
1268350,Sleuth,1972,Teddy Martin,actor,Police Constable Higgs,6


### Now display the entire cast, in "n"-order, of the 2007 version of "Sleuth".

In [50]:
cast[(cast.title == "Sleuth") & (cast.year == 2007) & (cast.n.notnull())].sort_values('n') #keep only roles that have an n value (rows with NaN n are dropped)

Unnamed: 0,title,year,name,type,character,n
284483,Sleuth,2007,Michael Caine,actor,Andrew,1
1131876,Sleuth,2007,Jude Law,actor,Milo,2
1580975,Sleuth,2007,Harold Pinter,actor,Man on T.V.,3


### How many roles were credited in the silent 1921 version of Hamlet?

In [51]:
len(cast[(cast.title == "Hamlet") & (cast.year == 1921)])

9

### How many roles were credited in Branagh’s 1996 Hamlet?

In [52]:
len(cast[(cast.title == "Hamlet") & (cast.year == 1996)])

55

### How many "Hamlet" roles have been listed in all film credits through history?

In [53]:
len(cast[(cast.title == "Hamlet")])

299

### How many people have played an "Ophelia"?

In [54]:
cast[(cast.character == "Ophelia") & (cast.n.notnull())].sort_values('year') #curiosity lookup, sort by year and omit NaNs

Unnamed: 0,title,year,name,type,character,n
3048189,Hamlet,1911,Emilie Sannom,actress,Ophelia,4
2489287,Hamlet,1913,Gertrude Elliot,actress,Ophelia,21
2800756,Amleto,1917,Helena Makowska,actress,Ophelia,2
2878673,Uncle Tom's Cabin,1918,Mrs. Priestly Morrison,actress,Ophelia,11
2659548,Hamlet,1921,Lilly Jacobson,actress,Ophelia,8
2954787,Big Dan,1923,Mattie Peters,actress,Ophelia,11
2836328,"The Long, Long Trail",1929,Kathryn McGuire,actress,Ophelia,3
2267675,Khoon Ka Khoon,1935,Naseem Banu,actress,Ophelia,2
2536436,Immortal Gentleman,1935,Rosalinde Fuller,actress,Ophelia,2
2281254,Virginia,1941,Louise Beavers,actress,Ophelia,10


In [55]:
cast[(cast.character == "Ophelia") & (cast.n.notnull())].sort_values('year') #curiosity lookup, sort by year and omit NaNs

Unnamed: 0,title,year,name,type,character,n
3048189,Hamlet,1911,Emilie Sannom,actress,Ophelia,4
2489287,Hamlet,1913,Gertrude Elliot,actress,Ophelia,21
2800756,Amleto,1917,Helena Makowska,actress,Ophelia,2
2878673,Uncle Tom's Cabin,1918,Mrs. Priestly Morrison,actress,Ophelia,11
2659548,Hamlet,1921,Lilly Jacobson,actress,Ophelia,8
2954787,Big Dan,1923,Mattie Peters,actress,Ophelia,11
2836328,"The Long, Long Trail",1929,Kathryn McGuire,actress,Ophelia,3
2267675,Khoon Ka Khoon,1935,Naseem Banu,actress,Ophelia,2
2536436,Immortal Gentleman,1935,Rosalinde Fuller,actress,Ophelia,2
2281254,Virginia,1941,Louise Beavers,actress,Ophelia,10


In [56]:
len(cast[(cast.character == "Ophelia")])

95

### How many people have played a role called "The Dude"?

In [57]:
cast.head(3)

Unnamed: 0,title,year,name,type,character,n
0,Suuri illusioni,1985,Homo $,actor,Guests,22.0
1,Gangsta Rap: The Glockumentary,2007,Too $hort,actor,Himself,
2,Menace II Society,1993,Too $hort,actor,Lew-Loc,27.0


In [58]:
cast[cast.character == "The Dude"].head(3)

Unnamed: 0,title,year,name,type,character,n
236061,The Big Lebowski,1998,Jeff Bridges,actor,The Dude,1.0
295611,Terms & Conditions,2015,Jordan Cann,actor,The Dude,9.0
435996,Stranger,2000,Scott Crowell,actor,The Dude,


In [59]:
cast[(cast.character == "The Dude") & (cast.n.notnull())].sort_values(['year','n'])

Unnamed: 0,title,year,name,type,character,n
1349942,Wild Oats Lane,1926,Jerry Miley,actor,The Dude,6
534415,Sweepstakes,1931,Mike Donlin,actor,The Dude,12
1562277,Oklahoma Badlands,1948,House Peters Jr.,actor,The Dude,13
2040820,Zachariah,1971,Dick Van Patten,actor,The Dude,9
1213686,The Winds of Autumn,1976,Steve Lyons,actor,The Dude,16
1609644,Hacks,1997,Jason Priestley,actor,The Dude,11
236061,The Big Lebowski,1998,Jeff Bridges,actor,The Dude,1
956144,Jay and Silent Bob Strike Back,2001,Matthew (XIX) James,actor,The Dude,37
998755,Explicit Ills,2008,Christopher Kadish,actor,The Dude,27
1196592,American Idiots,2013,Jason Loughridge,actor,The Dude,10


In [1]:
# NOTE(review): the recorded output of this cell is a NameError and its
# execution count is In [1], so it was apparently run before `cast` was
# loaded. Re-run it after the data-loading cell to get the actual count.
len(cast[cast.character == "The Dude"])

NameError: name 'cast' is not defined

### How many people have played a role called "The Stranger"?

In [60]:
len(cast[cast.character == "The Stranger"])

188

### How many roles has Sidney Poitier played throughout his career?

In [61]:
# Number of credited roles for Sidney Poitier; sorting does not change the
# count, so no sort is needed here.
len(cast[cast.name == "Sidney Poitier"])

43

In [62]:
cast[cast.name == "Sidney Poitier"].sort_values('year').head(3)

Unnamed: 0,title,year,name,type,character,n
1589897,Sepia Cinderella,1947,Sidney Poitier,actor,Night Club Extra,
1589892,No Way Out,1950,Sidney Poitier,actor,Dr. Luther Brooks,4.0
1589881,"Cry, the Beloved Country",1951,Sidney Poitier,actor,Reverend Msimangu,3.0


### How many roles has Judi Dench played?

In [63]:
len(cast[cast.name == "Judi Dench"])

51

### List the supporting roles (having n=2) played by Cary Grant in the 1940s, in order by year.

In [64]:
# Supporting roles (n == 2) played by Cary Grant between 1940 and 1949.
# The rows are sorted by year because the question asks for that order;
# boolean filtering alone keeps whatever order the rows have in `cast`.
cast[(cast.name == "Cary Grant") &
     (cast.n == 2) &
     (cast.year >= 1940) &
     (cast.year <= 1949)].sort_values('year')

Unnamed: 0,title,year,name,type,character,n
763072,My Favorite Wife,1940,Cary Grant,actor,Nick,2
763082,Penny Serenade,1941,Cary Grant,actor,Roger Adams,2


In [65]:
# Character names of Cary Grant's supporting roles (n == 2) in the 1940s.
# Sort by year before selecting the column so the names come out in
# chronological order, as the question asks.
cast[(cast.name == "Cary Grant") &
     (cast.n == 2) &
     (cast.year >= 1940) &
     (cast.year <= 1949)].sort_values('year').character

763072           Nick
763082    Roger Adams
Name: character, dtype: object

### List the leading roles that Cary Grant played in the 1940s in order by year.

In [66]:
cast[(cast.name == "Cary Grant") & 
     (cast.n == 1) & 
     (cast.year >= 1940) & 
     (cast.year <= 1949)].sort_values('year')

Unnamed: 0,title,year,name,type,character,n
763097,The Howards of Virginia,1940,Cary Grant,actor,Matt Howard,1
763054,His Girl Friday,1940,Cary Grant,actor,Walter Burns,1
763099,The Philadelphia Story,1940,Cary Grant,actor,C. K. Dexter Haven,1
763087,Suspicion,1941,Cary Grant,actor,Johnnie,1
763101,The Talk of the Town,1942,Cary Grant,actor,Leopold Dilg,1
763078,Once Upon a Honeymoon,1942,Cary Grant,actor,Patrick 'Pat' O'Toole,1
763045,Destination Tokyo,1943,Cary Grant,actor,Capt. Cassidy,1
763070,Mr. Lucky,1943,Cary Grant,actor,Joe Adams,1
763071,Mr. Lucky,1943,Cary Grant,actor,Joe Bascopolous,1
763079,Once Upon a Time,1944,Cary Grant,actor,Jerry Flynn,1


In [67]:
cast[(cast.name == "Cary Grant") & (cast.n == 1) & (cast.year >= 1940) & (cast.year <= 1949)].sort_values('year').character

763097              Matt Howard
763054             Walter Burns
763099       C. K. Dexter Haven
763087                  Johnnie
763101             Leopold Dilg
763078    Patrick 'Pat' O'Toole
763045            Capt. Cassidy
763070                Joe Adams
763071          Joe Bascopolous
763079              Jerry Flynn
763037        Mortimer Brewster
763074               Ernie Mott
763073              Cole Porter
763076                   Devlin
763093                     Dick
763094                   Dudley
763069            Jim Blandings
763049        Dr. Madison Brown
763058      Capt. Henri Rochard
Name: character, dtype: object

### How many roles were available for actors in the 1950s?

In [68]:
len(cast[(cast.type == "actor") & (cast.year >= 1950) & (cast.year <= 1959)])

146895

### How many roles were available for actresses in the 1950s?

In [69]:
len(cast[(cast.type == "actress") & (cast.year >= 1950) & (cast.year <= 1959)])

53623

### How many leading roles (n=1) were available from the beginning of film history through 1980?

In [70]:
len(cast[(cast.n == 1) & (cast.year <= 1980)])

61149

### How many non-leading roles were available from the beginning of film history through 1980?

In [71]:
len(cast[(cast.n != 1) & (cast.year <= 1980)])

1041286

### How many roles through 1980 were minor enough that they did not warrant a numeric "n" rank?

In [72]:
len(cast[(cast.n.isnull()) & (cast.year <= 1980)])

411708

### Extras

In [73]:
titles.head(3)

Unnamed: 0,title,year
0,Insane,2010
1,Dreamer,2011
2,Down in San Diego,1941


In [74]:
cast.head(3)

Unnamed: 0,title,year,name,type,character,n
0,Suuri illusioni,1985,Homo $,actor,Guests,22.0
1,Gangsta Rap: The Glockumentary,2007,Too $hort,actor,Himself,
2,Menace II Society,1993,Too $hort,actor,Lew-Loc,27.0


In [75]:
cast[cast.name == "Hugh Laurie"].sort_values('year').tail(3)

Unnamed: 0,title,year,name,type,character,n
1129696,Arthur Christmas,2011,Hugh Laurie,actor,Steve,2.0
1129704,Mr. Pip,2012,Hugh Laurie,actor,Mr. Watts,1.0
1129719,Tomorrowland,2015,Hugh Laurie,actor,David Nix,


In [76]:
performer = "Bruce Willis"

def lookup_performer_movies(performer, credits=None):
    """Return the credits of ``performer``, excluding self-named roles.

    Parameters
    ----------
    performer : str
        Value to match against the ``name`` column.
    credits : pandas.DataFrame, optional
        Frame with at least ``name``, ``character``, ``year`` and ``n``
        columns. When omitted, the notebook-level ``cast`` dataframe is used.

    Returns
    -------
    pandas.DataFrame
        Rows where ``name`` equals ``performer`` and ``character`` differs
        from ``performer``, sorted by ``year`` and then by ``n``.
    """
    if credits is None:
        # Fall back to the dataframe loaded at the top of the notebook.
        credits = cast
    is_performer = credits['name'] == performer
    # Drop rows where the character is listed under the performer's own name.
    not_self_named = credits['character'] != performer
    # The result must be returned: without it the function yields None and the
    # calling cell displays nothing.
    return credits[is_performer & not_self_named].sort_values(['year', 'n'])
    
lookup_performer_movies(performer)

In [77]:
cast[(cast.character == "M") & (cast.type == "actress") & (cast.n.notnull()) & (cast.n < 10)].sort_values('year')

Unnamed: 0,title,year,name,type,character,n
2900279,Flaming Ears,1992,Margarete Neumann,actress,M,4
2889955,Le journal de Lady M,1993,Myriam M?zi?res,actress,M,1
2449996,GoldenEye,1995,Judi Dench,actress,M,6
2450032,Tomorrow Never Dies,1997,Judi Dench,actress,M,9
2503357,Derakhte Golabi,1998,Golshifteh Farahani,actress,M,2
2450031,The World Is Not Enough,1999,Judi Dench,actress,M,6
2449993,Die Another Day,2002,Judi Dench,actress,M,6
2817628,Greener Mountains,2005,Nan Martin,actress,M,5
2449990,Casino Royale,2006,Judi Dench,actress,M,4
3081574,Second Moon,2006,Jennifer Shin,actress,M,2


In [78]:
cast[(cast.name == "Judi Dench") & (cast.character == "M")].sort_values('year')

Unnamed: 0,title,year,name,type,character,n
2449996,GoldenEye,1995,Judi Dench,actress,M,6
2450032,Tomorrow Never Dies,1997,Judi Dench,actress,M,9
2450031,The World Is Not Enough,1999,Judi Dench,actress,M,6
2449993,Die Another Day,2002,Judi Dench,actress,M,6
2449990,Casino Royale,2006,Judi Dench,actress,M,4
2450016,Quantum of Solace,2008,Judi Dench,actress,M,4
2450021,Skyfall,2012,Judi Dench,actress,M,2
