In [5]:
%matplotlib inline
import pandas as pd

In [6]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [7]:
%%time
cast = pd.DataFrame.from_csv('data/cast.csv', index_col=None)

CPU times: user 7.91 s, sys: 533 ms, total: 8.44 s
Wall time: 9.15 s


In [8]:
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,The Core,2003,Alejandro Abellan,actor,U.S.S. Soldier,
1,Il momento di uccidere,1968,Remo De Angelis,actor,Dago,9.0
2,Across the Divide,1921,Thomas Delmar,actor,Dago,4.0
3,Revan,2012,Diego James,actor,Dago,
4,Un homme marche dans la ville,1950,Fabien Loris,actor,Dago,12.0


In [9]:
%%time
release_dates = pd.read_csv('data/release_dates.csv', index_col=None,
                            parse_dates=['date'], infer_datetime_format=True)

CPU times: user 1.89 s, sys: 72.4 ms, total: 1.96 s
Wall time: 2.04 s


In [10]:
release_dates.head()

Unnamed: 0,title,year,country,date
0,3orthographies,2013,Canada,2013-05-20
1,11 Minutes (I),2014,USA,2014-05-27
2,A Gypsy Girl's Love,1908,USA,1908-08-08
3,A Woman Scorned,1999,USA,1999-08-05
4,A Woman Scorned,1999,Germany,2000-04-09


In [54]:
titles = cast[['title', 'year']].drop_duplicates().reset_index(drop=True)
titles.head()

Unnamed: 0,title,year
0,The Core,2003
1,Il momento di uccidere,1968
2,Across the Divide,1921
3,Revan,2012
4,Un homme marche dans la ville,1950


### Titles

In [139]:
# 1. What movies were made through 1893?

titles[titles.year <= 1893]

Unnamed: 0,title,year
276922,Roundhay Garden Scene,1888
381723,Je vous aime,1891
385597,Le prince de Galles,1892
386265,Accordion Player,1888
461820,Sallie Gardner at a Gallop,1878


In [137]:
# 1. What movies have titles that fall between Star Trek and Star Wars in the alphabet?

titles[(titles.title > 'Star Trek') & (titles.title < 'Star Wars')].shape

(46, 2)

In [46]:
# 1. What are the 20 most common movie titles?

titles.title.value_counts().head(20)

Hamlet                  19
The Kiss                19
Vengeance               16
Popular Science (II)    15
Popular Science (I)     15
Macbeth                 14
Easy Money              14
Carmen                  14
David                   14
Deception               14
Resurrection            14
The Best Man            14
The Stranger            13
Secrets                 13
The Tell-Tale Heart     13
Blind Date (I)          13
The Kid                 13
Salome                  13
The Mirror              13
Amok                    13
dtype: int64

In [138]:
# 2. Use an index and .loc[] to find the movies whose titles fall between Star Trek
#    and Star Wars in the alphabet.

t = titles.copy()
t = t.set_index('title').sort_index()
t.loc['Star Trek':'Star Wars'].shape

(48, 1)

In [128]:
# 2. Use an index and .loc[] to retrieve the names of the movies made through 1893.

titles.set_index('year').sort_index().loc[1800:1893]

Unnamed: 0_level_0,title
year,Unnamed: 1_level_1
1878,Sallie Gardner at a Gallop
1888,Accordion Player
1888,Roundhay Garden Scene
1891,Je vous aime
1892,Le prince de Galles


In [82]:
# 5. What are the 15 longest movie titles ever?

pd.set_option('max_colwidth', 300)

t = titles.copy()
t['len'] = t.title.str.len()
t = t.sort('len', ascending=False)
t.head(15)

Unnamed: 0,title,year,len
381602,"Direktør Ivar Knudsen til prøvesejlads om bord paa et dieselskib, omgivet af repræsentanter for handel og industri, hvoriblandt admiral A. de Richelieu, etatsraad Martin Dessau, gehejmeetatsraad Julius Larsen, baron Blixen-Finecke og flere",1913,239
283944,"Le voyage du Président Félix Faure en Russie (Août 1897): L'impératrice descend l'escalier rouge du palais de Saint-Pétersbourg, accompagnée des dames d'honneur et suivie de tous les princes et grands ducs de la cour impériale",1897,226
135788,"H.R.H. The Prince of Wales Decorating the Monument of Champlain and Receiving Addresses of Welcome from the Mayor of Quebec, the Governor General of Canada and Vice President Fairbanks, Representative of the United States",1908,221
377465,"Barney Ross of Chicago, in Defense of His World's Championship Welterweight Title Against Henry Armstrong of California, Featherweight Champion, Held at the Madison Square Garden Bowl, Long Island City, New York",1938,211
350366,"Night of the Day of the Dawn of the Son of the Bride of the Return of the Revenge of the Terror of the Attack of the Evil Mutant Hellbound Flesh Eating Crawling Alien Zombified Subhumanoid Living Dead, Part 5",2011,208
316491,"Night of the Day of the Dawn of the Son of the Bride of the Return of the Revenge of the Terror of the Attack of the Evil, Mutant, Hellbound, Flesh-Eating Subhumanoid Zombified Living Dead, Part 3",2005,196
270643,"Portrait de groupe n° 142: Les cinématonés célèbrent devant le palais de Chaillot la rétrospective de la Cinémathèque française Cinématons, Autres Films, Carte blanche",1991,167
220775,Las poquianchis (De los pormenores y otros sucedidos del dominio público que acontecieron a las hermanas de triste memoria a quienes la maledicencia así las bautizó),1976,165
385201,Der Kaiser in Halle am 6. September 1903 - Ankunft Sr. Majestät des Kaisers und König Georg von Sachsen nebst Gefolge in Leipzig am 5. September 1903,1903,149
61665,Entrei em Pânico ao Saber o que Vocês Fizeram na Sexta-feira 13 do Verão Passado Parte 2 - A Hora da Volta da Vingança dos Jogos Mortais de Halloween,2011,149


In [47]:
# 5. What are the 20 most popular movie titles, if you strip off the suffixes like
#    (II) and (III) that IMDB adds to distinguish movies shown in the same year?

titles.title.str.extract('^([^(]*)').value_counts().head(20)

                    100
Home                 97
Alone                91
Broken               84
The Interview        75
The Gift             65
The Box              63
Blind Date           60
Gone                 53
The End              52
Last Call            48
Reunion              47
The Bridge           46
The Audition         45
Popular Science      44
Smile                44
Stuck                44
Blackout             43
Run                  43
Trapped              41
dtype: int64

### Years

In [56]:
# 1. How many movies came out in 1950?

(titles.year == 1950).sum()

1542

In [60]:
# 1. Use a for loop to determine how many movies came out in each year of the 1970s.

for y in range(1970, 1980):
    print(y, (titles.year == y).sum())

1970 2311
1971 2297
1972 2289
1973 2294
1974 2281
1975 2264
1976 2291
1977 2198
1978 2282
1979 2277
