In [1]:
#import all the things
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#set view stuff (only needed for draft in Jupyter notebook)
pd.set_option('display.max_columns', 100)

In [2]:
#read in scraped data
movie_data = pd.read_csv("IMDBTop250.csv")
movie_data.head()

Unnamed: 0.1,Unnamed: 0,Actors,Awards,Country,Director,Genre,Language,Metascore,Plot,Poster,Rated,Released,Response,Runtime,Title,Type,Writer,Year,imdbID,imdbRating,imdbVotes
0,0,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Nominated for 7 Oscars. Another 19 wins & 30 n...,USA,Frank Darabont,"Crime, Drama",English,80.0,Two imprisoned men bond over a number of years...,https://images-na.ssl-images-amazon.com/images...,R,14 Oct 1994,True,142 min,The Shawshank Redemption,movie,"Stephen King (short story ""Rita Hayworth and S...",1994,tt0111161,9.3,1786262
1,1,"Marlon Brando, Al Pacino, James Caan, Richard ...",Won 3 Oscars. Another 23 wins & 27 nominations.,USA,Francis Ford Coppola,"Crime, Drama","English, Italian, Latin",100.0,The aging patriarch of an organized crime dyna...,https://images-na.ssl-images-amazon.com/images...,R,24 Mar 1972,True,175 min,The Godfather,movie,"Mario Puzo (screenplay), Francis Ford Coppola ...",1972,tt0068646,9.2,1219320
2,2,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",Won 6 Oscars. Another 10 wins & 20 nominations.,USA,Francis Ford Coppola,"Crime, Drama","English, Italian, Spanish, Latin, Sicilian",80.0,The early life and career of Vito Corleone in ...,https://images-na.ssl-images-amazon.com/images...,R,20 Dec 1974,True,202 min,The Godfather: Part II,movie,"Francis Ford Coppola (screenplay), Mario Puzo ...",1974,tt0071562,9.0,839135
3,3,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",Won 2 Oscars. Another 146 wins & 142 nominations.,"USA, UK",Christopher Nolan,"Action, Crime, Drama","English, Mandarin",82.0,When the menace known as the Joker wreaks havo...,https://images-na.ssl-images-amazon.com/images...,PG-13,18 Jul 2008,True,152 min,The Dark Knight,movie,"Jonathan Nolan (screenplay), Christopher Nolan...",2008,tt0468569,9.0,1754213
4,4,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",Nominated for 3 Oscars. Another 16 wins & 8 no...,USA,Sidney Lumet,"Crime, Drama",English,,A jury holdout attempts to prevent a miscarria...,https://images-na.ssl-images-amazon.com/images...,APPROVED,01 Apr 1957,True,96 min,12 Angry Men,movie,"Reginald Rose (story), Reginald Rose (screenplay)",1957,tt0050083,8.9,481606


In [3]:
movie_data["Rated"].value_counts()

R            101
PG            37
NOT RATED     32
PG-13         30
APPROVED      20
G             14
UNRATED       10
PASSED         3
Name: Rated, dtype: int64

In [4]:
movie_data["Rated"].isnull().sum()

3

In [5]:
def not_rated(column):
    return column.fillna("NOT RATED", inplace=True)

In [6]:
not_rated(movie_data["Rated"])

In [7]:
movie_data["Rated"].value_counts()

R            101
PG            37
NOT RATED     35
PG-13         30
APPROVED      20
G             14
UNRATED       10
PASSED         3
Name: Rated, dtype: int64

#### Data Cleaning

In [8]:
movie_data.isnull().sum()

Unnamed: 0     0
Actors         0
Awards         6
Country        0
Director       0
Genre          0
Language       1
Metascore     89
Plot           0
Poster         0
Rated          0
Released       3
Response       0
Runtime        0
Title          0
Type           0
Writer         0
Year           0
imdbID         0
imdbRating     0
imdbVotes      0
dtype: int64

In [9]:
movie_data.dtypes

Unnamed: 0      int64
Actors         object
Awards         object
Country        object
Director       object
Genre          object
Language       object
Metascore     float64
Plot           object
Poster         object
Rated          object
Released       object
Response         bool
Runtime        object
Title          object
Type           object
Writer         object
Year            int64
imdbID         object
imdbRating    float64
imdbVotes      object
dtype: object

In [10]:
#strip "min" from runtime (maybe retitle as runtime_minutes?)
movie_data["Runtime"] = movie_data["Runtime"].str.replace(" min","")

In [11]:
#turn runtime into number
movie_data["Runtime"] = movie_data["Runtime"].apply(int)
movie_data["Runtime"].head()

0    142
1    175
2    202
3    152
4     96
Name: Runtime, dtype: int64

In [12]:
#get rid of commas--but don't actually need this!
movie_data["imdbVotes"] = movie_data["imdbVotes"].str.replace(",", "")
movie_data["imdbVotes"].head()

0    1786262
1    1219320
2     839135
3    1754213
4     481606
Name: imdbVotes, dtype: object

In [13]:
 #and change to int--don't end up needing this!
movie_data["imdbVotes"] = movie_data["imdbVotes"].apply(int)
movie_data["imdbVotes"].head()

0    1786262
1    1219320
2     839135
3    1754213
4     481606
Name: imdbVotes, dtype: int64

In [14]:
#Note: do we want to turn released into datetime?
#Eh, not sure the month and day of release makes any difference
#because spread out over so many years; year seems adequate

In [15]:
#director column pipeline
#two options:
#Label Binarizer or 
#Count Vectorizer (using parameters that use groups of 2 words and cuts off frequency of appear in more than two columns)
from sklearn.preprocessing import LabelBinarizer

In [16]:
lb = LabelBinarizer()
director_bin = lb.fit_transform(movie_data["Director"])

In [17]:
lb.classes_

array(['Aamir Khan, Amole Gupte', 'Adam Elliot', 'Akira Kurosawa',
       'Alejandro G. Iñárritu', 'Alfred Hitchcock', 'Andrei Tarkovsky',
       'Andrew Stanton', 'Andrew Stanton, Lee Unkrich', 'Anurag Kashyap',
       'Asghar Farhadi', 'Ashutosh Gowariker', 'Billy Wilder',
       'Brian De Palma', 'Bryan Singer',
       'Byron Howard, Rich Moore, Jared Bush', 'Carl Theodor Dreyer',
       'Carol Reed', 'Chan-wook Park', 'Charles Chaplin',
       'Christopher Nolan', 'Clint Eastwood',
       'Clyde Bruckman, Buster Keaton', 'Curtis Hanson', 'Damien Chazelle',
       'Damián Szifrón', 'Danny Boyle', 'Darren Aronofsky',
       'David Fincher', 'David Lean', 'David Lynch', 'David Yates',
       'Dean DeBlois, Chris Sanders', 'Denis Villeneuve', 'Elem Klimov',
       'Elia Kazan', 'Ethan Coen, Joel Coen', 'F.W. Murnau',
       'Federico Fellini', 'Fernando Meirelles, Kátia Lund',
       'Florian Henckel von Donnersmarck', 'Francis Ford Coppola',
       'Frank Capra', 'Frank Darabont', 'Fr

In [18]:
director_bin[0:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 

In [19]:
rated_bin = lb.fit_transform(movie_data["Rated"])

In [20]:
lb.classes_

array(['APPROVED', 'G', 'NOT RATED', 'PASSED', 'PG', 'PG-13', 'R',
       'UNRATED'], 
      dtype='<U9')

In [21]:
rated_bin

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ..., 
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
#actor column try CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
countv = CountVectorizer(ngram_range=(2, 3), min_df=2)

actors_cv = countv.fit_transform(movie_data["Actors"])

In [23]:
actors_cv.todense()[0:5]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [25]:
#looking at this in a dataframe because curious to see
df = pd.DataFrame(actors_cv.todense(), columns=countv.get_feature_names())
df.head()

Unnamed: 0,aamir khan,adrien brody,al pacino,al pacino robert,alec guinness,alexandra maria,alexandra maria lara,anatoliy solonitsyn,anne moss,anthony quinn,arnold schwarzenegger,bale michael,bale michael caine,ben affleck,ben kingsley,benicio del,benicio del toro,bette davis,bibi andersson,billy dee,billy dee williams,brad pitt,bruce willis,carrie anne,carrie anne moss,carrie fisher,carrie fisher billy,chapman john,chapman john cleese,charles chaplin,charles vanel,charlton heston,christian bale,christian bale michael,christoph waltz,christopher plummer,christopher walken,claude rains,claudia cardinale,clint eastwood,clint eastwood lee,daniel day,daniel day lewis,day lewis,de niro,dee williams,del toro,diane keaton,eastwood lee,eastwood lee van,...,martin sheen,matt damon,matthew mcconaughey,max von,max von sydow,michael biehn,michael caine,morgan freeman,murray abraham,natalie portman,newman robert,newman robert redford,nikolay grinko,noel appleby,orson welles,pacino robert,patrick magee,paul newman,paul newman robert,paul reiser,peter lorre,ralph fiennes,richard attenborough,robert de,robert de niro,robert duvall,robert redford,robert shaw,russell crowe,sala baker,sean astin,sharman joshi,sigourney weaver,steve buscemi,takashi shimura,tatsuya nakadai,terry gilliam,thomas mitchell,tim allen,tim roth,tom hanks,tom hanks tim,tom hardy,toni collette,toshirô mifune,van cleef,viola davis,von sydow,willem dafoe,william holden
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
df.sum().sort_values().head(20)

gordon levitt        2
john cazale          2
jim carrey           2
jennifer connelly    2
jay baruchel         2
janet leigh          2
james woods          2
robert redford       2
jake gyllenhaal      2
robert shaw          2
jack lemmon          2
willem dafoe         2
hugo weaving         2
john cleese          2
sala baker           2
henry bergman        2
sharman joshi        2
hanks tim allen      2
hanks tim            2
sigourney weaver     2
dtype: int64

In [27]:
countv.vocabulary_

{'aamir khan': 0,
 'adrien brody': 1,
 'al pacino': 2,
 'al pacino robert': 3,
 'alec guinness': 4,
 'alexandra maria': 5,
 'alexandra maria lara': 6,
 'anatoliy solonitsyn': 7,
 'anne moss': 8,
 'anthony quinn': 9,
 'arnold schwarzenegger': 10,
 'bale michael': 11,
 'bale michael caine': 12,
 'ben affleck': 13,
 'ben kingsley': 14,
 'benicio del': 15,
 'benicio del toro': 16,
 'bette davis': 17,
 'bibi andersson': 18,
 'billy dee': 19,
 'billy dee williams': 20,
 'brad pitt': 21,
 'bruce willis': 22,
 'carrie anne': 23,
 'carrie anne moss': 24,
 'carrie fisher': 25,
 'carrie fisher billy': 26,
 'chapman john': 27,
 'chapman john cleese': 28,
 'charles chaplin': 29,
 'charles vanel': 30,
 'charlton heston': 31,
 'christian bale': 32,
 'christian bale michael': 33,
 'christoph waltz': 34,
 'christopher plummer': 35,
 'christopher walken': 36,
 'claude rains': 37,
 'claudia cardinale': 38,
 'clint eastwood': 39,
 'clint eastwood lee': 40,
 'daniel day': 41,
 'daniel day lewis': 42,
 'day

In [39]:
#genre column try CountVectorizer
countv_2 = CountVectorizer(ngram_range=(1, 1))

genre_cv = countv_2.fit_transform(movie_data["Genre"])

In [40]:
genre_cv[0:5]

<5x23 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [43]:
#looking at this in a dataframe because curious to see
df_genre = pd.DataFrame(genre_cv.todense(), columns=countv_2.get_feature_names())
df_genre

Unnamed: 0,action,adventure,animation,biography,comedy,crime,drama,family,fantasy,fi,film,history,horror,music,musical,mystery,noir,romance,sci,sport,thriller,war,western
0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
#dummies for year
year_dummies = pd.get_dummies(movie_data["Year"])
year_dummies

Unnamed: 0,1921,1925,1926,1927,1928,1931,1934,1936,1939,1940,1941,1942,1944,1946,1948,1949,1950,1951,1952,1953,1954,1955,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
movie_data["Year"].nunique()

82

### operationalize y

For the top 250 movies split them up into two categories by rating: Amazing and Good. Assign 1 to Amazing, 0 to Good. Splits is on the median (1 is <= median). Make this column the y variable called "is_amazing".

In [28]:
movie_data["imdbRating"].value_counts()

8.1    57
8.3    45
8.2    45
8.4    29
8.5    28
8.6    14
8.7     9
8.0     8
8.9     6
8.8     5
9.0     2
9.3     1
9.2     1
Name: imdbRating, dtype: int64

In [29]:
movie_data["imdbRating"].describe()

count    250.000000
mean       8.329600
std        0.233198
min        8.000000
25%        8.100000
50%        8.300000
75%        8.500000
max        9.300000
Name: imdbRating, dtype: float64

In [30]:
np.median(movie_data["imdbRating"])

8.3000000000000007

In [31]:
#this is the target variable
is_amazing = movie_data["imdbRating"].apply(lambda x: 0 if x <= np.median(movie_data["imdbRating"]) else 1)
is_amazing[0:5]

0    1
1    1
2    1
3    1
4    1
Name: imdbRating, dtype: int64