In [1]:
import ast
import operator
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import nltk.data
from nltk.corpus import stopwords
# Jupyter magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Make sure the NLTK stopword corpus is available locally; the vectorizer
# below is configured with stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Windows\ServiceProfile
[nltk_data]     s\LocalService\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

One approach to handling multiple genres per movie is to use scikit-learn's `OneVsRestClassifier`, which fits one binary classifier per genre.

In [2]:
# Column names for the TMDB movies CSV. The file is read with header=None,
# so these names are assigned to its columns by position.
TMDB_MOVIES_COLUMN_NAMES = [
    'adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id',
    'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies',
    'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
    'video', 'vote_average', 'vote_count',
]

def load_tmdb_movies(csv_path='Milestone_1/tmdb_movies_11291.csv'):
    """Load the TMDB movies CSV into a DataFrame.

    The file is read as UTF-8 without a header row; columns are named from
    TMDB_MOVIES_COLUMN_NAMES. The 'genres' and 'spoken_languages' columns are
    stored in the CSV as Python-literal strings and are converted to Python
    objects with ast.literal_eval.

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to read. Defaults to the project's TMDB dump,
        relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line, with 'genres' and 'spoken_languages' parsed.

    Raises
    ------
    ValueError, SyntaxError
        From ast.literal_eval if a 'genres' or 'spoken_languages' cell is not
        a valid Python literal (this includes missing values).
    """
    df = pd.read_csv(csv_path, header=None, names=TMDB_MOVIES_COLUMN_NAMES, encoding="utf-8")
    for column_name in ['genres', 'spoken_languages']:
        # literal_eval only evaluates literals, so it is safe on file content
        # in a way that eval() would not be.
        df[column_name] = df[column_name].map(ast.literal_eval)
    return df

Load the movies and prepare the training set by filtering out movies that have no genres or no overview.

In [3]:
# Read the full TMDB table, then keep only the rows usable for training:
# those with a non-null overview and a non-empty genres value.
tmdb_movies_df = load_tmdb_movies()
has_overview = tmdb_movies_df['overview'].notnull()
# bool() of an empty container is False, so rows without genres are masked out.
has_genre = tmdb_movies_df['genres'].apply(bool)
tmdb_movies_with_overview = tmdb_movies_df.loc[has_overview & has_genre]

Convert the overviews to a matrix of token counts

In [4]:
vectorizer = CountVectorizer( 
    stop_words = stopwords.words("english"), 
    token_pattern = '[a-zA-Z]+[0-9]*',
    max_df = 0.9, 
    min_df = 5, 
    dtype=np.float32 )
X = vectorizer.fit_transform(tmdb_movies_with_overview['overview'].values).toarray()
print 'predictor matrix shape:', X.shape

predictor matrix shape: (11022L, 8707L)


In [5]:
# Terms of the fitted vocabulary as a NumPy array; the bare name on the last
# line displays it as the cell output.
feature_names = np.asarray(vectorizer.get_feature_names())
feature_names

array([u'aaron', u'abandon', u'abandoned', ..., u'zone', u'zoo', u'zooey'], 
      dtype='<U17')

Prepare labels for each movie

In [6]:
labels = tmdb_movies_with_overview['genres'].apply(lambda x: [g['name'] for g in x])
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)
print 'label matrix shape:', y.shape

label matrix shape: (11022L, 20L)


In [7]:
# Display the genre classes learned by the binarizer when it was fitted on `labels`.
mlb.classes_

array([u'Action', u'Adventure', u'Animation', u'Comedy', u'Crime',
       u'Documentary', u'Drama', u'Family', u'Fantasy', u'Foreign',
       u'History', u'Horror', u'Music', u'Mystery', u'Romance',
       u'Science Fiction', u'TV Movie', u'Thriller', u'War', u'Western'], dtype=object)

Fit the classifier

In [8]:
# Wrap a LinearSVC in a one-vs-rest scheme and fit it on the token-count
# matrix X against the binarized genre matrix y. random_state is pinned to 0.
oneVsResClassifier = OneVsRestClassifier(
    estimator=LinearSVC(random_state=0),
).fit(X, y)

In [9]:
# Predict on X, the same matrix the classifier was fitted on, so these are
# training-set predictions rather than held-out ones.
predict = oneVsResClassifier.predict(X)

In [10]:
# NOTE(review): index 1 selects the second row of y (indexing is 0-based)
# although the message says "First" — confirm which row is intended.
print 'First observation actual label', y[1]

First observation actual label [1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [11]:
print 'First observation predicted label: ', y[1]

First observation predicted label:  [1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [12]:
print 'Accuracy on the training set: ', oneVsResClassifier.score(X, y)

Accuracy on the training set:  0.992741789149
