In [1]:
import ast
import operator
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
import nltk.data
from nltk.corpus import stopwords
%matplotlib inline
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Windows\ServiceProfile
[nltk_data]     s\LocalService\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Prints the most influential positive and negative terms from the movie overviews for a given genre. Later we could extend this to analyse movie scripts.

In [2]:
# Column names applied, in order, to the header-less TMDB movies CSV.
TMDB_MOVIES_COLUMN_NAMES = [
    'adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id',
    'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies',
    'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
    'video', 'vote_average', 'vote_count',
]

# Default CSV location, relative to the notebook's working directory.
TMDB_MOVIES_CSV_PATH = 'Milestone_1/tmdb_movies_11291.csv'

def load_tmdb_movies(path=TMDB_MOVIES_CSV_PATH):
    """Load the TMDB movies CSV into a DataFrame.

    The file is read without a header row and its columns are named after
    TMDB_MOVIES_COLUMN_NAMES. The 'genres' and 'spoken_languages' columns are
    stored as Python-literal strings and are parsed with ast.literal_eval.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to TMDB_MOVIES_CSV_PATH.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line, with the two literal columns parsed.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when a cell is not a valid literal.
    """
    df = pd.read_csv(path, header=None, names=TMDB_MOVIES_COLUMN_NAMES)
    for column_name in ['genres', 'spoken_languages']:
        # literal_eval only accepts literals, so no arbitrary code is executed.
        df[column_name] = df[column_name].map(ast.literal_eval)
    return df

In [3]:
def getY(df, genre_name):
    """Build the binary label vector for one genre.

    Each entry of df['genres'] is iterated as a sequence of dicts with a
    'name' key. The returned NumPy float array has one value per entry:
    1.0 when any of its dicts has 'name' equal to genre_name, 0.0 otherwise.
    """
    labels = [
        float(any(entry['name'] == genre_name for entry in movie_genres))
        for movie_genres in df['genres']
    ]
    return np.array(labels)

def cross_validate(x, y, folds, reg_params):
    """Score an L1-penalised logistic regression for every candidate C.

    Parameters
    ----------
    x : array
        Predictor matrix; rows are selected with the fold index arrays.
    y : array
        Labels aligned with the rows of x.
    folds : int
        Number of folds handed to KFold.
    reg_params : numpy.ndarray
        Candidate values for the regression's C parameter.

    Returns
    -------
    numpy.ndarray
        One value per element of reg_params: the fold-averaged result of
        the fitted model's score() on the held-out rows.
    """
    splits = KFold(x.shape[0], n_folds=folds)
    mean_scores = np.zeros(reg_params.size)
    for position, strength in enumerate(reg_params):
        model = LogReg(penalty='l1', C = strength)
        total = 0.0
        # The same fold partition is reused for every candidate C.
        for fit_rows, held_out_rows in splits:
            model.fit(x[fit_rows], y[fit_rows])
            total += model.score(x[held_out_rows], y[held_out_rows])
        mean_scores[position] = total/float(folds)
    return mean_scores

In [4]:
def getInfluentialTerms(x, genre_name, movies_df=None):
    """Find the terms with the most extreme logistic-regression weights for a genre.

    An L1-penalised logistic regression is fitted on x against the genre's
    binary labels. C is picked by 10-fold cross-validation over the powers of
    ten from 1e-7 to 1e7. Terms are then selected by coefficient percentile.

    Parameters
    ----------
    x : array
        Predictor matrix, one row per movie and one column per term.
    genre_name : str
        Genre whose membership is predicted.
    movies_df : DataFrame, optional
        Frame whose 'genres' column provides the labels. Its rows must line
        up with the rows of x. Defaults to the notebook-level
        tmdb_movies_with_overview frame.

    Returns
    -------
    (top_1, bottom_1)
        Two single-element lists, each wrapping a boolean mask over the
        columns of x. top_1 marks coefficients at or above the 99th
        percentile that are strictly positive. bottom_1 marks coefficients
        at or below the 1st percentile that are strictly negative.
    """
    if movies_df is None:
        # Backward-compatible default: the frame the notebook built x from.
        movies_df = tmdb_movies_with_overview
    # get labels
    y = getY(movies_df, genre_name)
    # cross-validate for best regularization parameter
    all_c = np.power(10., range(-7, 8))
    cv_scores = cross_validate(x, y, 10, all_c)
    best_c = all_c[np.argmax(cv_scores)]
    # fit logistic regression
    logReg = LogReg(penalty='l1', C = best_c)
    logReg.fit(x, y)
    coef = logReg.coef_[0]
    # top and bottom percentiles
    # An L1 fit can zero out most coefficients. If fewer than 1% are positive
    # (or negative) the percentile threshold is 0, and a bare >= / <= would
    # mark every zero-weight term as influential. Hence the strict sign check.
    top_1 = [(coef >= np.percentile(coef, 99)) & (coef > 0)]
    bottom_1 = [(coef <= np.percentile(coef, 1)) & (coef < 0)]
    return top_1, bottom_1

In [5]:
# Load the full TMDB table, then keep only the rows whose overview is present;
# rows with a missing overview have no text to vectorize.
tmdb_movies_df = load_tmdb_movies()
has_overview = tmdb_movies_df['overview'].notnull()
tmdb_movies_with_overview = tmdb_movies_df.loc[has_overview]

In [6]:
vectorizer = CountVectorizer( 
    stop_words = stopwords.words("english"), 
    token_pattern = '[a-zA-Z]+[0-9]*',
    max_df = 0.9, 
    min_df = 5, 
    dtype=np.float32 )
x = vectorizer.fit_transform(tmdb_movies_with_overview['overview'].values).toarray()
print 'predictor matrix shape:', x.shape

predictor matrix shape: (11145L, 8773L)


In [7]:
# Vocabulary terms as a NumPy array, so that boolean masks over the columns
# of x can be used to pick out the matching terms.
feature_names = np.asarray(vectorizer.get_feature_names())
feature_names

array([u'aaron', u'abandon', u'abandoned', ..., u'zone', u'zoo', u'zooey'], 
      dtype='<U17')

In [8]:
def printInfluentialTerms(genre_name):
    top_1, bottom_1 = getInfluentialTerms(x, genre_name)
    print genre_name, 'most influential positive terms:', feature_names[top_1]
    print genre_name, 'most influential negative terms:', feature_names[bottom_1]

In [9]:
# Terms with the most extreme positive / negative weights for the 'Drama' genre.
printInfluentialTerms('Drama')

Drama most influential positive terms: [u'afterlife' u'aged' u'alcoholic' u'allied' u'anger' u'assistance'
 u'astronaut' u'ballet' u'banned' u'betrothed' u'blizzard' u'boxer'
 u'brien' u'build' u'capsule' u'chronicle' u'colonies' u'connections'
 u'constant' u'crushed' u'cycle' u'depression' u'detention' u'difficult'
 u'disturbed' u'drama' u'dramatic' u'elaborate' u'emotionally' u'enigmatic'
 u'erin' u'finest' u'forty' u'grief' u'guns' u'halt' u'hardened' u'heat'
 u'holly' u'holocaust' u'idealistic' u'incriminating' u'industrial'
 u'interpretation' u'islamic' u'janitor' u'josh' u'laden' u'laundry'
 u'loan' u'loveless' u'luis' u'luna' u'maid' u'miracle' u'morgan' u'mute'
 u'orphans' u'painter' u'patrick' u'performs' u'physician' u'poet'
 u'primary' u'prosecutor' u'pursue' u'raped' u'rehab' u'repercussions'
 u'roads' u'ruth' u'sail' u'shoes' u'skill' u'smitten' u'spying'
 u'stockholm' u'tempted' u'tennessee' u'therapist' u'tragedy' u'trevor'
 u'trucker' u'unorthodox' u'vast' u'vision' u'w

In [10]:
# Terms with the most extreme positive / negative weights for the 'Comedy' genre.
printInfluentialTerms('Comedy')

Comedy most influential positive terms: [u'abandonment' u'allan' u'annoying' u'applies' u'archie' u'asterix'
 u'attitudes' u'awkward' u'axe' u'brainy' u'bumbling' u'bunch' u'celebrity'
 u'cheap' u'cheerleader' u'chocolate' u'citizen' u'clinic' u'clouseau'
 u'clueless' u'comedic' u'comedy' u'comfortable' u'comic' u'confusion'
 u'cruchot' u'curmudgeonly' u'dating' u'examine' u'fake' u'fist'
 u'fraternity' u'furious' u'gary' u'grandpa' u'hapless' u'hilariously'
 u'horny' u'humor' u'hypochondriac' u'inadvertently' u'incompetent'
 u'intergalactic' u'inventor' u'irreverent' u'jokes' u'laced' u'leopold'
 u'lifeguard' u'lotus' u'mascot' u'mistakenly' u'mundane' u'nephew'
 u'patriarch' u'photos' u'positions' u'pretends' u'quirky' u'regina'
 u'relocated' u'reported' u'resolve' u'resourceful' u'retrieving' u'roy'
 u'sentenced' u'shallow' u'shrek' u'sixth' u'spend' u'spoof' u'stable'
 u'stripper' u'sure' u'surprisingly' u'swimming' u'tech' u'temporarily'
 u'thus' u'underway' u'unfortunate' u'uptig

In [11]:
# Terms with the most extreme positive / negative weights for the 'Thriller' genre.
printInfluentialTerms('Thriller')

Thriller most influential positive terms: [u'accident' u'agent' u'alive' u'apartment' u'assassin' u'begins' u'bond'
 u'car' u'cia' u'computer' u'conspiracy' u'cop' u'crime' u'criminal'
 u'criminals' u'dangerous' u'dark' u'dead' u'deadly' u'death' u'deep'
 u'detective' u'discover' u'discovers' u'drug' u'escape' u'events' u'ex'
 u'fear' u'former' u'goes' u'group' u'horror' u'hospital' u'hostage'
 u'house' u'identity' u'job' u'john' u'kidnapped' u'kill' u'killer'
 u'killers' u'killing' u'mark' u'may' u'mind' u'mission' u'mob' u'murder'
 u'murdered' u'murders' u'mysterious' u'mystery' u'nuclear' u'officer'
 u'past' u'phone' u'police' u'prey' u'psychiatrist' u'rachel' u'remote'
 u'revenge' u'run' u'sam' u'secret' u'security' u'seemingly' u'seems'
 u'serial' u'sinister' u'soldier' u'something' u'soon' u'supernatural'
 u'survival' u'taken' u'target' u'terror' u'terrorist' u'thriller'
 u'trapped' u'u' u'uncover' u'underworld' u'unknown' u'violent']
Thriller most influential negative terms: [u'

In [12]:
# Terms with the most extreme positive / negative weights for the 'Action' genre.
printInfluentialTerms('Action')

Action most influential positive terms: [u'advice' u'airborne' u'apes' u'archaeologist' u'assassin' u'assassins'
 u'audition' u'avenger' u'batman' u'benevolent' u'bud' u'bumbling' u'cache'
 u'caine' u'captors' u'celebrities' u'civilization' u'clayton' u'clone'
 u'commando' u'corporation' u'criminals' u'detailing' u'directly' u'donor'
 u'emerge' u'enforcement' u'enterprise' u'exceptional' u'exiled' u'fail'
 u'fearsome' u'fighters' u'firefighter' u'flynn' u'hacker' u'hawk'
 u'hitman' u'internal' u'islands' u'items' u'jai' u'kgb' u'knife'
 u'loyalty' u'luc' u'manuscript' u'martial' u'millennium' u'missile'
 u'musketeers' u'nevada' u'newest' u'ninjas' u'operatives' u'parker'
 u'paths' u'patrol' u'province' u'robots' u'ruined' u'ruthless' u'safety'
 u'samurai' u'savage' u'scarce' u'sinbad' u'skilled' u'slaves' u'smuggling'
 u'sniper' u'speed' u'strategy' u'struck' u'superhero' u'superman'
 u'superpowers' u'swiftly' u'target' u'terrorists' u'transport' u'trucker'
 u'uss' u'viciously' u'vigil