# Train Model to Find Movies I Want to Watch

https://github.com/modAL-python/modAL

In [1]:
import os
import re
import urllib
import tarfile
import sqlite3
import spacy
import logging
import sys
import json
import pickle

import pandas as pd
import numpy as np

import modAL
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, entropy_sampling
from modAL.density import information_density

from sklearn.ensemble import RandomForestClassifier

from IPython.display import display, clear_output, HTML, JSON

## Config

In [2]:
# Path of the SQLite database file that the notebook opens with sqlite3.connect.
reviews_db = "var/reviews.db"
# Name of the table in that database where the wanted / not-wanted labels are stored.
label_table = "watch_labels"

## Load Data

In [3]:
db = sqlite3.connect(reviews_db)

In [4]:
pd.read_sql("select * from sqlite_master", db)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,titles,titles,43150,"CREATE TABLE ""titles"" (\n""titleId"" TEXT,\n ""t..."
1,index,i_titles_pk,titles,232137,CREATE UNIQUE INDEX i_titles_pk on titles (tit...
2,table,reviews,reviews,2,"CREATE TABLE ""reviews"" (\n""id"" INTEGER,\n ""ti..."
3,index,i_reviews_pk,reviews,39667,CREATE UNIQUE INDEX i_reviews_pk on reviews (id)
4,index,i_reviews_title_id,reviews,40748,CREATE INDEX i_reviews_title_id on reviews (ti...
5,table,review_embeddings,review_embeddings,41181,"CREATE TABLE ""review_embeddings"" (\n""id"" INTEG..."
6,index,i_review_embeddings_pk,review_embeddings,303627,CREATE UNIQUE INDEX i_review_embeddings_pk on ...
7,table,watch_labels,watch_labels,303908,"CREATE TABLE ""watch_labels"" (\n""id"" INTEGER,\n..."
8,index,ix_watch_labels_id,watch_labels,303909,"CREATE INDEX ""ix_watch_labels_id""ON ""watch_lab..."


In [5]:
# Load every review row together with its stored embedding text.
# The LEFT JOIN keeps reviews that have no matching row in review_embeddings;
# for those rows review_embedding comes back as NULL.
reviews = pd.read_sql("""
    select
        r.*,
        e.review_embedding
    from reviews r
    left join review_embeddings e on (e.id = r.id)
""", db)

In [6]:
def _parse_embedding(raw):
    """Turn one stored embedding into a numpy array.

    `raw` is the JSON text read from the database. Values that are already
    numpy arrays are returned unchanged so that re-running this cell does not
    fail (json.loads rejects ndarray input with a TypeError).
    """
    if isinstance(raw, np.ndarray):
        return raw
    return np.array(json.loads(raw))


# Decode the JSON-encoded embedding column into numpy arrays, one per review.
reviews['review_embedding'] = reviews.review_embedding.apply(_parse_embedding)

In [7]:
reviews.sample(5)

Unnamed: 0,id,titleId,dataset,class,datasetclassId,rating,review,review_embedding
33515,33516,tt0216196,train,neg,8508,1,It was agonizingly bad movie. It will eat your...,"[-0.7727392315864563, 0.3622419536113739, -0.1..."
68044,68045,tt0238883,train,unsup,17923,0,This film is not a spook for Americans. Don't ...,"[-0.38934630155563354, 0.4539123475551605, -0...."
81901,81902,tt0082348,train,unsup,31970,0,"Along with Ken Russell, John Boorman can be se...","[-0.1836564689874649, 0.21428164839744568, 0.0..."
79097,79098,tt0127751,train,unsup,29142,0,"How do these movies get made? Asinine script, ...","[-0.529217004776001, 0.3868107497692108, -0.18..."
76573,76574,tt0015400,train,unsup,26546,0,"""The Thief of Bagdad"" (1924) is an amazing adv...","[-0.280286580324173, 0.3886405825614929, -0.14..."


In [8]:
# Load title metadata, restricted to titles whose titleId appears in the reviews table.
titles = pd.read_sql("""
    select *
    from titles t
    where
        t.titleId in (
            select titleId
            from reviews
        )
""", db)

In [9]:
titles.sample(5)

Unnamed: 0,titleId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8618,tt0248808,short,For the Birds,For the Birds,0,2000.0,,3.0,"Animation,Comedy,Family"
288,tt0022614,movie,The Age of Consent,The Age of Consent,0,1932.0,,63.0,"Drama,Romance"
5893,tt0111931,tvMiniSeries,Crusades,Crusades,0,1995.0,1995.0,200.0,"Documentary,History,War"
7170,tt0144555,movie,Premutos: The Fallen Angel,Premutos - Der gefallene Engel,0,1997.0,,106.0,"Comedy,Horror"
9853,tt0331509,movie,Funky Monkey,Funky Monkey,0,2004.0,,94.0,"Comedy,Family"


In [10]:
# Attach the title type, name, genres and start year to each review via titleId.
# A left merge keeps every review row even when no title row matches.
title_columns = ['titleId', 'titleType', 'primaryTitle', 'genres', 'startYear']
reviews = reviews.merge(titles[title_columns], how='left', on='titleId')

In [11]:
reviews.titleType.value_counts().to_frame()

Unnamed: 0,titleType
movie,74832
tvMovie,5792
video,4863
tvSeries,4134
short,1682
tvMiniSeries,1469
tvEpisode,1236
videoGame,385
tvSpecial,320
tvShort,60


In [12]:
# Restrict the reviews to titles whose titleType is 'movie' or 'tvMovie'.
movie_title_types = {'movie', 'tvMovie'}
is_movie = reviews.titleType.isin(movie_title_types)
reviews = reviews[is_movie]

In [13]:
len(reviews)

80624

In [14]:
reviews = reviews.set_index('id')

## Active Learning Code Modules

### Label Management

Responsible for updating and retrieving labels.

In [20]:
class LabelManagement:
    """Keeps the wanted / not-wanted labels in sync between memory and SQLite.

    The labels live in `self.labels`, a DataFrame indexed by review id with a
    `wanted` column. From them the reviews are split into an unlabelled pool
    and a labelled training set, each with its embeddings stacked into a 2-D
    array.

    Parameters
    ----------
    db : open sqlite3 connection used for all reads and writes.
    label_table : name of the table that stores the labels.
    reviews : DataFrame indexed by review id with a `review_embedding` column.
    """

    def __init__(self, db, label_table, reviews):
        self.db = db
        self.label_table = label_table
        self.reviews = reviews
        self.initialize()

    def initialize(self):
        """Create the label table if needed, load labels, build pool/train."""
        self.initialize_label_table()
        self.load_labels()
        self.update_pool_and_training()

    def clear_labels(self):
        """Delete every stored label and reset the in-memory state to match."""
        self.db.execute(f"delete from {self.label_table}")
        self.db.commit()
        # Reload from the now-empty table; otherwise the pool/train split would
        # be rebuilt from the stale in-memory labels and nothing would change.
        self.load_labels()
        self.update_pool_and_training()

    def initialize_label_table(self):
        """Create the label table when sqlite_master has no table of that name."""
        lbl_table_count = pd.read_sql(
            "select count(*) as Count from sqlite_master where type = 'table' and name = ?",
            self.db,
            params=[self.label_table]
        ).iloc[0].Count

        if lbl_table_count < 1:
            self.db.execute(f"""
                create table {self.label_table} (
                    id varchar(100) not null primary key,
                    wanted int not null
                )
            """)
            self.db.commit()

    def save_labels(self):
        """Replace the label table with the current in-memory labels."""
        # Write through this instance's own connection and table name rather
        # than whatever `db` / `label_table` globals the notebook happens to have.
        self.labels.to_sql(self.label_table, self.db, if_exists = 'replace')
        # if_exists='replace' recreates the table, so the unique index on id is
        # created again after each save.
        self.db.execute(f"""
            create unique index i_{self.label_table}_pk on {self.label_table} (id)
        """)
        self.db.commit()

    def load_labels(self):
        """Read all labels from the database, index them by id, and return them."""
        self.labels = pd.read_sql(f"select * from {self.label_table}", self.db)
        self.labels = self.labels.set_index('id')
        return self.labels

    def update_pool_and_training(self):
        """Rebuild the pool / train split and their embedding matrices.

        Reviews whose index appears in `self.labels` form the training set and
        all others form the pool. When nothing is labelled yet, `x_train`,
        `x_train_ids` and `y_train` are set to None.
        """
        labelled_ids = set(self.labels.index.values)
        is_labelled = self.reviews.index.isin(labelled_ids)
        self.pool = self.reviews[~is_labelled]
        self.train = self.reviews[is_labelled]

        self.x_pool = np.vstack(self.pool.review_embedding)
        self.x_pool_ids = self.pool.index.values

        if len(self.train) > 0:
            self.x_train = np.vstack(self.train.review_embedding)
            self.x_train_ids = self.train.index.values
            # Look labels up by id so y_train is in the same row order as x_train.
            self.y_train = self.labels.loc[self.x_train_ids].wanted.values
        else:
            self.x_train = None
            self.x_train_ids = None
            self.y_train = None

    def add_labels(self, idxs, results):
        """Record labels for pool rows and refresh the pool / train split.

        `idxs` are positions into the current pool (`x_pool_ids`), not review
        ids; `results` holds the matching wanted values. Nothing is written to
        the database until `save_labels` is called.
        """
        new_labels = pd.DataFrame({'id': self.x_pool_ids[idxs], 'wanted': results}).set_index('id')
        self.labels = pd.concat([self.labels, new_labels])
        self.update_pool_and_training()
        
lbl_manager = LabelManagement(db, label_table, reviews)

In [21]:
lbl_manager.labels

Unnamed: 0_level_0,wanted
id,Unnamed: 1_level_1


In [22]:
# Run the line below to clear out previous labels

#lbl_manager.clear_labels()

### LabelUI

Responsible for handling basic input interactions with the user to obtain labels.

In [23]:
class LabelUI:
    """Console prompts that collect wanted / not-wanted answers for pool reviews."""

    def __init__(self, lbl_manager):
        self.lbl_manager = lbl_manager

    def query_wanted(self, pool_idx):
        """Show one pool review and ask until the user answers y or n.

        `pool_idx` is a position into the label manager's `x_pool_ids`.
        Returns 1 for 'y' and 0 for 'n'; any other answer repeats the prompt.
        """
        answer_codes = {'y': 1, 'n': 0}
        answer = None
        while answer not in answer_codes:
            review_id = self.lbl_manager.x_pool_ids[pool_idx]
            review_row = self.lbl_manager.reviews.loc[review_id]
            print(review_row.review)
            print("\n")
            print("Do you want to watch this? [y/n]")
            answer = input().lower().strip()
        return answer_codes[answer]

    def query_batch(self, idxs):
        """Ask about each pool position in `idxs` and record the answers.

        Returns a tuple of (embeddings of the queried rows, idxs, answers).
        """
        results = []
        for idx in idxs:
            results.append(self.query_wanted(idx))
        # Take the embeddings before add_labels rebuilds the pool.
        x = self.lbl_manager.x_pool[idxs]
        self.lbl_manager.add_labels(idxs, results)
        return x, idxs, results

    def query_random(self, n):
        """Ask about `n` pool positions sampled at random."""
        pool_size = len(self.lbl_manager.x_pool_ids)
        idxs = pd.Series(range(pool_size)).sample(n).values
        return self.query_batch(idxs)

lbl_ui = LabelUI(lbl_manager)

### ActiveLearnerDriver

Responsible for putting the label management and UI together to drive the overall experience of labelling.

In [27]:
class ActiveLearnerDriver:
    """Runs labelling rounds: pick reviews, ask the user, teach the learner, save.

    Parameters
    ----------
    learner : object exposing `query(X)` and `teach(X, y, ...)`.
    lbl_ui : labelling UI exposing `query_random`, `query_batch` and `lbl_manager`.
    """

    def __init__(self, learner, lbl_ui):
        self.lbl_ui = lbl_ui
        self.learner = learner

    def query_random(self, n):
        """Label `n` randomly chosen pool reviews, then teach and save.

        The learner is taught with the label manager's whole training set
        (`x_train` / `y_train`), not only the newly labelled rows.
        """
        # Use the collaborators handed to this instance, not notebook globals.
        self.lbl_ui.query_random(n)
        lbl_manager = self.lbl_ui.lbl_manager
        self.learner.teach(lbl_manager.x_train, lbl_manager.y_train.astype(bool))
        lbl_manager.save_labels()

    def query_n(self, n):
        """Run `n` rounds of learner-chosen queries, then save the labels once."""
        lbl_manager = self.lbl_ui.lbl_manager
        for _ in range(n):
            # x_pool is read again each round because labelling shrinks the pool.
            query_idx, _query_inst = self.learner.query(lbl_manager.x_pool)
            print(f"idx: {query_idx}")
            x, _idxs, results = self.lbl_ui.query_batch(query_idx)
            self.learner.teach(x, np.array(results).astype(bool), only_new=False)
        lbl_manager.save_labels()

## Active Learner Setup

Create and setup the active learner.

In [28]:
def sampling_factory(al, x):
    """Query strategy handed to the ActiveLearner.

    Delegates to modAL's entropy_sampling on learner `al` and candidates `x`,
    asking for a single instance with random tie breaking enabled, and returns
    whatever entropy_sampling returns.
    """
    strategy_options = {'n_instances': 1, 'random_tie_break': True}
    return entropy_sampling(al, x, **strategy_options)
    
# Estimator that the active learner fits, created with scikit-learn's defaults.
# NOTE(review): no random_state is passed, so results presumably differ between
# runs — confirm whether a fixed seed is wanted.
model = RandomForestClassifier()

# Wrap the estimator in a modAL ActiveLearner that picks queries via sampling_factory.
learner = ActiveLearner(
    estimator = model,
    query_strategy = sampling_factory
)

# Driver that connects the learner to the console labelling UI.
al_driver = ActiveLearnerDriver(learner, lbl_ui)

## Get Labels
The cell below will figure out which review to label next, prompt you for a label, and save the results.

In [259]:
al_driver.query_n(1)

idx: [10746]
Why couldn't the end of the movie have been Sean Connery's men fighting the French instead of the Germans. Ever since the French had occupied Algeria in 1830, the tribes from Morocco and those of Algeria were making raids on the French military and civilian settlements. This movie could have been a continuous of that historical aspect where the French had seize the Rasuadli so his followers would not be raiding Algeria, and then his followers would have attacked the French to free him.<br /><br />The movie is still stereotypical of shootouts between the Germans and the Americans. When the Americans shoot the Germans, their guns (even the pistols) make loud noises, create large bloody bullet wounds, and their enemies are screaming after being shot. When Germans shoot at the Americans, their guns don't make large sounds, do not create bloody wounds, and their enemies make little or no sound after being shot.<br /><br />In real life, the American Krag rifle was the worst rifl

 n


In [282]:
# As you label you can see the feature importances change
model.feature_importances_

array([0.01355946, 0.02373175, 0.02910919, 0.03849558, 0.02699647,
       0.01590186, 0.00718273, 0.01536253, 0.01551587, 0.01131622,
       0.02285567, 0.02246399, 0.02100886, 0.01961654, 0.03957821,
       0.0201633 , 0.01419334, 0.00970113, 0.02099284, 0.00951679,
       0.01578759, 0.01269281, 0.01086385, 0.01494053, 0.01931959,
       0.01496078, 0.01366419, 0.02176921, 0.01315825, 0.01411886,
       0.02397039, 0.02360592, 0.03394227, 0.01255319, 0.0110483 ,
       0.01911003, 0.01683718, 0.02444243, 0.01915044, 0.01719118,
       0.01454655, 0.03425428, 0.02947425, 0.01121916, 0.01335979,
       0.00820798, 0.01313291, 0.07836225, 0.01848191, 0.02857154])

In [283]:
# See how many labels we have in the db
pd.read_sql(f"select count(*) from {label_table}", db)

Unnamed: 0,count(*)
0,100


### Get Predictions

Using the model, get predictions about a review being compelling.

In [288]:
# Score every review (labelled or not) with the fitted model. After the
# transpose, row 0 is stored as bad_p and row 1 as good_p; this assumes
# model.classes_ is ordered [False, True] — TODO confirm.
all_embeddings = np.vstack(lbl_manager.reviews.review_embedding)
review_p = model.predict_proba(all_embeddings).T

# Add the probabilities, left-join the explicit labels on the review id index
# (wanted stays null for unlabelled reviews), and sort best-first by good_p.
reviews_wp = (
    lbl_manager.reviews
    .copy()
    .assign(bad_p=review_p[0], good_p=review_p[1])
    .merge(lbl_manager.labels, how='left', left_index=True, right_index=True)
    .sort_values(by='good_p', ascending=False)
)

### Show top 20 movies to watch which weren't explicitly labelled.

In [291]:
reviews_wp[reviews_wp.wanted.isnull()][['primaryTitle', 'rating', 'genres', 'startYear', 'good_p', 'review']].head(20)

Unnamed: 0_level_0,primaryTitle,rating,genres,startYear,good_p,review
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
48802,Citizen X,10,"Biography,Crime,Drama",1995.0,0.81,CitizenX(1995) is the developing world's answe...
98931,The Gypsy Warriors,0,"Comedy,Drama,War",1978.0,0.81,"In this uninspired, made-for-television, buddy..."
20533,Cannon for Cordoba,8,"Action,Romance,Western",1970.0,0.8,I have reasons to love the great users of a ca...
21937,Crossfire,9,"Crime,Drama,Film-Noir",1947.0,0.78,"Taut and organically gripping, Edward Dmytryk'..."
44524,Winchester '73,9,"Action,Drama,Western",1950.0,0.78,The larger-than-life figures of Wyatt Earp and...
76484,Apocalypto,0,"Action,Adventure,Drama",2006.0,0.76,"The Mel Gibson movie ""Apocalypto"" ranks as one..."
12835,Apocalypse Now,10,"Drama,Mystery,War",1979.0,0.76,Movies seem to fall into two categories: films...
61517,The Red Tent,0,"Adventure,Drama",1969.0,0.76,atching THE RED TENT gave me that rare fulfill...
53442,1968 Tunnel Rats,0,"Action,Drama,War",2008.0,0.76,What am I supposed to say about a war film mad...
53763,The Vanishing,0,"Mystery,Thriller",1988.0,0.76,George Sluizer's magnificent European thriller...


In [294]:
# Show top 20 whether explicitly labelled or not
reviews_wp[['primaryTitle', 'rating', 'genres', 'startYear', 'good_p', 'review', 'wanted']].head(20)

Unnamed: 0_level_0,primaryTitle,rating,genres,startYear,good_p,review,wanted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
50171,Ring 0: Birthday,0,"Horror,Mystery,Thriller",2000.0,0.9,"In Jôji Iida's ""Rasen"", Sadako explained that ...",1
15093,2001: A Space Odyssey,10,"Adventure,Sci-Fi",1968.0,0.9,2001: A Space Odyssey <br /><br />Is it a serm...,1
68384,Somewhere in the Night,0,"Crime,Drama,Film-Noir",1946.0,0.89,Borrowed as the title of Nicholas Christopher'...,1
80261,Puzzle,0,"Mystery,Romance,Thriller",1974.0,0.87,"Duccio Tessari's ""L'Uomo Senza Memoria"" (aka. ...",1
63658,Compañeros,0,"Action,Comedy,Western",1970.0,0.87,"""Vamos A Matar, Compañeros"" is another terrifi...",1
76940,Kiss of the Dragon,0,"Action,Crime,Thriller",2001.0,0.86,KISS OF THE DRAGON / (2001) *** (out of four) ...,1
61200,Widow of St. Pierre,0,"Drama,History,Romance",2000.0,0.86,Patrice LeConte colors this intelligent film i...,1
73552,Breaking and Entering,0,"Crime,Drama,Romance",2006.0,0.85,"Jude Law plays Will, an architect who lives wi...",1
69268,Tears of the Black Tiger,0,"Action,Comedy,Romance",2000.0,0.84,Tears of the Black Tiger is one of those films...,1
72316,Outlander,0,"Action,Adventure,Sci-Fi",2008.0,0.84,"""Outlander"" is a great surprise by director Ho...",1


In [297]:
# Show worst 10
reviews_wp[['primaryTitle', 'rating', 'genres', 'startYear', 'good_p', 'review', 'wanted']].tail(10)

Unnamed: 0_level_0,primaryTitle,rating,genres,startYear,good_p,review,wanted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
77348,A Return to Salem's Lot,0,"Comedy,Horror,Thriller",1987.0,0.03,It's hard to put into words just how bad the a...,
19475,The Rain People,9,Drama,1969.0,0.02,Newly-pregnant Knight bolts from husband for n...,
77830,Twin Town,0,"Comedy,Drama",1997.0,0.02,Dougray Scott has appeared in over a dozen mov...,
31711,Gentlemen Prefer Blondes,4,"Comedy,Musical,Romance",1953.0,0.02,"Evidently lots of people really like this, but...",
97751,The Anderson Tapes,0,"Action,Crime,Thriller",1971.0,0.02,Veteran director Sidney Lumet has made many ab...,
10937,The Quick and the Undead,1,"Action,Horror,Western",2006.0,0.02,"Firstly, I am a huge fan of crap films. B grad...",
27158,The Groove Tube,2,Comedy,1974.0,0.02,"After two long, long opening skits, one of whi...",
84530,200 Cigarettes,0,"Comedy,Drama,Romance",1999.0,0.02,"Many critics thought the film ""200 Cigarettes""...",
89173,Sextette,0,"Comedy,Musical,Romance",1977.0,0.02,"Okay, so I'm a big movie buff & I just HAD to ...",
96420,Kiss Daddy Goodnight,0,Thriller,1987.0,0.02,This truly obscure neo-noir from 1988 is memor...,


In [308]:
# Wanted to see how genres relate to chance of wanting to watch.

# Mean predicted good_p and number of reviews for each distinct genres string,
# ordered from highest to lowest mean.
per_genre = reviews_wp.groupby('genres').agg({'good_p': ['mean', len]})
genres_good_p = per_genre.reset_index().sort_values(
    by=[('good_p', 'mean')],
    ascending=False,
)

# Only show genre combinations that have at least 20 reviews.
has_enough_reviews = genres_good_p[('good_p', 'len')] >= 20
genres_good_p[has_enough_reviews].head(20)

Unnamed: 0_level_0,genres,good_p,good_p
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,len
475,"Drama,Mystery,War",0.398,30
152,"Adventure,Crime,Horror",0.38641,39
145,"Adventure,Comedy,War",0.378,30
567,"Thriller,War",0.371154,52
373,"Crime,Film-Noir,Romance",0.354333,30
518,"Film-Noir,Horror,Thriller",0.352667,30
408,"Documentary,History,War",0.350476,21
520,"Film-Noir,Mystery,Thriller",0.345152,66
484,"Drama,Sci-Fi,War",0.341667,30
431,"Drama,Fantasy,History",0.339667,30


# Save Results

In [311]:
# Save results to csv.

# Export the selected columns of every row in reviews_wp, in its current order.
export_columns = ['titleId', 'primaryTitle', 'startYear', 'genres', 'review', 'good_p', 'rating']
movies_to_watch = reviews_wp[export_columns]
movies_to_watch.to_csv("var/movies_wanted.csv")

# Save Model

In [321]:
# Pickle the estimator so it can be loaded again later.
model_path = "var/models/wanted_review.pkl"
# open() does not create missing parent directories, so make sure var/models exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as fh:
    pickle.dump(model, fh)