# Train Model to Find Movies I Want to Watch

Active learning is done with [modAL](https://github.com/modAL-python/modAL).

In [1]:
import os
import re
import urllib
import tarfile
import sqlite3
import spacy
import logging
import sys
import json

import pandas as pd
import numpy as np

from modAL.models import ActiveLearner
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display, clear_output, HTML, JSON

## Config

In [2]:
# Path of the SQLite database file that is opened with sqlite3.connect.
reviews_db = "var/reviews.db"
# Name of the table in which the watch / don't-watch labels are stored.
label_table = "watch_labels"

## Load Data

In [3]:
# Open the SQLite database at the path configured in `reviews_db`.
db = sqlite3.connect(reviews_db)

In [4]:
# Show every object (tables and indexes) recorded in the sqlite_master catalogue.
pd.read_sql("select * from sqlite_master", db)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,titles,titles,43150,"CREATE TABLE ""titles"" (\n""titleId"" TEXT,\n ""t..."
1,index,i_titles_pk,titles,232137,CREATE UNIQUE INDEX i_titles_pk on titles (tit...
2,table,reviews,reviews,2,"CREATE TABLE ""reviews"" (\n""id"" INTEGER,\n ""ti..."
3,index,i_reviews_pk,reviews,39667,CREATE UNIQUE INDEX i_reviews_pk on reviews (id)
4,index,i_reviews_title_id,reviews,40748,CREATE INDEX i_reviews_title_id on reviews (ti...
5,table,review_embeddings,review_embeddings,41181,"CREATE TABLE ""review_embeddings"" (\n""id"" INTEG..."
6,index,i_review_embeddings_pk,review_embeddings,303627,CREATE UNIQUE INDEX i_review_embeddings_pk on ...
7,table,watch_labels,watch_labels,303908,CREATE TABLE watch_labels (\n id va...
8,index,sqlite_autoindex_watch_labels_1,watch_labels,303909,


In [5]:
# Load every review together with its stored embedding.
# Inner join on purpose: with a left join a review that has no row in
# review_embeddings comes back with a NULL embedding, and json.loads (used to
# decode the embedding column) raises TypeError on None.
reviews = pd.read_sql("""
    select
        r.*,
        e.review_embedding
    from reviews r
    inner join review_embeddings e on (e.id = r.id)
""", db)

In [6]:
# Decode each JSON-encoded embedding into a NumPy array.
# Values that are already arrays are passed through unchanged, so re-running
# this cell after the column has been converted no longer fails in json.loads.
reviews['review_embedding'] = reviews.review_embedding.apply(
    lambda et: et if isinstance(et, np.ndarray) else np.array(json.loads(et))
)

In [7]:
# Display five randomly sampled rows of `reviews`.
reviews.sample(5)

Unnamed: 0,id,titleId,dataset,class,datasetclassId,rating,review,review_embedding
72751,72752,tt0094580,train,unsup,22688,0,4 comedians play various improv games for the ...,"[-0.21923433244228363, 0.5974705219268799, 0.2..."
46507,46508,tt0132477,train,pos,9040,9,A true wholesome American story about teenager...,"[-0.6931142807006836, 0.1616278439760208, -0.3..."
49791,49792,tt0095904,train,pos,12412,9,What we have here is a compelling piece of low...,"[0.14578938484191895, 1.3434680700302124, -1.0..."
36539,36540,tt0090840,train,neg,11628,4,"Wow You guys are way too nice!!!Corny,Corny,Co...","[-0.38402360677719116, 0.17765586078166962, 0...."
42533,42534,tt0375878,train,pos,5078,9,This one stood out for it's originality. I'm s...,"[-0.44287145137786865, 0.26707732677459717, 0...."


In [8]:
# Load the title records for every titleId that occurs in the reviews table.
titles = pd.read_sql("""
    select *
    from titles t
    where
        t.titleId in (
            select titleId
            from reviews
        )
""", db)

In [9]:
# Display five randomly sampled rows of `titles`.
titles.sample(5)

Unnamed: 0,titleId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
4253,tt0091419,movie,Little Shop of Horrors,Little Shop of Horrors,0,1986.0,,94.0,"Comedy,Horror,Musical"
10621,tt0383788,tvSeries,I Love the '80s Strikes Back,I Love the '80s Strikes Back,0,2003.0,,60.0,"Comedy,Documentary"
10558,tt0380687,movie,Satan's Little Helper,Satan's Little Helper,0,2004.0,,100.0,"Comedy,Horror"
12636,tt0787515,movie,Strength and Honour,Strength and Honour,0,2007.0,,95.0,"Action,Drama"
12797,tt0815241,movie,Religulous,Religulous,0,2008.0,,101.0,"Comedy,Documentary,War"


In [10]:
# Attach title metadata to each review. With a left merge, reviews whose
# titleId has no match in `titles` are kept and get NaN in the added columns.
reviews = reviews.merge(
    titles[['titleId', 'titleType', 'primaryTitle', 'genres', 'startYear']],
    how='left',
    on='titleId',
)

In [11]:
# Count the reviews per titleType and show the counts as a one-column frame.
reviews.titleType.value_counts().to_frame()

Unnamed: 0,titleType
movie,74832
tvMovie,5792
video,4863
tvSeries,4134
short,1682
tvMiniSeries,1469
tvEpisode,1236
videoGame,385
tvSpecial,320
tvShort,60


In [12]:
# Restrict the data to movies: keep only rows whose titleType is
# 'movie' or 'tvMovie'.
reviews = reviews.loc[reviews['titleType'].isin(['movie', 'tvMovie'])]

In [13]:
# Number of reviews left after the titleType filter.
len(reviews)

80624

In [14]:
# Index the reviews by their `id` column; LabelManagement matches labels to
# reviews through this index.
reviews = reviews.set_index('id')

## Do Active Learning

In [48]:
class LabelManagement:
    """Keep the watch / don't-watch labels and the labelled / unlabelled split of the reviews.

    Labels are persisted in the SQLite table ``label_table`` (columns ``id`` and
    ``wanted``) and held in memory in ``self.labels``, a DataFrame indexed by
    ``id``. Reviews whose index value appears in ``self.labels`` form the
    training set (``train``, ``x_train``, ``x_train_ids``, ``y_train``); all
    other reviews form the pool (``pool``, ``x_pool``, ``x_pool_ids``).

    Parameters
    ----------
    db
        Database connection used for every label read and write; it must
        support ``execute`` / ``commit`` and be accepted by pandas' SQL helpers.
    label_table
        Name of the label table. It is interpolated directly into SQL text, so
        it must be a trusted identifier.
    reviews
        DataFrame indexed by review id with a ``review_embedding`` column
        holding one vector per row.
    """

    def __init__(self, db, label_table, reviews):
        self.db = db
        self.label_table = label_table
        self.reviews = reviews
        self.initialize()

    def initialize(self):
        """Create the label table if needed, load the labels and build pool / training data."""
        self.initialize_label_table()
        self.load_labels()
        self.update_pool_and_training()

    def initialize_label_table(self):
        """Create the label table when sqlite_master has no table of that name."""
        lbl_table_count = pd.read_sql(
            "select count(*) as Count from sqlite_master where type = 'table' and name = ?",
            self.db,
            params=[self.label_table]
        ).iloc[0]["Count"]

        if lbl_table_count < 1:
            # NOTE(review): `id` is declared varchar(100) here, while save_labels
            # replaces the whole table through DataFrame.to_sql, which chooses its
            # own column types -- confirm the declared type is intended.
            self.db.execute(f"""
                create table {self.label_table} (
                    id varchar(100) not null primary key,
                    wanted int not null
                )
            """)
            self.db.commit()

    def save_labels(self):
        """Replace the label table with the in-memory labels and index it uniquely on ``id``.

        Uses this instance's own connection and table name rather than
        module-level globals, so the labels are written where they were loaded from.
        """
        self.labels.to_sql(self.label_table, self.db, if_exists='replace')
        self.db.execute(f"""
            create unique index i_{self.label_table}_pk on {self.label_table} (id)
        """)
        self.db.commit()

    def load_labels(self):
        """Read all rows of the label table into ``self.labels`` (indexed by ``id``) and return it."""
        self.labels = pd.read_sql(f"select * from {self.label_table}", self.db)
        self.labels = self.labels.set_index('id')
        return self.labels

    @staticmethod
    def _stack_embeddings(frame, width=0):
        """Stack ``frame.review_embedding`` into a 2-D array; shape ``(0, width)`` when ``frame`` is empty."""
        if len(frame) == 0:
            # np.vstack raises on an empty sequence, so build the empty matrix directly.
            return np.empty((0, width))
        return np.vstack(frame.review_embedding.tolist())

    def update_pool_and_training(self):
        """Recompute the pool / training split and their arrays from ``self.labels``."""
        labelled_ids = set(self.labels.index.values)
        is_labelled = self.reviews.index.isin(labelled_ids)
        self.pool = self.reviews[~is_labelled]
        self.train = self.reviews[is_labelled]

        if len(self.train) > 0:
            self.x_train = self._stack_embeddings(self.train)
            self.x_train_ids = self.train.index.values
            self.y_train = self.labels.loc[self.x_train_ids].wanted.values
            width = self.x_train.shape[1]
        else:
            self.x_train = None
            self.x_train_ids = None
            self.y_train = None
            width = 0

        # An exhausted pool yields an empty matrix instead of a vstack error.
        self.x_pool = self._stack_embeddings(self.pool, width)
        self.x_pool_ids = self.pool.index.values

    def add_labels(self, idxs, results):
        """Record ``results`` for the pool rows at positions ``idxs`` and refresh the split.

        ``idxs`` are positions into ``x_pool`` / ``x_pool_ids`` (not review ids);
        ``results`` holds one ``wanted`` value per position.
        """
        new_labels = pd.DataFrame(
            {'id': self.x_pool_ids[idxs], 'wanted': results}
        ).set_index('id')
        if len(self.labels) == 0:
            # Skip concatenating onto the empty frame loaded from an empty table;
            # its object-dtype columns would otherwise take part in the concat.
            self.labels = new_labels
        else:
            self.labels = pd.concat([self.labels, new_labels])
        self.update_pool_and_training()
        
# Build the label manager; its constructor creates the label table if needed,
# loads the stored labels and splits the reviews into pool and training data.
lbl_manager = LabelManagement(db, label_table, reviews)

In [49]:
class LabelUI:
    """Text prompt that collects watch / don't-watch answers for pool reviews.

    ``lbl_manager`` supplies the pool (``x_pool``, ``x_pool_ids``), the review
    texts (``reviews``) and stores the answers through ``add_labels``.
    """

    def __init__(self, lbl_manager):
        self.lbl_manager = lbl_manager

    def query_wanted(self, pool_idx):
        """Show the review at pool position ``pool_idx`` and return 1 for 'y' or 0 for 'n'.

        Any other answer clears the output and asks again.
        """
        answer_codes = {'y': 1, 'n': 0}
        while True:
            clear_output()
            review_id = self.lbl_manager.x_pool_ids[pool_idx]
            review_row = self.lbl_manager.reviews.loc[review_id]
            print(review_row.review)
            print("\n")
            print("Do you want to watch this? [y/n]")

            reply = input().lower().strip()
            if reply in answer_codes:
                return answer_codes[reply]

    def query_batch(self, idxs):
        """Ask about every pool position in ``idxs``, store the answers, return ``(x, idxs, results)``."""
        results = []
        for pool_idx in idxs:
            results.append(self.query_wanted(pool_idx))
        # Take the embeddings before add_labels runs, because it rebuilds the pool.
        x = self.lbl_manager.x_pool[idxs]
        self.lbl_manager.add_labels(idxs, results)
        return (x, idxs, results)

    def query_random(self, n):
        """Ask about ``n`` pool positions sampled without replacement."""
        pool_size = len(self.lbl_manager.x_pool_ids)
        positions = pd.Series(range(pool_size))
        return self.query_batch(positions.sample(n).values)

# Labelling prompt bound to the label manager created above.
lbl_ui = LabelUI(lbl_manager)

In [50]:
class ActiveLearnerDriver:
    """Couple a learner with the labelling UI: collect labels, then teach the learner.

    Parameters
    ----------
    learner
        Object exposing ``query(X, ...)`` and ``teach(X, y, ...)``; the notebook
        passes a modAL ``ActiveLearner``.
    lbl_ui
        Labelling UI exposing ``query_random(n)``, ``query_batch(idxs)`` and a
        ``lbl_manager`` attribute whose ``x_pool`` holds the unlabelled embeddings.
    """

    def __init__(self, learner, lbl_ui):
        self.lbl_ui = lbl_ui
        self.learner = learner

    def query_random(self, n):
        """Label ``n`` randomly chosen pool reviews and teach them to the learner.

        Only the newly labelled batch is passed to ``teach``: the learner adds
        what it is taught to the training data it already holds, so passing the
        whole training set again would duplicate the earlier samples.
        """
        x, _, results = self.lbl_ui.query_random(n)
        self.learner.teach(x, np.array(results).astype(bool))

    def query_n(self, n):
        """Let the learner pick ``n`` pool reviews, label them and teach the answers."""
        query_idx, _ = self.learner.query(self.lbl_ui.lbl_manager.x_pool, n_instances=n)
        print(f"idx: {query_idx}")
        x, _, results = self.lbl_ui.query_batch(query_idx)
        self.learner.teach(x, np.array(results).astype(bool), only_new=False)

In [17]:
# Ask for labels on three randomly chosen pool reviews.
lbl_ui.query_random(3)
# Ending the cell with `pass` keeps the returned tuple from being shown as cell output.
pass

Haunting, taut and dark is this story of conjoined twins played by real life identical twins Michael and Mark Polish. Michael directs and shares writing credit with Mark. These shy conjoined twins temporarily are living in a run down hotel as they summon the courage to reunite with their estranged mother(Lesley Ann Warren). A homeless hooker named Penny(Michele Hick)finds herself befriending the Falls brothers. They wander outside the apartment to attend a Halloween party with Penny pretending to be wearing a "Siamese Twin" costume. Blake(Mark Polish)tends to his sick brother Francis(Michael Polish)and tries to interpret his mixed feelings toward the thought of losing his sibling. Then to complicate matters more is the relationship developing between himself and Penny. Albeit a strange story, it is well acted. Others in the cast: Patrick Bauchau, Garrett Morris, William Katt and Teresa Hill.


Do you want to watch this? [y/n]


 n


In [51]:
# Estimator for the active learner: a random forest with default settings
# (no random_state is set here).
model = RandomForestClassifier()

# Start the learner from the labels collected so far. At least one label must
# exist, because lbl_manager.y_train is None while nothing has been labelled.
learner = ActiveLearner(
    estimator = model,
    X_training = lbl_manager.x_train, y_training=lbl_manager.y_train.astype(bool)
)

# Driver that ties the learner to the labelling prompt.
al_driver = ActiveLearnerDriver(learner, lbl_ui)

In [52]:
# Label three more randomly chosen pool reviews and teach the learner.
al_driver.query_random(3)

I'd always wanted David Duchovney to go into the movie business, and finally he did, and he made me proud. This movie lived up to what I had hoped for. Duchovney played his character very well, managing to remain consistent with something new, instead of playing the Agent Molder we are used to. Therefore, I give him extra credit for his role, also because I could not see anyone else playing that particular character. David was great, but nothing compared to the psychotic Timothy Hutton. A brilliant performance that you don't get tired of throughout the movie, because he never fails to surprise you. He has weaknesses, and strengths, making the story all the more believable. I also very much enjoyed the narration, it added to the story a good deal, and had some very memorable quotes that i still use to all the time. This movie also had a wounderfull score. I recomend this for anyone who likes drama, and doesn't mind blood.


Do you want to watch this? [y/n]


 y


In [56]:
# Let the learner choose a pool review to label, then teach it the answer.
al_driver.query_n(1)

No kind words come to mind looking back on whatever it was I just fast-forwarded. And I just can't believe anyone would write kindly about it which was the reason I gave this flick a shot. Silly me ey?<br /><br />And just to counter the obvious statements in the "worth a watch" section; it being low budget doesn't give it merit by default. Sewage is sewage and a 100$ turd reeks just as much as one in the lower pricerange.<br /><br />But...something that is worth a watch is also worth a telling, right? so allow me to just sum up this movie for u then: (spoiler, but please be spoiled)<br /><br />-Guy goes to a mountain shed in the snow to undergo an experiment with a doctor, guy gets killed, guy gets revived but now his eyes look exactly like those of master Poo in Kung Fu, guy gets killed another 9 times, gets revived 9 more times and then he walks back down the mountain. Oh..and the dead guy in the snow turned out to be the real doctor.-<br /><br />Honestly folks:..thazzall...<br /><br

 n


In [57]:
# Show the labels currently held in memory.
lbl_manager.labels

Unnamed: 0_level_0,wanted
id,Unnamed: 1_level_1
69421,0
57630,0
63484,0
12,0
13,0
18,0
19,0
29049,0
77172,0
49992,1


In [47]:
# Manual version of the query step: ask the learner which pool instance to
# label next and display the returned pool position.
query_idx, query_inst = learner.query(lbl_manager.x_pool)
query_idx

array([0], dtype=int64)

In [34]:
# Manual version of the labelling step: prompt for the queried positions and
# keep the returned embeddings, positions and answers.
x, ids, results = lbl_ui.query_batch(query_idx)

Ben, (Rupert Grint), is a deeply unhappy adolescent, the son of his unhappily married parents. His father, (Nicholas Farrell), is a vicar and his mother, (Laura Linney), is ... well, let's just say she's a somewhat hypocritical soldier in Jesus' army. It's only when he takes a summer job as an assistant to a foul-mouthed, eccentric, once-famous and now-forgotten actress Evie Walton, (Julie Walters), that he finally finds himself in true 'Harold and Maude' fashion. Of course, Evie is deeply unhappy herself and it's only when these two sad sacks find each other that they can put their mutual misery aside and hit the road to happiness.<br /><br />Of course it's corny and sentimental and very predictable but it has a hard side to it, too and Walters, who could sleep-walk her way through this sort of thing if she wanted, is excellent. It's when she puts the craziness to one side and finds the pathos in the character, (like hitting the bottle and throwing up in the sink), that she's at her b

 n


In [35]:
# Manual version of the teaching step: pass the newly labelled batch to the learner.
learner.teach(x, np.array(results).astype(bool), only_new=False)

In [45]:
# Show the rows currently stored in the label table.
pd.read_sql(f"select * from {label_table}", db)

Unnamed: 0,id,wanted
0,69421,0
1,57630,0
2,63484,0
3,12,0
4,13,0
5,18,0
6,19,0


In [44]:
# Write the in-memory labels to the label table, replacing the existing table.
# Unlike LabelManagement.save_labels, this does not create the unique index on id.
lbl_manager.labels.to_sql(label_table, db, if_exists = 'replace')

In [41]:
# Print the documentation of DataFrame.to_sql (exploratory lookup).
help(lbl_manager.labels.to_sql)

Help on method to_sql in module pandas.core.generic:

to_sql(name: 'str', con, schema=None, if_exists: 'str' = 'fail', index: 'bool_t' = True, index_label=None, chunksize=None, dtype: 'DtypeArg | None' = None, method=None) -> 'None' method of pandas.core.frame.DataFrame instance
    Write records stored in a DataFrame to a SQL database.
    
    Databases supported by SQLAlchemy [1]_ are supported. Tables can be
    newly created, appended to, or overwritten.
    
    Parameters
    ----------
    name : str
        Name of SQL table.
    con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
        Using SQLAlchemy makes it possible to use any DB supported by that
        library. Legacy support is provided for sqlite3.Connection objects. The user
        is responsible for engine disposal and connection closure for the SQLAlchemy
        connectable See `here                 <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
    
    schema : str, optional
  

In [None]:


# Template of the query / teach loop. `X_pool` and `y_new` are not defined in
# the visible notebook, so this cell is not runnable as written.
# NOTE(review): looks like a snippet kept for reference -- confirm whether it can be removed.

# query for labels
query_idx, query_inst = learner.query(X_pool)

# ...obtaining new labels from the Oracle...

# supply label for queried instance
learner.teach(X_pool[query_idx], y_new)