In [None]:
from functools import reduce
import re
import csv
from tqdm import tqdm
from BTrees.OOBTree import OOBTree

In [None]:
def normalize(text):
    """Strip punctuation from ``text`` and convert it to lowercase.

    Word characters (letters, digits, underscore), whitespace and hyphens
    are kept; every other character is removed.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with punctuation removed.
    """
    # Inside a character class only a leading '^' negates; the previous
    # pattern r'[^\w\s^-]' carried a second, literal '^', so carets were
    # unintentionally preserved. The trailing '-' is a literal hyphen.
    return re.sub(r'[^\w\s-]', '', text).lower()


def tokenize(content) -> list:
    """Return the whitespace-separated tokens of the normalized ``content``."""
    cleaned = normalize(content)
    tokens = cleaned.split()
    return tokens


class InvertedIndex:
    """Inverted index mapping each term to a postings list of document ids.

    Terms are stored in ``self.btree`` (an ``OOBTree``); each value is a
    ``PostingsList`` (defined outside this class).
    """

    def __init__(self) -> None:
        # Use a B-tree to make index updates faster.
        self.btree = OOBTree()

    @classmethod
    def _build(cls, corpus, max_size, extract_terms) -> 'InvertedIndex':
        """Build an index whose terms come from ``extract_terms(description)``.

        ``extract_terms`` must yield each term at most once per document so
        that every (term, document) pair is merged exactly once.
        """
        terms = {}  # temporary dict holding the index while it is built
        # ``max_size`` only sets the progress-bar total; 0 means "unknown".
        for doc_id, content in enumerate(tqdm(corpus, total=max_size or None)):
            for term in extract_terms(content.description):
                plist = PostingsList.from_doc_id(doc_id)
                if term in terms:
                    terms[term].merge(plist)
                else:
                    terms[term] = plist
        idx = cls()
        idx.btree.update(terms)
        return idx

    @staticmethod
    def _biwords(tokens) -> list:
        """Return the distinct adjacent-token pairs of ``tokens``, in order.

        Each pair is the plain concatenation of the two tokens.
        """
        # NOTE(review): concatenating without a separator is ambiguous
        # ('a'+'bc' == 'ab'+'c'). The key format is kept unchanged because
        # code that looks biwords up must build identical keys.
        pairs = (first + second for first, second in zip(tokens, tokens[1:]))
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(pairs))

    @classmethod
    def from_corpus(cls, corpus, max_size=0) -> 'InvertedIndex':
        """Build a single-term index from ``corpus``.

        Args:
            corpus: Iterable of documents exposing a ``description`` string;
                a document's id is its position in the iteration.
            max_size: Expected number of documents, used only as the
                progress-bar total (0 leaves the total unspecified).

        Returns:
            A new index with one entry per distinct token.
        """
        return cls._build(corpus, max_size, lambda text: set(tokenize(text)))

    # Build the biword index used for phrase queries.
    @classmethod
    def from_corpus_biword(cls, corpus, max_size=0) -> 'InvertedIndex':
        """Build a biword index (adjacent token pairs) from ``corpus``.

        Takes the same arguments as ``from_corpus``. Each distinct biword of
        a document is merged once, mirroring the per-document de-duplication
        ``from_corpus`` performs with ``set``.
        """
        return cls._build(
            corpus, max_size, lambda text: cls._biwords(tokenize(text)))

    def merge(self, other: 'InvertedIndex') -> 'InvertedIndex':
        """Merge ``other`` into this index in place and return ``self``.

        Postings of terms present in both indexes are merged; for terms only
        in ``other`` the postings object is stored as-is (shared, not copied).
        """
        for term, postings in other.btree.items():
            if term in self.btree:
                self.btree[term].merge(postings)
            else:
                self.btree[term] = postings
        return self

    # The annotation is quoted so that defining this class does not require
    # ``PostingsList`` to be bound yet (annotations are evaluated at def time).
    def __getitem__(self, key: str) -> 'PostingsList':
        """Return the postings list stored for ``key``.

        Raises:
            KeyError: If ``key`` is not in the index.
        """
        return self.btree[key]

    def __len__(self) -> int:
        """Return the number of distinct terms in the index."""
        return len(self.btree)

    def __repr__(self) -> str:
        """Return a short summary of the index."""
        # __repr__ must return a str; returning the tree object itself made
        # repr(index) raise TypeError.
        return f"{type(self).__name__}(terms={len(self.btree)})"

In [67]:
# Load the corpus from the movie metadata and plot-summary files.
# NOTE(review): read_movie_description is defined outside this excerpt;
# presumably it returns items exposing a `.description` text — confirm.
corpus = read_movie_description(
    '../Code IR/data/movie.metadata.tsv', '../Code IR/data/plot_summaries.txt')

In [71]:
# Build the retrieval system from the loaded corpus
# (IrSystem is defined outside this excerpt).
ir = IrSystem.from_corpus(corpus)

  0%|          | 0/42204 [00:00<?, ?it/s]

100%|██████████| 42204/42204 [00:04<00:00, 8917.93it/s]
100%|██████████| 42204/42204 [00:35<00:00, 1172.88it/s]


In [72]:
# Run a phrase query for a three-word phrase.
# NOTE(review): presumably answered through the biword index — confirm in IrSystem.
ir.phrase_query('speak during meetings')

[Lord of the Flies]

In [73]:
# Print how many results each single-term query returns.
print(len(ir.query('yoda')), len(ir.query(
    'luke')), len(ir.query('wars')))

13 161 179


In [74]:
# Display the results of the single-term query 'luke'
# (query semantics live in IrSystem, defined outside this excerpt).
ir.query('luke')

[Afghan Luke,
 Daisy Town,
 Decoys 2: Alien Seduction,
 Out Cold,
 2:37,
 Lilies of the Field,
 Scumbus,
 Death of a Gunfighter,
 Fatty and Mabel Adrift,
 Santa Baby,
 The Boys Club,
 SpaceCamp,
 Undiscovered,
 Fast Five,
 Star Wars Episode V: The Empire Strikes Back,
 Dual,
 Angels and Demons,
 Children of Men,
 Spiderhole,
 Spike and Suzy: The Texas Rangers,
 Children of the Corn V: Fields of Terror,
 Stagecoach,
 Animal Kingdom,
 The Prince of Tides,
 The Dukes of Hazzard: Reunion!,
 Vanishing on 7th Street,
 Green Light,
 Still Crazy,
 Coming Home,
 Decoys,
 Halloween Resurrection,
 Imaginationland Episode II,
 Slaves,
 Jennifer,
 Nagarangalil Chennu Raparkam,
 Star Wars Episode IV: A New Hope,
 Memphis Belle,
 Wishology,
 The Wendell Baker Story,
 The Little Troll Prince: A Christmas Parable,
 Mustang Country,
 Macon County Line,
 The Long Kiss Goodnight,
 The Dukes of Hazzard: Hazzard in Hollywood!,
 A Woman's Secret,
 No Name on the Bullet,
 Tanner on Tanner,
 The Toy that Saved

In [77]:
# Display the results of the single-term query 'afghanistan'
# (query semantics live in IrSystem, defined outside this excerpt).
ir.query('afghanistan')

[Getting Even,
 Agent Vinod,
 Afghan Luke,
 Brothers,
 Charlie Wilson's War,
 The Veteran,
 Afghan Breakdown,
 Iron Man,
 Summer Heat,
 New Year's Eve,
 The Minion,
 The Storm,
 The Objective,
 Zombie Strippers,
 Outlaw,
 If I Should Fall,
 Dharmatma,
 The Christmas Card,
 The Boy Mir,
 The Hard Corps,
 Main Osama,
 The Whistleblower,
 Where in the World is Osama Bin Laden?,
 All Costs Paid,
 Kabul Express,
 16 Days in Afghanistan,
 Armadillo,
 Fire Creek,
 Kabuliwala,
 Keerthi Chakra,
 Rambo,
 Aegan,
 9th Company,
 Netaji Subhas Chandra Bose: The Forgotten Hero,
 Qayamat - A Love Triangle In Afghanistan,
 The Kite Runner,
 Beyond the Call,
 Stealing a Nation,
 Lions for Lambs,
 Afghan Massacre - the Convoy of Death,
 Afghan Muscles,
 Rambo III,
 Homeland Security,
 Kim,
 Savages,
 My Name is Khan,
 The Great Mouse Detective,
 Watchmen,
 Brothers,
 DC 9/11: Time of Crisis,
 Khamosh Pani,
 Fahrenheit 9/11,
 La Linea,
 Baran,
 Mission Istanbul,
 Enchantment,
 Osama,
 25 Hill,
 The best M