In [1]:
import os
import random

from ir_system import IrSystem
from movie_description import read_movie_description


In [2]:
# Load the full corpus from the movie metadata file and the plot-summary file.
corpus = read_movie_description(
    '../Code IR/data/movie.metadata.tsv',
    '../Code IR/data/plot_summaries.txt',
)

In [3]:
# Build the retrieval system over the full corpus loaded above.
ir = IrSystem.from_corpus(corpus)

100%|██████████| 42204/42204 [00:11<00:00, 3708.98it/s]
100%|██████████| 42204/42204 [00:13<00:00, 3121.16it/s]


In [4]:
# Phrase query on the full index; the bare expression displays the returned titles.
# NOTE(review): presumably matches the words as a consecutive phrase — confirm in IrSystem.
ir.phrase_query('speak during meetings')

[Lord of the Flies]

In [5]:
# Phrase query for a very common phrase; the recorded output lists many titles.
ir.phrase_query('the plot is')

[Amici miei,
 Wiggle Time,
 Battling with Buffalo Bill,
 Dobrynya Nikitich and Zmey Gorynych,
 A Pleasant Journey,
 Narendra Makan Jayakanthan Vaka,
 La Tía Alejandra,
 Kasoor,
 Tarzan and the Leopard Woman,
 Anne of Green Gables,
 Vanakkam Thalaiva,
 Little Lili,
 Technotise Edit & I,
 Stalingrad,
 India: Kingdom of the Tiger,
 The Marshal of Windy Hollow,
 The Burning Train,
 Yavanika,
 Like a Dragon,
 Heartbreaker,
 Book Revue,
 Acid Factory,
 Bombay Talkie,
 The Butler's in Love,
 Вчера,
 Amen.,
 Libertas,
 Journey to the Beginning of Time,
 Blitz Wolf,
 Quest of the Delta Knights,
 Jakob the Liar,
 Return to Oz,
 Oh My God,
 Everyone's Hero,
 California Dreamin',
 Once Upon a Crime...,
 Mahaul Theek Hai,
 The Mahabharata,
 Blowup,
 Ghost Story,
 Fatal Termination,
 Vinayaka Chaviti,
 The Story of Osaka Castle,
 The Interpreter,
 The 5th Quarter,
 How High,
 Aayushkalam,
 Nights and Days,
 Land of Doom,
 How the Grinch Stole Christmas!,
 Paramanandayya Shishyula Katha,
 Sweeney 2,


In [6]:
# Count the results of three single-term queries and print the counts on one line.
hit_counts = [len(ir.query(term)) for term in ('yoda', 'luke', 'wars')]
print(*hit_counts)

13 161 179


In [7]:
# Query mixing terms with the keywords AND / NOT.
# NOTE(review): presumably parsed as boolean operators by IrSystem.query — confirm.
ir.query('boys AND dogs NOT cat')

[Dog Days,
 The Boys from Brazil,
 The Damned,
 Tintin and the Blue Oranges,
 The Sandlot,
 The Dog Who Saved Christmas Vacation,
 Our Idiot Brother,
 Band of the Hand,
 FernGully 2: The Magical Rescue,
 Eden Lake,
 Calling All Curs,
 The Breed,
 The Wild Child,
 The X Files 2,
 Oliver Twist,
 I Was Born But...,
 Simon Says,
 The Hangnail,
 An Inn in Tokyo,
 Men In Fright,
 The Return of a Man Called Horse,
 The Sandlot 2,
 South Park: Bigger, Longer & Uncut,
 Air Buddies,
 Red,
 Sorority Boys,
 Barbie and the Diamond Castle,
 Wilderness,
 Red State,
 Blue Collar Comedy Tour: One For the Road,
 Zombies on Broadway,
 Lottery Ticket,
 The Odd Couple]

In [8]:
# Single-term query on the full index; the cell displays the returned titles.
ir.query('hello')

[Dark Water,
 House,
 The Jazz Singer,
 A Witch's Tangled Hare,
 Look Who's Talking Too,
 The Life of Reilly,
 Bimbo's Express,
 Hansel and Gretel,
 Phone Booth,
 Queen of the Damned,
 Ghost Town,
 Ghosts,
 Touchstone: Dancing With Angels,
 The Weather Man,
 The Friendly Ghost,
 Say Hello to Yesterday,
 Grave Encounters,
 Duets,
 The Strangers,
 Hello Dolly!,
 Martian Through Georgia,
 The Daffy Doc,
 A Star Is Born,
 Heartlands,
 Crazy People,
 WALL-E,
 A Star Is Born,
 Sweet Home Alabama,
 Takeshis',
 Closer,
 Old School,
 The Nutcracker Prince,
 Jazzin' for Blue Jean,
 Prom Night II,
 The Day After,
 Bugs and Thugs,
 I Could Go On Singing,
 Din of Celestial Birds,
 A Fine Feathered Frenzy,
 One False Move,
 Psychomania,
 Trapped in the Closet Chapters 13–22,
 Easy Money,
 Atlantic Rhapsody,
 Love Bites,
 Tom and Cherie,
 Sleepless in Seattle,
 Yummy Yummy,
 The Adventure of Iron Pussy,
 Tetsuo: The Iron Man,
 An Affair to Remember,
 Motel Hell,
 I'm Here,
 28 Days Later,
 Jerry Magu

In [9]:
# === STEP 1: load the plot descriptions ===
# Each usable line is "<movie_id>\t<description>"; lines without a tab are skipped.
descriptions = {}
with open('../Code IR/data/plot_summaries.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t', 1)
        if len(parts) != 2:
            continue
        movie_id, desc = parts
        descriptions[movie_id] = desc

# === STEP 2: load the titles from the metadata file ===
# The ID is taken from the first tab-separated field and the title from the third;
# lines with fewer than three fields are skipped.
titles = {}
with open('../Code IR/data/movie.metadata.tsv', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) < 3:
            continue
        movie_id, title = parts[0], parts[2]
        titles[movie_id] = title

# === STEP 3: IDs present in both files, in a reproducible random order ===
# The IDs are sorted before shuffling because the iteration order of a set of
# strings can differ between interpreter runs (hash randomization); shuffling
# an unsorted list would give different splits on every kernel restart even
# with a fixed seed.
SPLIT_SEED = 42
common_ids = sorted(set(descriptions.keys()) & set(titles.keys()))
# A dedicated Random instance keeps the global random state untouched.
random.Random(SPLIT_SEED).shuffle(common_ids)

# === STEP 4: cut the shuffled IDs into three consecutive parts ===
n = len(common_ids)
first_cut, second_cut = n // 3, 2 * n // 3
part1_ids, part2_ids, part3_ids = (
    common_ids[:first_cut],
    common_ids[first_cut:second_cut],
    common_ids[second_cut:],
)

# === Create the output folder if it does not exist yet ===
output_dir = "splits"
os.makedirs(output_dir, exist_ok=True)

# === Helper that writes one split file ===
def write_split(filename, ids, data_dict, pad_column=False):
    """Write one tab-separated line per ID into ``filename`` under ``output_dir``.

    Each line is the ID, a separator, ``data_dict[ID]`` and a newline. The
    separator is a single tab, or two tabs when ``pad_column`` is true, which
    leaves an empty middle column so the value ends up in the third field.

    Args:
        filename: Name of the file to create inside the module-level ``output_dir``.
        ids: Iterable of IDs to write, in order.
        data_dict: Mapping from ID to the text written after the separator.
        pad_column: When true, insert an empty column between ID and value.

    Raises:
        KeyError: If an ID from ``ids`` is not a key of ``data_dict``.
    """
    separator = "\t\t" if pad_column else "\t"
    target_path = os.path.join(output_dir, filename)
    with open(target_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{movie_id}{separator}{data_dict[movie_id]}\n" for movie_id in ids
        )

# === Write the description splits ===
for split_name, split_ids in (
    ('descriptions_part1.txt', part1_ids),
    ('descriptions_part2.txt', part2_ids),
    ('descriptions_part3.txt', part3_ids),
):
    write_split(split_name, split_ids, descriptions)

# === Write the title splits, with an empty column in the middle ===
for split_name, split_ids in (
    ('titles_part1.txt', part1_ids),
    ('titles_part2.txt', part2_ids),
    ('titles_part3.txt', part3_ids),
):
    write_split(split_name, split_ids, titles, pad_column=True)

In [10]:
def merge_two_parts(file1, file2, output_file):
    """Concatenate two line-oriented text files into ``output_file``.

    The lines of ``file1`` are written first, followed by the lines of
    ``file2``. If the last line of an input file is not newline-terminated, a
    newline is appended so that it is not fused with the first record of the
    next file. Inputs that already end with a newline are copied unchanged.

    Args:
        file1: Path of the first input file (UTF-8 text).
        file2: Path of the second input file (UTF-8 text).
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, 'w', encoding='utf-8') as fout:
        for fname in (file1, file2):
            last_line = ''
            with open(fname, 'r', encoding='utf-8') as fin:
                for line in fin:
                    fout.write(line)
                    last_line = line
            # An empty file leaves last_line == '' and contributes nothing.
            if last_line and not last_line.endswith('\n'):
                fout.write('\n')

# Merge parts 1 and 2 of the descriptions, then of the titles, into the "AB" files.
for first_part, second_part, merged_path in (
    ('splits/descriptions_part1.txt', 'splits/descriptions_part2.txt', 'splits/descriptionsAB.txt'),
    ('splits/titles_part1.txt', 'splits/titles_part2.txt', 'splits/titlesAB.txt'),
):
    merge_two_parts(first_part, second_part, merged_path)

In [11]:
# Load the corpus made of the merged parts 1 and 2.
corpusAB = read_movie_description(
    'splits/titlesAB.txt',
    'splits/descriptionsAB.txt',
)

In [12]:
# Build a second retrieval system from the merged parts 1+2 corpus only.
irAB = IrSystem.from_corpus(corpusAB)

100%|██████████| 28136/28136 [00:11<00:00, 2523.43it/s]
100%|██████████| 28136/28136 [00:16<00:00, 1746.84it/s]


In [13]:
# Load the corpus made of part 3 only.
corpusC = read_movie_description(
    'splits/titles_part3.txt',
    'splits/descriptions_part3.txt',
)

In [14]:
# Number of entries loaded for part 3.
len(corpusC)

14066

In [15]:
# Number of entries loaded for the merged parts 1+2.
len(corpusAB)

28136

In [16]:
# Size of the corpus held by irAB before any documents are added
# (reads the private `_corpus` attribute directly).
len(irAB._corpus)

28136

In [17]:
# Add the part-3 documents to the existing index.
# The call returns an IrSystem object, whose repr is what the cell displays.
irAB.add_docs(corpusC)

42202it [00:03, 3669.89it/s]
42202it [00:10, 1356.01it/s]


<ir_system.IrSystem at 0x3387655d0>

In [18]:
# Phrase query on the extended index; in the recorded run it returned no matches.
irAB.phrase_query('inherits a retirement')

[]

In [19]:
# Size of the corpus held by irAB after add_docs (private `_corpus` attribute).
len(irAB._corpus)

42202

In [20]:
# Delete a contiguous run of document IDs from the index.
# NOTE(review): the bounds are hard-coded; presumably they are meant to cover
# the second split — confirm against the split sizes and the ID scheme of
# IrSystem (the end bound 28135 is exclusive).
irAB.delete_docs(list(range(14069, 28135)))

<ir_system.IrSystem at 0x3387655d0>

In [21]:
# Size of corpusAB after the add/delete calls on irAB.
# NOTE(review): the recorded value (42202) differs from the 28136 shown when
# corpusAB was first measured, which suggests irAB shares this object and
# add_docs extended it in place, while delete_docs did not shrink it — confirm.
len(corpusAB)

42202

In [22]:
# Phrase query on irAB after the deletion; the cell displays the returned titles.
irAB.phrase_query('alien serial killer')

[The Borrower]