In [1]:
import os
import time

import docker
import numpy as np
from psycopg import connect
from psycopg.rows import dict_row
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Settings for the throw-away pgvector container used by this notebook.
# Every value can be overridden through an environment variable, so real
# credentials never have to be typed into the notebook; the fallbacks
# reproduce the original demo configuration.
PG_IMG   = os.environ.get("PGV_DEMO_IMAGE", "pgvector/pgvector:pg16")
PG_NAME  = os.environ.get("PGV_DEMO_NAME", "pgv-demo")
PG_PORT  = int(os.environ.get("PGV_DEMO_PORT", "5431"))   # port on the host
PG_USER  = os.environ.get("PGV_DEMO_USER", "admin")
PG_PASS  = os.environ.get("PGV_DEMO_PASSWORD", "secret")
PG_DB    = os.environ.get("PGV_DEMO_DB", "testdb")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = docker.from_env()

# Start from a clean slate: drop any leftover container with the same name.
try:
    stale = client.containers.get(PG_NAME)
    stale.remove(force=True)
except docker.errors.NotFound:
    pass  # no previous container, nothing to clean up

# Environment handed to the container.
pg_env = {
    "POSTGRES_USER": PG_USER,
    "POSTGRES_PASSWORD": PG_PASS,
    "POSTGRES_DB": PG_DB,
}

# Run detached and publish the container's 5432/tcp on PG_PORT of the host.
container = client.containers.run(
    PG_IMG,
    name=PG_NAME,
    environment=pg_env,
    ports={"5432/tcp": PG_PORT},
    detach=True,
    remove=False,
)

In [6]:
# The container was started detached, so the server inside may not be
# accepting connections yet. Retry psql until it exits with 0 and fail loudly
# after the deadline, rather than printing the success message unconditionally.
create_ext_cmd = (
    f'psql -h localhost -U {PG_USER} -d {PG_DB} '
    '-c "CREATE EXTENSION IF NOT EXISTS vector;"'
)
ext_deadline = time.monotonic() + 60  # seconds to wait for the server
while True:
    exit_code, output = container.exec_run(create_ext_cmd, user="postgres")
    if exit_code == 0:
        break
    if time.monotonic() >= ext_deadline:
        raise RuntimeError(
            "CREATE EXTENSION vector failed: "
            + (output or b"").decode("utf-8", errors="replace")
        )
    time.sleep(1)
print("Расширения установлены")

Расширения установлены


In [4]:
import nltk
import re

# Make sure the movie_reviews corpus is available locally before it is imported.
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Identifiers of the review files in the corpus; used below to read raw texts.
fileids = movie_reviews.fileids()

def tidy(s: str, max_len: int = 500) -> str:
    """Normalise whitespace and punctuation spacing, then truncate.

    Args:
        s: Raw text, possibly containing newlines and tokenised punctuation
            (e.g. ``"word , word ( note )"``).
        max_len: Maximum number of characters kept in the result. Must be
            non-negative; pass ``None`` to disable truncation. Defaults to
            500, the cut-off this notebook has always used.

    Returns:
        The cleaned text, at most ``max_len`` characters long.
    """
    s = " ".join(s.split())                          # collapse every kind of whitespace / line break
    s = re.sub(r"\s+([.,;:!?…])", r"\1", s)          # drop spaces BEFORE . , ; : ! ? …
    s = re.sub(r"([(\[{«“])\s+", r"\1", s)           # drop spaces AFTER ( [ { « “
    s = re.sub(r"\s+([)\]}»”])", r"\1", s)           # drop spaces BEFORE ) ] } » ”
    return s.strip()[:max_len]

# Number of reviews kept for the demo.
N_DOCS = 100

# Slice the file list first so only the reviews that are actually kept get
# read and cleaned, instead of tidying every review and discarding the rest.
texts = [tidy(movie_reviews.raw(fid)) for fid in fileids[:N_DOCS]]

# tidy() has already collapsed newlines, so the preview needs no extra cleanup.
print("Документов:", len(texts), "\nПример:", texts[2][:50] + "…")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/shcher/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Документов: 100 
Пример: it is movies like these that make a jaded movie vi…


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Learn the vocabulary and vectorise the corpus in one pass; the original
# fitted the vectorizer twice (fit, then fit_transform) for the same result.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(texts)

# toarray() already returns a dense 2-D (documents x vocabulary) array, so no
# stacking is needed. The column count is the vector dimension of the table.
embs = X_tfidf.toarray()
EMB_DIM = embs.shape[1]
print(embs.shape)

DDL = f"""
DROP TABLE IF EXISTS docs;
CREATE TABLE docs (
  id   BIGSERIAL PRIMARY KEY,
  text TEXT NOT NULL,
  emb  VECTOR({EMB_DIM}) NOT NULL
);
"""

# One (text, vector) pair per document, in corpus order.
rows = [(text, emb.tolist()) for text, emb in zip(texts, embs)]

# Recreate and fill the table over a single connection. The psycopg connection
# context manager commits on a clean exit, so the table is replaced and
# populated in one transaction.
with connect(host="localhost", port=PG_PORT, user=PG_USER,
             password=PG_PASS, dbname=PG_DB) as con:
    with con.cursor() as cur:
        cur.execute(DDL)
        cur.executemany(
            "INSERT INTO docs (text, emb) VALUES (%s, %s)",
            rows
        )

(100, 2820)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def search_tfidf(query, k=5):
    """Return the ``k`` documents most similar to ``query`` under TF-IDF.

    Reads the notebook-level ``vectorizer``, ``embs`` and ``texts``. ``embs``
    must still hold the TF-IDF matrix when this is called: the BERT section
    of the notebook rebinds the same name.

    Args:
        query: Free-text query string.
        k: Number of hits to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts ordered by decreasing cosine similarity, each with
        ``'score'``, ``'id'`` (0-based position in ``texts``) and ``'snippet'``
        (first 100 characters of the document followed by an ellipsis).
    """
    # Guard: with k == 0 the slice [-k:] would select every document.
    if k <= 0:
        return []

    query_vec = vectorizer.transform([query])
    score = cosine_similarity(query_vec, embs).flatten()

    # Descending order first, then keep the best k.
    related_docs_indices = score.argsort()[::-1][:k]

    return [
        {
            'score': score[i],
            'id': i,
            'snippet': texts[i][:100] + '…'
        }
        for i in related_docs_indices
    ]

# Natural-language probes used to eyeball retrieval quality.
queries = [
    "A heartfelt drama about family relationships",
    "Sci-fi movie with space travel and aliens",
    "Hilarious comedy with witty dialogues",
    "Action film with car chases and explosions",
    "Critique of poor acting and weak plot",
]

print("\n=== ПОИСК С ИСПОЛЬЗОВАНИЕМ TF-IDF ===")
for q in queries:
    print(f"\n=== QUERY: {q} ===")
    hits = search_tfidf(q, k=5)
    # Ranks are shown 1-based.
    for i, r in enumerate(hits, start=1):
        print(f"{i}. score={r['score']:.3f}   id={r['id']}  :: {r['snippet']}")

# Verdict: so-so.


=== ПОИСК С ИСПОЛЬЗОВАНИЕМ TF-IDF ===

=== QUERY: A heartfelt drama about family relationships ===
1. score=0.171   id=8  :: call it a road trip for the walking wounded. stellan skarsg? rd plays such a convincingly zombified …
2. score=0.103   id=6  :: so ask yourself what " 8mm " (" eight millimeter ") is really all about. is it about a wholesome sur…
3. score=0.068   id=40  :: lengthy and lousy are two words to describe the boring drama the english patient. great acting, musi…
4. score=0.062   id=71  :: there's a 1, 000-foot tidal wave at the end of deep impact. i'd say it pretty accurately represents …
5. score=0.059   id=90  :: various films seen at the seattle film festival: it's all true/three men on a raft (u. s. a., 1942/1…

=== QUERY: Sci-fi movie with space travel and aliens ===
1. score=0.227   id=93  :: numerous comparisons can be made with this movie to past sci-fi, suspense thrillers. soldier is a mu…
2. score=0.208   id=62  :: there are two things the american film indu

# BERT

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Ask the model for its output size instead of hard-coding it, so the table
# definition cannot drift from the model when the model name is changed
# (384 for all-MiniLM-L6-v2).
EMB_DIM = model.get_sentence_embedding_dimension()

DDL = f"""
DROP TABLE IF EXISTS docs;
CREATE TABLE docs (
  id   BIGSERIAL PRIMARY KEY,
  text TEXT NOT NULL,
  emb  VECTOR({EMB_DIM}) NOT NULL
);
"""

# Recreate the table sized for the sentence embeddings. The psycopg connection
# context manager commits when the block exits without an exception.
with connect(host="localhost", port=PG_PORT, user=PG_USER,
             password=PG_PASS, dbname=PG_DB) as con:
    with con.cursor() as cur:
        cur.execute(DDL)

def encode_texts(batch):
    """Embed a batch of texts with the notebook-level ``model``.

    Args:
        batch: Sequence of strings to encode.

    Returns:
        The embeddings produced by ``model.encode`` with
        ``normalize_embeddings=True``, cast to ``float32``.
    """
    vectors = model.encode(
        batch,
        batch_size=64,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return vectors.astype(np.float32)

# Encode the corpus in chunks of B texts so tqdm can report progress.
B = 10
batches = []
for i in tqdm(range(0, len(texts), B)):
    batches.append(encode_texts(texts[i:i+B]))

# Keep the per-chunk list and the final matrix under different names, so
# `embs` is always the 2-D array that the search cell expects.
embs = np.vstack(batches)

# The original had a bare `embs.shape` here, which does nothing in the middle
# of a cell. Check the shape explicitly before anything is written.
assert embs.shape == (len(texts), EMB_DIM), embs.shape

# One (text, vector) pair per document, in corpus order.
rows = [(text, emb.tolist()) for text, emb in zip(texts, embs)]

# The psycopg connection context manager commits on a clean exit.
with connect(host="localhost", port=PG_PORT, user=PG_USER,
             password=PG_PASS, dbname=PG_DB) as con:
    with con.cursor() as cur:
        cur.executemany(
            "INSERT INTO docs (text, emb) VALUES (%s, %s)",
            rows
        )

100%|██████████| 10/10 [00:02<00:00,  4.05it/s]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def search_bert(query, k=5):
    """Return the ``k`` documents most similar to ``query`` by sentence embedding.

    Reads the notebook-level ``model``, ``embs`` and ``texts``; ``embs`` must
    hold the embeddings produced by ``model`` for ``texts``.

    Args:
        query: Free-text query string.
        k: Number of hits to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts ordered by decreasing cosine similarity, each with
        ``'score'``, ``'id'`` (0-based position in ``texts``) and ``'snippet'``
        (first 100 characters of the document followed by an ellipsis).
    """
    # Guard: with k == 0 the slice [-k:] would select every document.
    if k <= 0:
        return []

    query_vec = model.encode([query], batch_size=12, show_progress_bar=False,
                             normalize_embeddings=True)
    score = cosine_similarity(query_vec, embs).flatten()

    # Descending order first, then keep the best k.
    related_docs_indices = score.argsort()[::-1][:k]

    return [
        {
            'score': score[i],
            'id': i,
            'snippet': texts[i][:100] + '…'
        }
        for i in related_docs_indices
    ]

# Natural-language probes used to eyeball retrieval quality.
queries = [
    "A heartfelt drama about family relationships",
    "Sci-fi movie with space travel and aliens",
    "Hilarious comedy with witty dialogues",
    "Action film with car chases and explosions",
    "Critique of poor acting and weak plot"
]

print("\n=== ПОИСК С ИСПОЛЬЗОВАНИЕМ BERT ===")
for q in queries:
    print(f"\n=== QUERY: {q} ===")
    for i, r in enumerate(search_bert(q, k=5), 1):
        print(f"{i}. score={r['score']:.3f}   id={r['id']}  :: {r['snippet']}")
    # The stray `search_bert(q, k=5), 1` statement that used to follow was
    # removed: it re-ran each search and threw the result away.

# Verdict: somewhat better.


=== ПОИСК С ИСПОЛЬЗОВАНИЕМ BERT ===

=== QUERY: A heartfelt drama about family relationships ===
1. score=0.358   id=63  :: would you believe -- in real life, i mean -- that if you were julia roberts, that you'd be the ugly …
2. score=0.340   id=4  :: synopsis: a mentally unstable man undergoing psychotherapy saves a boy from a potentially fatal acci…
3. score=0.334   id=0  :: plot: two teen couples go to a church party, drink and then drive. they get into an accident. one of…
4. score=0.330   id=98  :: " love to kill " starts off aimlessly and gets progressively less coherent as time passes. at the ou…
5. score=0.328   id=45  :: when it comes to the average teenage romantic comedy, i expect negative reviews from critics left an…

=== QUERY: Sci-fi movie with space travel and aliens ===
1. score=0.460   id=17  :: so what do you get when you mix together plot elements from various successful sci-fi films such as …
2. score=0.436   id=91  :: capsule: the weakest and least engaging of th