# Pipeline de prueba - Offline

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTyOB7wa_P8nM2II_U_PEYfU1VlJH8Yrb2CUQ&s)

In [54]:
import mlflow

import polars as pl

from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

# 0. Config

In [51]:
TextEmbedding.list_supported_models()

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [52]:
QUERY = "Space movies"
MODEL_NAME = "ltr-dsrpflix-prd"
EMBEDDINGS_MODEL="sentence-transformers/all-MiniLM-L6-v2"
COLLECTION_NAME = "imdb-plot-embeddings"

## 0.1 Qdrant

In [None]:
client = QdrantClient("http://localhost:6333")
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=384,
        distance=models.Distance.COSINE
    )
)

In [33]:
movies_db = pl.read_parquet("data/complete_imdb_database.parquet")
vector_movies_db = movies_db.with_columns(
    [
       pl.concat_str(
        [
            pl.col("title"),
            pl.col("Plot"),
        ],
        separator=". ",
    ).alias("full_sentence"),
    ]
).select([
    "imdb_id",
    "full_sentence"
])
vector_movies_db.head()

imdb_id,full_sentence
str,str
"""tt0002423""","""Passion. The story of Madame D‚Ä¶"
"""tt0004181""","""Judith of Bethulia. A fascinat‚Ä¶"
"""tt0004465""","""The Perils of Pauline. Young P‚Ä¶"
"""tt0003643""","""The Avenging Conscience: or 'T‚Ä¶"
"""tt0002844""","""Fant√¥mas: In the Shadow of the‚Ä¶"


In [46]:
points[0]

PointStruct(id='tt0002423', vector=Document(text='Passion. The story of Madame DuBarry, the mistress of Louis XV of France, and her loves in the time of the French revolution.', model='sentence-transformers/all-MiniLM-L6-v2', options=None), payload=None)

In [48]:
range(round(len(points) / BATCH))

range(0, 94)

In [None]:
points[:1]

In [None]:
points = []
for row in  vector_movies_db.iter_rows():
    try:
        point = models.PointStruct(
            id=row[0],
            vector=models.Document(text=row[1], model=TextEmbedding(EMBEDDINGS_MODEL))
        )
        points.append(point)
    except:
        continue

BATCH = 500
batch_counter = 0
for batch in range(round(len(points) / BATCH)):
    client.upsert(
        collection_name=COLLECTION_NAME, 
        points=points[batch_counter: batch_counter + BATCH +1]
    )
    batch_counter += BATCH

Fetching 5 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:05<00:00,  1.02s/it]


# 1. Retrieval Semantico

In [None]:
def search():
    pass

# 2. Re-Ranking modelo LTR

In [None]:
FEATURE_COLS =  [
    "sim_embedding",
    "imdb_rating",
    "imdb_votes_log",
]

LTR_DB = pl.read_parquet("data/ltr_imdb_dataset.parquet")
RETRIEVAL_MOVIES = LTR_DB.filter(pl.col("imdb_id").is_in(retrieved_ids)).select(FEATURE_COLS).to_numpy()

In [4]:
def rank():
    prod_model = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}@champion")
    scores = prod_model.predict(RETRIEVAL_MOVIES)
    LTR_DB.with_columns([
        scores.alias("predicted_Scores")
    ]).head(10)["imdb_id"]

    return ids

# 3. IMDB API

In [7]:
!curl -X 'GET' \
  'https://api.imdbapi.dev/titles/tt0116996' \
  -H 'accept: application/json'

{"id":"tt0116996", "type":"movie", "primaryTitle":"Mars Attacks!", "primaryImage":{"url":"https://m.media-amazon.com/images/M/MV5BZjJhNTIxZmQtZjQ3ZC00MTZkLWI2YTItY2Y0ZTdiMmNiNGIyXkEyXkFqcGc@._V1_.jpg", "width":800, "height":1209}, "startYear":1996, "runtimeSeconds":6360, "genres":["Comedy", "Sci-Fi"], "rating":{"aggregateRating":6.4, "voteCount":256068}, "metacritic":{"score":52, "reviewCount":19}, "plot":"Earth is invaded by Martians with unbeatable weapons and a cruel sense of humor.", "directors":[{"id":"nm0000318", "displayName":"Tim Burton", "primaryImage":{"url":"https://m.media-amazon.com/images/M/MV5BZmFhZTljMTgtMTVmMS00NWFhLWE2ZDEtYjM5YWYwZmI0NWI1XkEyXkFqcGc@._V1_.jpg", "width":1107, "height":1639}, "primaryProfessions":["miscellaneous", "producer", "writer"]}], "writers":[{"id":"nm0114083", "displayName":"Len Brown", "primaryImage":{"url":"https://m.media-amazon.com/images/M/MV5BNDMxYTYwNzItNjEyZS00ZmM2LWIwMGEtMmNiMGE1MDJjZTljXkEyXkFqcGc@._V1_.jpg", "width":1841, "height":190

In [14]:
import requests
from IPython.display import Image, display, Markdown

def show_movie_by_id(movie_id: str):
    """
    Fetch movie data from IMDb API and display title, plot and image
    in a Jupyter Notebook.
    """
    url = f"https://api.imdbapi.dev/titles/{movie_id}"

    response = requests.get(
        url,
        headers={"accept": "application/json"}
    )

    if response.status_code != 200:
        display(Markdown(f"‚ùå Error fetching movie `{movie_id}`"))
        return

    data = response.json()

    title = data.get("primaryTitle", "Unknown title")
    plot = data.get("plot", "No plot available")
    image_url = data.get("primaryImage", {}).get("url")

    # Display nicely in notebook
    display(Markdown(f"## üé¨ {title}"))
    display(Markdown(f"**Plot:** {plot}"))

    if image_url:
        display(Image(url=image_url, width=300))
    else:
        display(Markdown("_No image available_"))



In [15]:
show_movie_by_id(movie_id="tt0116996")

## üé¨ Mars Attacks!

**Plot:** Earth is invaded by Martians with unbeatable weapons and a cruel sense of humor.