In [117]:
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
import uuid
import json
import requests
from os import environ
import time
import re

load_dotenv()

True

In [24]:
# Connect to the Qdrant instance this notebook expects on localhost:6666
qd_client = QdrantClient(url="http://localhost:6666")

# Load the character records. The file is decoded explicitly as UTF-8 so that
# non-ASCII names are read correctly regardless of the platform's default encoding.
with open('../dist/lotr_characters.json', 'r', encoding='utf-8') as file:
    characters = json.load(file)

print(f"Loaded {len(characters)} entries.")

Loaded 749 entries.


In [84]:
# --- Qdrant settings ---
collection_name = 'lotr-characters'
embedding_dimension = 512  # requested embedding size; also the collection's vector size

# --- Jina embeddings API settings ---
jina_url = "https://api.jina.ai/v1/embeddings"
jina_embedding_model = "jina-embeddings-v4"
jina_api_key = environ.get('JINA_API_KEY')  # None when the variable is not set

# Task names sent with each request: one for indexing texts, one for search queries
indexing_task, querying_task = "retrieval.passage", "retrieval.query"

In [85]:
def create_jina_embedding(input_text: str, task = indexing_task)-> list:
    """
    Embed a single text through the Jina embeddings endpoint.

    `task` is forwarded as the request's "task" field and defaults to the
    indexing task. Returns the "embedding" value of the first entry in the
    response's "data" list.

    Raises Exception when the API answers with a non-200 status or when the
    HTTP request itself fails.
    """
    payload = {
        "input": [input_text],
        "model": jina_embedding_model,
        "dimensions": embedding_dimension,
        "task": task,
        "late_chunking": True,
    }
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {jina_api_key}",
    }
    try:
        response = requests.post(url=jina_url, headers=request_headers, json=payload, timeout=30)
        # Guard clause: anything but 200 is reported with the status and raw body
        if response.status_code != 200:
            raise Exception(f"Jina API error: {response.status_code} - {response.text}")
        return response.json()["data"][0]["embedding"]
    except requests.RequestException as e:
        raise Exception(f"Request failed: {str(e)}")

In [86]:
def create_jina_embedding_batch(input_texts: list)-> list:
    """
    Create embeddings for multiple texts in a single API call (more efficient).

    Args:
        input_texts: texts to embed with the indexing task.

    Returns:
        One embedding vector per input text; an empty list for empty input.

    Raises:
        Exception: on a non-200 API response, a failed HTTP request, or when
            the number of returned embeddings differs from the number of inputs.
    """
    # Nothing to embed: skip the network round-trip entirely
    if not input_texts:
        return []

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {jina_api_key}",
    }
    data = {
        "input": input_texts,
        "model": jina_embedding_model,
        "dimensions": embedding_dimension,
        "task": indexing_task,
        "late_chunking": True,
    }
    try:
        res = requests.post(url=jina_url, headers=headers, json=data, timeout=60)
        if res.status_code != 200:
            raise Exception(f"Jina API error: {res.status_code} - {res.text}")
        # Extract all embeddings from the response
        embeddings = [d["embedding"] for d in res.json()["data"]]
    except requests.RequestException as e:
        # Chain explicitly so the original network error stays attached as the cause
        raise Exception(f"Request failed: {str(e)}") from e

    # Callers pair embeddings with their inputs via zip(), which would silently
    # drop items on a short response; fail loudly instead.
    if len(embeddings) != len(input_texts):
        raise Exception(
            f"Jina API returned {len(embeddings)} embeddings for {len(input_texts)} inputs"
        )
    return embeddings

In [118]:
def count_token_approximate(text: str)-> int:
    """
    Estimate the token count of `text` using the rule of thumb that one token
    is about 4 characters of English text (rounded down).
    """
    characters_per_token = 4
    whole_tokens, _leftover = divmod(len(text), characters_per_token)
    return whole_tokens

In [125]:
def truncate_text_smart(text: str, max_tokens: int = 8000)-> str:
    """
    Truncate `text` to roughly `max_tokens` tokens, preferring natural boundaries.

    Tokens are estimated with the same 4-characters-per-token heuristic used by
    count_token_approximate. Text already within the budget is returned unchanged.
    Otherwise three strategies are tried in order:

    1. keep as many whole '.'-terminated sentences as fit;
    2. if not even one sentence fits, keep as many whole words as fit
       (joined by single spaces);
    3. if not even one word fits, hard-cut at the character limit.

    Returns the truncated text with surrounding whitespace stripped.
    """
    chars_per_token = 4  # same estimate as count_token_approximate
    if len(text) // chars_per_token <= max_tokens:
        return text

    # Clamp so a non-positive budget yields "" instead of a negative slice index
    max_chars = max(max_tokens, 0) * chars_per_token

    # strategy 1: cut at sentence boundaries, tracking a running length so the
    # candidate string is not rebuilt on every iteration
    kept_sentences = []
    kept_length = 0
    for sentence in text.split('.'):
        piece_length = len(sentence) + 1  # +1 for the re-appended "."
        if kept_length + piece_length > max_chars:
            break
        kept_sentences.append(sentence)
        kept_length += piece_length
    truncated = "".join(sentence + "." for sentence in kept_sentences)

    # strategy 2: if no complete sentence fits, cut at word boundaries
    if not truncated.strip():
        kept_words = []
        kept_length = 0
        for word in text.split():
            added_length = len(word) + (1 if kept_words else 0)  # separator space
            if kept_length + added_length > max_chars:
                break
            kept_words.append(word)
            kept_length += added_length
        truncated = " ".join(kept_words)

    # strategy 3: hard cut if even a single word is too long
    if not truncated.strip():
        truncated = text[:max_chars]

    return truncated.strip()

In [129]:
def create_character_text_safe(character: dict, max_tokens: int = 7000)-> str:
    """
    Build a " | "-separated description of a character that stays within an
    approximate token budget.

    Labelled basic fields come first. The biography (or, when that is blank,
    the history) is appended after being truncated to the budget left over
    once the basic fields and a 100-token margin are subtracted. Records with
    neither get a generic one-line description instead.
    """
    # "name".title() == "Name", so the name is handled like every other field
    labelled_fields = ['name', 'race', 'gender', 'realm', 'culture', 'birth',
                       'death', 'spouse', 'hair', 'height']
    text_parts = [
        f"{field.title()}: {character[field]}"
        for field in labelled_fields
        if character.get(field)
    ]

    # token budget left for the long-form content
    used_tokens = count_token_approximate(" | ".join(text_parts))
    remaining_tokens = max_tokens - used_tokens - 100

    # biography wins over history; blank strings count as missing
    content = None
    content_label = ""
    for key, label in (('biography', "Biography"), ('history', "History")):
        raw_value = character.get(key)
        if raw_value and raw_value.strip():
            content, content_label = raw_value.strip(), label
            break

    if not content:
        # no long-form text at all: fall back to a generic description
        if character.get('race'):
            generic_desc = f"{character['race']} from Middle-earth"
        else:
            generic_desc = "Character from Middle-earth"
        if character.get('realm'):
            generic_desc += f" of {character['realm']}"
        text_parts.append(generic_desc)
    elif remaining_tokens > 100:
        # only worth adding when a meaningful amount of budget is left
        content = truncate_text_smart(content, remaining_tokens)
        text_parts.append(f"{content_label}: {content}")

    final_text = " | ".join(text_parts)

    # last line of defence against an over-long result
    if count_token_approximate(final_text) > max_tokens:
        return truncate_text_smart(final_text, max_tokens)
    return final_text

In [87]:
def create_character_text_comprehensive(character: dict) -> str:
    """
    Build the full, untruncated text used to embed a character.

    The first section lists every non-empty basic field as "Label: value"
    joined by " | ". The second section is the stripped biography, or the
    stripped history when the biography is missing or blank. Sections are
    separated by a blank line; an empty record yields "".

    Args:
        character: record mapping field names to values (values may be None).
    """
    basic_fields = ('name', 'race', 'gender', 'realm', 'culture',
                    'birth', 'death', 'spouse', 'hair', 'height')
    sections = []

    # Basic info section
    basic_info = [
        f"{field.title()}: {character[field]}"
        for field in basic_fields
        if character.get(field)
    ]
    if basic_info:
        sections.append(" | ".join(basic_info))

    # Content section: first of biography/history that has non-blank text
    for key in ('biography', 'history'):
        raw_value = character.get(key)
        if raw_value and raw_value.strip():
            sections.append(raw_value.strip())
            break

    return "\n\n".join(sections)

In [133]:
def create_character_summary(character: dict, max_length: int = 500)-> str:
    """
    Create a concise summary for characters with very long descriptions.

    The summary is " - "-joined: the name (or "Unknown"), short race/realm/
    culture phrases when present, and the first three sentences of the
    biography, or of the history when the biography is missing or blank.

    Args:
        character: record mapping field names to values (values may be None).
        max_length: maximum length of the sentence excerpt; a longer excerpt
            is cut and ends with "...".
    """
    # `or` (not a .get default) so an explicit None name also becomes "Unknown"
    name = character.get('name') or 'Unknown'
    summary_parts = [name]

    # add key identifiers
    if character.get('race'):
        summary_parts.append(f"a {character['race']}")
    if character.get('realm'):
        summary_parts.append(f"from {character['realm']}")
    if character.get('culture'):
        summary_parts.append(f"of {character['culture']} culture")

    # Pick the first field with real text, so a blank biography does not hide
    # a usable history
    content = None
    for key in ('biography', 'history'):
        raw_value = character.get(key)
        if raw_value and raw_value.strip():
            content = raw_value.strip()
            break

    if content:
        # Strip each sentence before re-joining; otherwise the space that
        # followed each "." is doubled by the ". " separator
        sentences = [sentence.strip() for sentence in content.split('.')[:3]]
        first_sentences = '. '.join(sentence for sentence in sentences if sentence)
        if first_sentences and not first_sentences.endswith('.'):
            first_sentences += '.'
        # truncate if still too long (never slice with a negative bound)
        if len(first_sentences) > max_length:
            first_sentences = first_sentences[:max(max_length - 3, 0)] + "..."
        if first_sentences:
            summary_parts.append(first_sentences)

    return " - ".join(summary_parts)

In [135]:
def handle_long_text_error(character_texts: list, characters: list, max_retries: int = 2):
    """
    Handle cases where batch embedding fails due to long texts.

    Each text is embedded on its own. When the API error looks length-related,
    the text is shortened and retried: first with a 300-character summary,
    then with a minimal "name - race from realm" line. Any other error stops
    the retries for that character immediately.

    Args:
        character_texts: texts to embed, aligned with `characters`.
        characters: character records matching `character_texts`.
        max_retries: maximum number of shortened-text retries after the first
            attempt; only two fallbacks exist, so values above 2 act like 2.

    Returns:
        (embeddings, failed_indices): `embeddings` has one entry per input,
        with None for failures; `failed_indices` lists the failed positions.
    """
    print(f"Batch failed, trying individual processing for {len(character_texts)} items...")

    embeddings = []
    failed_indices = []
    allowed_retries = min(max_retries, 2)  # one retry per available fallback text

    for i, (text, character) in enumerate(zip(character_texts, characters)):
        name = character.get('name') or 'Unknown'
        current_text = text
        embedding = None
        retry_count = 0

        while True:
            try:
                embedding = create_jina_embedding(current_text)
                break
            except Exception as e:
                message = str(e)
                is_length_error = (
                    "Internal server error during encoding" in message
                    or "token" in message.lower()
                )
                if not is_length_error:
                    # Shortening the text will not help; give up on this character
                    # instead of retrying the same request forever
                    print(f"  Non-length error for {name}: {message}")
                    break
                if retry_count >= allowed_retries:
                    break

                retry_count += 1
                # try with progressively shorter text
                if retry_count == 1:
                    current_text = create_character_summary(character=character, max_length=300)
                    print(f"  Retry {retry_count} for {name}: using summary")
                else:
                    # just name and basic info; `or` also covers explicit None values
                    race = character.get('race') or 'Unknown race'
                    realm = character.get('realm') or 'Middle-earth'
                    current_text = f"{name} - {race} from {realm}"
                    print(f"  Retry {retry_count} for {name}: using minimal text")

        if embedding is None:
            print(f"  Failed to embed {name} after {retry_count} retries")
            failed_indices.append(i)
        # None keeps the result aligned with the inputs for failed items
        embeddings.append(embedding)

    return embeddings, failed_indices


In [136]:
def create_jina_embedding_batch_safe(input_texts: list, max_token_per_text: int = 6000) -> list:
    """
    Create embeddings for multiple texts with length safety checks.

    Every text is first truncated to `max_token_per_text` (approximate tokens),
    then all texts are embedded with the indexing task in one API call.

    Returns:
        One embedding vector per input text; an empty list for empty input.

    Raises:
        Exception: on a non-200 API response, a failed HTTP request, or when
            the number of returned embeddings differs from the number of inputs.
    """
    # Nothing to embed: skip the network round-trip entirely
    if not input_texts:
        return []

    # First, ensure all texts are within safe limits
    safe_texts = [
        truncate_text_smart(text=text, max_tokens=max_token_per_text)
        for text in input_texts
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {jina_api_key}",
    }
    data = {
        "input": safe_texts,
        "model": jina_embedding_model,
        "dimensions": embedding_dimension,
        "task": indexing_task,
        "late_chunking": True,
    }

    try:
        res = requests.post(url=jina_url, headers=headers, json=data, timeout=120)
        if res.status_code != 200:
            raise Exception(f"Jina API error: {res.status_code} - {res.text}")
        embeddings = [d["embedding"] for d in res.json()["data"]]
    except requests.RequestException as e:
        # Chain explicitly so the original network error stays attached as the cause
        raise Exception(f"Request failed: {str(e)}") from e

    # Callers pair embeddings with their inputs via zip(), which would silently
    # drop items on a short response; fail loudly instead.
    if len(embeddings) != len(safe_texts):
        raise Exception(
            f"Jina API returned {len(embeddings)} embeddings for {len(safe_texts)} inputs"
        )
    return embeddings


In [66]:
# Preview the embedding text generated for the first loaded character
create_character_text_comprehensive(character=characters[0])

'Name: Adanel | Race: Men | Gender: Female | Spouse: Belemir\n\nAdanel married Belemir of the House of Bëor; he was a great-grandson of Bëor the Old. Adanel was the mother of five children, and her fifth and last child was a son: Beren.[1] His daughter, Emeldir "the Man-hearted", named her son after her father; this child was the renowned Beren Erchamion (Beren the One-handed). Thus, Adanel was the great-grandmother of Beren, son of Emeldir, her granddaughter.\nThe Wise of the people of Marach were the only Men to preserve the tale of their original sin, when, soon after their awakening, the Men chose to worship Melkor instead of Eru. Adanel told this tale (called the Tale of Adanel in Morgoth\'s Ring) to Andreth of the House of Bëor.\nAndreth was a very distant niece of Adanel\'s husband Belemir through the line of Baran, the eldest son of Bëor, and Belemir through the line of his grandfather Belen, who was the youngest son of Bëor. In addition, Andreth\'s nephew Barahir married Adane

In [88]:
def analyze_character_data(characters):
    """
    Print how many character records have a biography, a history, or neither.

    A field counts as present only when it holds non-blank text. An empty
    `characters` list reports 0.0% for every line instead of dividing by zero.
    """
    def _has_text(record, key):
        value = record.get(key)
        return bool(value and value.strip())

    total = len(characters)
    has_biography = 0
    has_history = 0
    has_both_null = 0
    # Single pass over the data instead of one pass per statistic
    for char in characters:
        with_biography = _has_text(char, 'biography')
        with_history = _has_text(char, 'history')
        has_biography += with_biography
        has_history += with_history
        if not with_biography and not with_history:
            has_both_null += 1

    def _percent(count):
        return count / total * 100 if total else 0.0

    print("Data Analysis:")
    print(f"Total characters: {total}")
    print(f"Characters with biography: {has_biography} ({_percent(has_biography):.1f}%)")
    print(f"Characters with history: {has_history} ({_percent(has_history):.1f}%)")
    print(f"Characters with neither: {has_both_null} ({_percent(has_both_null):.1f}%)")

In [89]:
# Report how many loaded records have a biography, a history, or neither
analyze_character_data(characters=characters)

Data Analysis:
Total characters: 749
Characters with biography: 682 (91.1%)
Characters with history: 19 (2.5%)
Characters with neither: 48 (6.4%)


In [90]:
def reinitiate_collection():
    """
    Recreate the Qdrant collection from scratch.

    Deletes the collection named by `collection_name` when it already exists,
    then creates it with `embedding_dimension`-sized vectors compared by
    cosine distance. All previously stored points are lost.
    """
    if qd_client.collection_exists(collection_name=collection_name):
        qd_client.delete_collection(collection_name=collection_name)
        print(f"Deleted existing collection: {collection_name}")
    else:
        # Only report "didn't exist" when that is actually the case
        print(f"Collection {collection_name} didn't exist, creating new one")

    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embedding_dimension, # Dimensionality of the vectors
            distance=models.Distance.COSINE # Distance metric for similarity search
        )
    )
    print("Created the new collection")

In [91]:
def upsert_to_qdrant():
    """
    Embed every character one at a time with the Jina API and upsert the
    resulting points into the Qdrant collection in a single call.

    Each point stores the full character record plus the text that was
    embedded ("embedded_text"). Characters whose embedding fails are reported
    and skipped. Does nothing when the collection does not exist.
    """
    if not qd_client.collection_exists(collection_name=collection_name):
        print(f"Collection {collection_name} does not exist.")
        return

    points = []
    print("Creating embeddings and preparing points...")
    # Process one by one (simpler but slower)
    for i, character in enumerate(characters):
        try:
            # Create comprehensive text for embedding
            character_text = create_character_text_comprehensive(character)
            # Get embedding from Jina API
            embedding = create_jina_embedding(character_text)
            point = models.PointStruct(
                # `.hex` is a string attribute, not a method; calling it raised
                # TypeError and made every character fail after its API call
                id=uuid.uuid4().hex,
                vector=embedding,
                payload={
                    **character,
                    "embedded_text": character_text
                }
            )
            points.append(point)

            # Add small delay to avoid rate limiting
            if i > 0 and i % 10 == 0:
                time.sleep(0.1)
                print(f"Processed {i+1}/{len(characters)} characters...")
        except Exception as e:
            # .get() so a record without a name cannot raise inside the handler
            print(f"Error creating embedding for character '{character.get('name', 'Unknown')}': {str(e)}")
            continue

    if not points:
        print("No valid points to upsert..")
        return

    print("Initialize upserting process...")
    try:
        qd_client.upsert(
            collection_name=collection_name,
            points=points
        )
    except Exception as e:
        print(f"Error during upsert: {str(e)}")

In [73]:
def upsert_to_qdrant_batch():
    """
    More efficient version using batch embedding creation.

    Builds the comprehensive text for every character, embeds all texts in one
    API call and upserts all points in one Qdrant call. Any failure is printed
    and nothing is upserted. Does nothing when the collection does not exist.
    """
    if not qd_client.collection_exists(collection_name=collection_name):
        print(f'Collection {collection_name} does not exist.')
        return

    print("Creating batch embeddings...")
    try:
        # Create comprehensive text for each character
        character_texts = []
        valid_characters = []

        for character in characters:
            try:
                character_text = create_character_text_comprehensive(character=character)
                character_texts.append(character_text)
                valid_characters.append(character)
            except Exception as e:
                print(f"Error creating text for character '{character.get('name', 'Unknown')}': {str(e)}")

        if not character_texts:
            print("No valid character texts created.")
            return

        print(f"Creating batch embeddings for {len(character_texts)} characters...")

        # Create all embeddings in one API call
        embeddings = create_jina_embedding_batch(character_texts)

        # zip() below would silently drop characters on a short response
        if len(embeddings) != len(character_texts):
            raise Exception(
                f"Expected {len(character_texts)} embeddings, got {len(embeddings)}"
            )

        # Create points with embeddings
        points = [
            models.PointStruct(
                id=uuid.uuid4().hex,
                vector=embedding,
                payload={
                    **character,
                    'embedded_text': character_text
                }
            )
            for character, character_text, embedding in zip(valid_characters, character_texts, embeddings)
        ]

        print("Starting upsert process...")
        qd_client.upsert(
            collection_name=collection_name,
            points=points
        )
        print(f"Successfully upserted {len(points)} entries.")
    except Exception as e:
        print(f"Error during batch processing: {str(e)}")

In [105]:
def upsert_to_qdrant_chunked(chunk_size: int = 100):
    """
    Process large datasets in chunks to avoid API limits and memory issues.

    Each chunk of characters is embedded with one batch API call and upserted
    with one Qdrant call. A failing chunk is reported and skipped.

    Args:
        chunk_size: number of characters per chunk; must be positive.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    # Validate first: a negative size would otherwise silently process nothing
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    if not qd_client.collection_exists(collection_name=collection_name):
        print(f'Collection {collection_name} does not exist.')
        return

    total_characters = len(characters)
    total_chunks = (total_characters + chunk_size - 1) // chunk_size
    total_processed = 0

    print(f"Processing {total_characters} characters in chunk of {chunk_size}...")

    for i in range(0, total_characters, chunk_size):
        chunk = characters[i:i + chunk_size]
        chunk_num = i // chunk_size + 1
        character_texts = [create_character_text_comprehensive(character) for character in chunk]

        try:
            print(f"Processing chunk {chunk_num}/{total_chunks}...")

            # Create embeddings for this chunk
            embeddings = create_jina_embedding_batch(character_texts)

            # Create points
            points = [
                models.PointStruct(
                    id=uuid.uuid4().hex,
                    vector=embedding,
                    payload={
                        **character,
                        "embedded_text": character_text
                    }
                )
                for character, character_text, embedding in zip(chunk, character_texts, embeddings)
            ]

            # Upsert this chunk
            qd_client.upsert(
                collection_name=collection_name,
                points=points
            )

            total_processed += len(points)
            print(f"Processed {total_processed}/{total_characters} characters...")

            # Small delay between chunks to be nice to the API; there is
            # nothing to wait for after the final chunk
            if chunk_num < total_chunks:
                time.sleep(5)
        except Exception as e:
            print(f"Error processing chunk starting at index {i}: {str(e)}")
            continue
    print(f"Finished processing. Total upserted: {total_processed} entries...")

In [142]:
def _embed_with_fallback(text: str, character: dict):
    """
    Embed one character on its own, shortening the text after each failure.

    Attempt order: the prepared text, a 400-character summary, then a minimal
    "name - race from realm" line. Returns (embedding, embedded_text) for the
    first attempt that succeeds, or (None, text) when all three fail.
    """
    last_error = None
    for attempt in range(3):
        try:
            if attempt == 0:
                # first attempt: original safe text
                current_text = text
            elif attempt == 1:
                # second attempt: use summary
                current_text = create_character_summary(character=character, max_length=400)
                print(f"Retry {attempt+1} for {character.get('name', 'Unknown')}: using summary")
            else:
                # final attempt: minimal text
                current_text = f"{character.get('name', 'Unknown')} - {character.get('race', 'Unknown')} from {character.get('realm', 'Middle-earth')}"
            return create_jina_embedding(input_text=current_text), current_text
        except Exception as individual_error:
            last_error = individual_error
    print(f"all attempts failed for {character.get('name', 'Unknown')}: {str(last_error)}")
    return None, text


def upsert_to_qdrant_chunked_safe(chunk_size:int = 10, max_tokes_per_text:int = 6000):
    """
    Process large datasets in chunks with text length safety and error recovery.

    Texts are prepared with create_character_text_safe, sorted by approximate
    token count (shortest first) and embedded chunk by chunk. When a batch
    request fails, each character of that chunk is embedded individually with
    progressively shorter text. The stored "embedded_text" and "token_count"
    describe the text that was actually embedded, including fallback texts.

    Args:
        chunk_size: number of characters per chunk; must be positive.
        max_tokes_per_text: approximate token cap applied to every text.

    Raises:
        ValueError: if `chunk_size` is not positive.
    """
    # Validate first: 0 would otherwise fail later with ZeroDivisionError
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    if not qd_client.collection_exists(collection_name=collection_name):
        print(f'Collection {collection_name} does not exist.')
        return

    # First, prepare all character texts with safety limits
    print("Preparing safe character texts...")
    prepared_data = []

    for character in characters:
        try:
            character_text = create_character_text_safe(character=character, max_tokens=max_tokes_per_text)
            prepared_data.append({
                "character": character,
                "text": character_text,
                "token_count": count_token_approximate(character_text)
            })
        except Exception as e:
            print(f"Error preparing text for {character.get('name', 'Unknown')}: {str(e)}")
            continue

    if not prepared_data:
        print("No valid character data to process")
        return

    print(f"Prepared {len(prepared_data)} characters for processing")

    # sort by token count to process shorter texts first (more likely to succeed)
    prepared_data.sort(key=lambda x: x['token_count'])

    total_characters = len(prepared_data)
    total_processed = 0

    total_chunks = (total_characters + chunk_size - 1) // chunk_size
    print(f"Processing {total_characters} characters in {total_chunks} chunks of {chunk_size}...")

    for chunk_idx in range(0, total_characters, chunk_size):
        chunk_data = prepared_data[chunk_idx:chunk_idx + chunk_size]
        chunk_num = chunk_idx // chunk_size + 1

        print(f"\nProcessing chunk {chunk_num}/{total_chunks} ({len(chunk_data)} characters)...")
        # Extract texts and characters for this chunk
        chunk_texts = [item["text"] for item in chunk_data]
        chunk_characters = [item["character"] for item in chunk_data]
        chunk_token_counts = [item["token_count"] for item in chunk_data]

        print(f" Token counts in chunk: {chunk_token_counts}")
        print(f" Max tokens in chunk: {max(chunk_token_counts)}")

        # One (embedding, text actually embedded) pair per character in the chunk
        try:
            batch_embeddings = create_jina_embedding_batch_safe(chunk_texts, max_token_per_text=max_tokes_per_text)
            # A short response would misalign the zip below; treat it as a batch failure
            if len(batch_embeddings) != len(chunk_texts):
                raise Exception(
                    f"expected {len(chunk_texts)} embeddings, got {len(batch_embeddings)}"
                )
            embedded = list(zip(batch_embeddings, chunk_texts))
            print(f"Batch embedding successful for chunk {chunk_num}")
        except Exception as batch_error:
            print(f"Batch embedding failed for chunk {chunk_num}: {str(batch_error)}")

            # fall back to individual processing with progressive text shortening
            print("Falling back to individual processing...")
            embedded = [
                _embed_with_fallback(text, character)
                for text, character in zip(chunk_texts, chunk_characters)
            ]

        # create points for successful embeddings
        chunk_points = [
            models.PointStruct(
                id=uuid.uuid4().hex,
                vector=embedding,
                payload={
                    **character,
                    'embedded_text': used_text,
                    'token_count': count_token_approximate(used_text)
                }
            )
            for character, (embedding, used_text) in zip(chunk_characters, embedded)
            if embedding is not None
        ]

        # upsert this chunk if we have valid points
        if chunk_points:
            try:
                qd_client.upsert(
                    collection_name=collection_name,
                    points=chunk_points
                )
                total_processed += len(chunk_points)
                print(f"upserted {len(chunk_points)} points from chunk {chunk_num}...")
            except Exception as upsert_error:
                print(f"upsert failed for chunk {chunk_num}: {str(upsert_error)}")
        else:
            print(f"no valid points to upsert for chunk {chunk_num}")

        print(f"progress: {total_processed}/{total_characters} characters processed")

        # delay between chunks to be respectful to the API
        if chunk_num < total_chunks:
            print("waiting 2 seconds before next chunk...")
            time.sleep(2)

    print(f"finished processing. total upserted {total_processed}/{total_characters} entries.")

    if total_processed < total_characters:
        failed_count = total_characters - total_processed
        print(f"{failed_count} entries failed to process...")

In [143]:
# NOTE: destructive — deletes the existing collection (if any) before creating a fresh, empty one
reinitiate_collection()

Deleted existing collection: lotr-characters
Collection lotr-characters didn't exist, creating new one
Created the new collection


In [144]:
# Embed and upsert every character with the defaults (chunks of 10, 6000-token cap per text)
upsert_to_qdrant_chunked_safe()

Preparing safe character texts...
Prepared 749 characters for processing
Processing 749 characters in 75 chunks of 10...

Processing chunk 1/75 (10 characters)...
 Token counts in chunk: [10, 11, 11, 12, 12, 12, 12, 12, 12, 12]
 Max tokens in chunk: 12
Batch embedding successful for chunk 1
upserted 10 points from chunk 1...
progress: 10/749 characters processed
waiting 2 seconds before next chunk...

Processing chunk 2/75 (10 characters)...
 Token counts in chunk: [12, 12, 12, 12, 12, 13, 13, 13, 14, 14]
 Max tokens in chunk: 14
Batch embedding successful for chunk 2
upserted 10 points from chunk 2...
progress: 20/749 characters processed
waiting 2 seconds before next chunk...

Processing chunk 3/75 (10 characters)...
 Token counts in chunk: [15, 15, 19, 20, 20, 20, 22, 22, 24, 26]
 Max tokens in chunk: 26
Batch embedding successful for chunk 3
upserted 10 points from chunk 3...
progress: 30/749 characters processed
waiting 2 seconds before next chunk...

Processing chunk 4/75 (10 cha

In [151]:
def search(query: str, limit: int = 1):
    """
    Semantic search over the character collection.

    The query is embedded with the querying task through the Jina API, Qdrant
    is asked for the `limit` closest points, and their payloads are returned
    as a list. On any error the message is printed and None is returned.
    """
    try:
        # Embed the search query with the Jina API
        vector = create_jina_embedding(input_text=query, task=querying_task)

        response = qd_client.query_points(
            collection_name=collection_name,
            query=vector,
            limit=limit,
            with_payload=True
        )

        payloads = []
        for hit in response.points:
            payloads.append(hit.payload)
    except Exception as e:
        print(f"Error during search: {str(e)}")
        return None
    return payloads

In [None]:
def search_with_score_threshold(query: str, limit: int = 5, score_threshold: float = 0.7):
    """
    Semantic search that passes a score threshold to Qdrant.

    The query is embedded with the querying task and sent to Qdrant together
    with `limit` and `score_threshold`. Returns the object produced by
    `query_points` as-is (not a list of payloads). On any error the message is
    printed and None is returned.
    """
    try:
        vector = create_jina_embedding(input_text=query, task=querying_task)
        request_args = {
            "collection_name": collection_name,
            "query": vector,
            "limit": limit,
            "with_payload": True,
            "score_threshold": score_threshold,
        }
        return qd_client.query_points(**request_args)
    except Exception as e:
        print(f"Error during search with threshold: {str(e)}")
        return None

In [152]:
# Example query: fetch the payloads of the three closest characters
query_results = search(query='who destroyed the one ring?', limit=3)
query_results  # last expression: displayed as the cell output (None if the search failed)

[{'birth': 'YT',
  'death': 'SA 1697 ,War of the Elves and Sauron',
  'gender': 'Male',
  'hair': 'Dark',
  'height': None,
  'name': 'Celebrimbor',
  'race': 'Elves',
  'realm': 'Eregion',
  'spouse': None,
  'biography': 'First Age\nCelebrimbor as a child with his parents in Valinor, by Marya Filatova\nCelebrimbor was the son of Curufin, who was the fifth son of Fëanor (son of Finwë and his first wife Míriel) and Nerdanel.[3] In the First Age, he fought in the battles of Kinslaying at Alqualondë, Dagor-nuin-Giliath and Dagor Aglareb. After fighting in the Siege of Angband, Dagor Bragollach, Battle of the Pass of Aglond and Fall of Tol Sirion, he lived with his father and uncle in Nargothrond, staying even after Curufin\'s exile. After fighting in Nirnaeth Arnoediad, he fought in Sack of Nargothrond, and then escaped and traveled to Gondolin. He fought and survived the Fall of Gondolin and Third Kinslaying. Celebrimbor finally fought in the War of Wrath.  Unlike some of his kin, he re

In [103]:
# Smoke test: embed the first ten characters with a single batch request
chunk = characters[:10]
character_text = list(map(create_character_text_comprehensive, chunk))
embeddings = create_jina_embedding_batch(character_text)

In [104]:
# Number of embeddings returned by the batch call in the previous cell
len(embeddings)

10

In [134]:
# Scratch cell: build the "safe" text and the short summary for the first record of a slice
chunk = characters[40:46]
# truncated_txt = truncate_text_smart(text=chunk[0]['biography'])
# len(truncated_txt)
# truncated_txt
character_text = create_character_text_safe(chunk[0])  # rebinds the notebook-level `character_text` to a str
character_summary = create_character_summary(chunk[0])
character_summary  # last expression: displayed as the cell output

"Aragorn II Elessar - a Men - from Reunited Kingdom,Arnor,Gondor - Early years\nAragorn as a toddler, portrayed by Luke Johnston in the fan film Born of Hope\nHe was Aragorn son of Arathorn, the nine and thirtieth heir in the right line from Isildur, and yet more like Elendil than any before him. —The Silmarillion, Of the Rings of Power and the Third Age\nAragorn was a descendant of Elros Tar-Minyatur through the line of the Lords of Andúnië to Elendil, High King of Arnor and Gondor.  Like all of the kings before him, he was Elrond's kin through the House of El..."

In [141]:
# Scratch check: is 900 one of the offsets produced by stepping through `characters` in fives?
print('pio' if 900 in range(0, len(characters), 5) else 'no pio')

no pio
