In [15]:
import requests
import json
import os

# Your Hugging Face API token
API_TOKEN = os.getenv("HF_API_KEY")

MODEL_NAME = "Qwen/Qwen2.5-Coder-32B-Instruct" #"google/gemma-2-2b-it"

# The API endpoint for chat completion
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_NAME}"

# Headers with your API token
headers = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json"
}

# Function to query the model
def query_model(prompt):
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_length": 50,  # Adjust max length as needed
            "temperature": 0.7,  # Adjust temperature for creativity
            "top_p": 0.9  # Adjust top-p for nucleus sampling
        }
    }

    response = requests.post(API_URL, headers=headers, json=payload)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

# Example usage
if __name__ == "__main__":
    DOCUMENT_PATH = "D:/Github/Speech-to-Text-Summarization/transcription.txt"
    with open(DOCUMENT_PATH, "r", encoding="utf-8") as f:
        document_text = f.read()
    # Your input prompt
    user_prompt = "Summarise following text: " + document_text

    # Query the model
    result = query_model(user_prompt)

    if result:
        # Print the model's response
        print("Model Response:")
        print(result[0]['generated_text'])

Error: 422
{"error":"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 32768. Given: 127967 `inputs` tokens and 0 `max_new_tokens`","error_type":"validation"}


## 2nd approach

In [19]:
import requests
import json
import os
from transformers import AutoTokenizer

# Your Hugging Face API token
API_TOKEN = os.getenv("HF_API_KEY")

MODEL_NAME = "Qwen/Qwen2.5-Coder-32B-Instruct"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_NAME}"

headers = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json"
}

def query_model(prompt, max_new_tokens=500):
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,  # Controls response length
            "temperature": 0.7,
            "top_p": 0.9
        }
    }

    response = requests.post(API_URL, headers=headers, json=payload)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

def split_document(document_text, tokenizer, max_tokens):
    # Tokenize the document without special tokens
    tokens = tokenizer.encode(document_text, add_special_tokens=False)
    
    chunks = []
    current_chunk = []
    
    for token in tokens:
        current_chunk.append(token)
        if len(current_chunk) >= max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
    
    if current_chunk:
        chunks.append(current_chunk)
    
    # Decode token chunks back to text
    return [tokenizer.decode(chunk) for chunk in chunks]

if __name__ == "__main__":
    DOCUMENT_PATH = "D:/Github/Speech-to-Text-Summarization/transcription.txt"
    with open(DOCUMENT_PATH, "r", encoding="utf-8") as f:
        document_text = f.read()

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # Calculate token usage for static prompt part
    static_prompt = "Summarise following text: "
    static_prompt_tokens = tokenizer.encode(static_prompt, add_special_tokens=False)
    
    # Set safety margin to account for model's internal formatting
    max_total_tokens = 32768
    max_response_tokens = 500
    available_tokens = max_total_tokens - len(static_prompt_tokens) - max_response_tokens
    
    # Split document into chunks that fit with the prompt
    document_chunks = split_document(document_text, tokenizer, available_tokens)
    
    summaries = []
    for i, chunk in enumerate(document_chunks):
        print(f"Processing chunk {i+1}/{len(document_chunks)}")
        
        # Create full prompt with your exact structure
        full_prompt = static_prompt + chunk
        
        # Verify token count
        prompt_tokens = tokenizer.encode(full_prompt, add_special_tokens=False)
        print(f"Token count: {len(prompt_tokens)} (max allowed: {max_total_tokens - max_response_tokens})")
        
        # Get summary for this chunk
        result = query_model(full_prompt, max_new_tokens=max_response_tokens)
        
        if result and isinstance(result, list) and 'generated_text' in result[0]:
            summaries.append(result[0]['generated_text'])
        else:
            summaries.append(f"[Summary failed for chunk {i+1}]")

    # Combine all summaries
    final_summary = "\n\n".join(summaries)
    
    # Save and display results
    with open("summary.txt", "w", encoding="utf-8") as f:
        f.write(final_summary)
    
    print("\nFinal Summary:")
    print(final_summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (127961 > 32768). Running this sequence through the model will result in indexing errors


Processing chunk 1/4
Token count: 32267 (max allowed: 32268)
Processing chunk 2/4
Token count: 32268 (max allowed: 32268)
Processing chunk 3/4
Token count: 32268 (max allowed: 32268)
Processing chunk 4/4
Token count: 31184 (max allowed: 32268)

Final Summary:
Summarise following text: Bagi banyak kasih ya malah. Jadi nomor satu masih opsi anda tetangga pak. Sorry aku boleh izin rekam ya soalnya buat buat nonton takutnya kita ada lupa aja sih sebenarnya. Saya tunggu satu masih lsy. Nomor duanya setiap tiba tiba. Kemarin sih asia timur banyak Kazakhstan sekali promonya india. Oh iya oh iya kemarin india jadi india nih lagi ramai banget di Bali. Iya sih kamu kemarin nilai biasa kita memang sama nih om ayamnya juga kan idealnya ini dia ayamnya juga kita ayamnya india ini india lagi. Amin banget datang. Kerja juga enggak nulisnya. Kita lihat data tourism nih. 3 baru china cimbo. Tempatnya. Hores selatan sama juki di situ. Jadi memang kalau bulenya ini lagi agak ini bu ya. Lagi agak menurun 