<a href="https://colab.research.google.com/github/bnehirartan/Google-Gemini-API-Rate-Limit-Context-Window-Limit-Management/blob/main/gemini-api-ratelimit-contextsize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Required Libraries

In [None]:
  !pip install -q -U google-generativeai

In [None]:
import os
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
import time

In [None]:
def to_markdown(text):
    """
    Renders plain text from the LLM as a Markdown blockquote.

    Parameters:
    text (str): The plain text to convert.

    Returns:
    Markdown: The text with every '•' rewritten as '  *' and every line
    (blank lines included) prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')  # turn bullet glyphs into Markdown list markers
    # the always-true predicate makes textwrap.indent prefix blank lines as well
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Use API Key and Generative AI Models

In [None]:
from google.colab import userdata
# Read the key stored in the Colab user data entry named 'GEMINIAPI' (so it is never
# hard-coded in the notebook) and hand it to the SDK for all later calls.
GOOGLE_API_KEY=userdata.get('GEMINIAPI') # using the API key
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
 model=genai.GenerativeModel('gemini-1.5-pro-latest') # shared Gemini-1.5-Pro model handle; the token-counting and generation helpers below all call methods on this object

# Token Counting Methods

In [None]:
def count_tokens(text):
    """
    Reports how many tokens the configured Gemini model assigns to a text.

    Parameters:
    text (str): The text to measure.

    Returns:
    int: The token total the model reports for the text.
    """
    # delegate to the model's own token counter and unwrap the total
    return model.count_tokens(text).total_tokens

In [None]:
def get_token_usage(prompt, response_text):
    """
    Measures the token cost of one prompt/response exchange.

    Parameters:
    prompt (str): The text that was sent to the model.
    response_text (str): The text the model produced.

    Returns:
    tuple: (input tokens, output tokens, their sum).
    """
    # count the prompt first, then the response (each is a separate count_tokens call)
    prompt_count, reply_count = (count_tokens(part) for part in (prompt, response_text))
    return prompt_count, reply_count, prompt_count + reply_count

# Handling API Rate Limit Method

In [None]:
# using global constants instead of passing them as arguments
# Module-level limits read directly by generate_text() and chat_mode()
# (kept as globals instead of being passed as arguments).
MAX_TOKENS = 500  # upper bound used for max_output_tokens on each generation request
CONTEXT_WINDOW = 1000  # token budget the prompt / chat context is checked against
WARNING_THRESHOLD = 0.8  # fraction of CONTEXT_WINDOW beyond which a "close to the limit" warning is printed
# Not referenced by the code in this notebook; presumably the requests-per-minute and
# tokens-per-minute quotas of the API plan — TODO confirm before enabling.
#RPM = 2
#TPM = 32000

In [None]:
def api_request_with_retry(request_func, *args, **kwargs):
    """
    Makes an API request, retrying with exponential backoff on rate-limit errors.

    The request is attempted once and, while it keeps failing with a rate-limit
    error (an exception whose message contains "429"), retried up to 3 more
    times after waiting 2, 4 and 8 seconds. Any other exception aborts
    immediately without retrying.

    Parameters:
    request_func (function): The API request function to be executed.
    *args: Positional arguments to pass to the request function.
    **kwargs: Keyword arguments to pass to the request function.

    Returns:
    Any: The response from the API request if successful, otherwise None
    (the failure reason is printed).
    """
    max_retries = 3
    api_error_shown = False  # the raw API error is printed only once per call

    # one initial attempt + max_retries retries; every wait is followed by a real attempt
    for attempt in range(max_retries + 1):
        try:
            return request_func(*args, **kwargs)
        except Exception as e:
            # rate-limit errors are recognised by the HTTP status code in the message
            if "429" not in str(e):
                # not a rate-limit problem: retrying will not help, so report the cause and stop
                print(f"❌ Error: API request failed: {e}")
                return None

            if not api_error_shown:
                print(f"⚠ API Error Message: {e}")
                api_error_shown = True

            if attempt == max_retries:
                break  # retry budget used up: do not sleep when nothing will be retried

            retry_number = attempt + 1
            wait_time = 2 ** retry_number  # exponential backoff: 2, 4, 8 seconds
            print(f"⚠️ Rate limit exceeded! Waiting for {wait_time} seconds... ({retry_number}/{max_retries})")
            time.sleep(wait_time)

    # every attempt hit the rate limit
    print("❌ Maximum number of retries reached. API request failed.")
    return None


# Handling Context Window - Text Generation Method

In [None]:
def generate_text(prompt):
    """
    Generates text for a prompt while respecting the token limits and trying to
    finish a reply that appears to have been cut off mid-sentence.

    The prompt is rejected if it already fills CONTEXT_WINDOW. Otherwise the
    output is capped at the smaller of MAX_TOKENS and the tokens left in the
    window. If the reply does not end in '.', '!' or '?', one follow-up request
    asks the model to continue from the last 50 characters and the result is
    appended.

    Parameters:
    prompt (str): The input text that serves as the basis for text generation.

    Returns:
    str or None: The generated text if successful; None if the prompt is too
    large, the API request failed, the reply was empty, or an error occurred.
    """
    token_count = count_tokens(prompt)

    if token_count >= CONTEXT_WINDOW:  # no room left for any output
        print("⚠️ Warning: Prompt exceeds context window limit!")
        return None

    # tokens still available before the context window limit is hit
    remaining_tokens = CONTEXT_WINDOW - token_count
    max_output = min(MAX_TOKENS, remaining_tokens)

    if token_count + MAX_TOKENS > CONTEXT_WINDOW * WARNING_THRESHOLD:
        print("⚠️ Warning: Token usage is close to the limit! Consider shortening input.")

    try:
        response = api_request_with_retry(
            model.generate_content,
            prompt,
            generation_config={"max_output_tokens": max_output},
        )
        if not response:
            return None  # the retry helper has already reported the failure

        text = response.text.strip()
        if not text:
            # nothing to inspect or continue; report it instead of failing on an index lookup
            print("⚠️ Warning: Model returned an empty response.")
            return None

        # STOP CHECK: a reply without sentence-ending punctuation is treated as truncated
        if not text.endswith((".", "!", "?")):
            continuation = api_request_with_retry(
                model.generate_content,
                "Continue from: " + text[-50:],  # last 50 characters give the model the cut-off point
                generation_config={"max_output_tokens": max_output},
            )
            if continuation:
                text += " " + continuation.text.strip()

        return text
    except Exception as e:
        # e.g. accessing response.text can fail; report and fall through to None
        print(f"❌ Error generating text: {e}")
    return None  # indicate failure

# Handling Context Window - Chat Mode Method

In [None]:

def chat_mode():
    """
    Runs an interactive chat session: reads user input, sends the accumulated
    chat history to the model, prints the reply with its token usage, and keeps
    both sides of the exchange in the history.

    Typing "exit" or "quit" (any casing) ends the session. When the joined
    history reaches CONTEXT_WINDOW tokens, the oldest entries are dropped until
    it fits and the newest user message is then answered; if that message alone
    is too large it is discarded and the user is asked again. An empty model
    reply is reported and not stored.

    Returns:
    None
    """
    chat_history = []

    while True:
        user_input = input("You: ")
        if user_input.lower() in ["exit", "quit"]:
            print("Chat ended.")
            break

        # add the user message and rebuild the context as a single string
        chat_history.append(user_input)
        context = "\n".join(chat_history)
        input_tokens = count_tokens(context)

        # context window exceeded: trim the oldest entries, but keep the message just typed
        if input_tokens >= CONTEXT_WINDOW:
            print("⚠️ Warning: Context window limit exceeded! Consider clearing history.")
            print("🔄 Clearing the history...")
            while input_tokens >= CONTEXT_WINDOW and len(chat_history) > 1:
                chat_history.pop(0)
                context = "\n".join(chat_history)
                input_tokens = count_tokens(context)

            if input_tokens >= CONTEXT_WINDOW:
                # the newest message alone does not fit; drop it so it cannot block later turns
                chat_history.pop()
                print("⚠️ Warning: Message is too long for the context window! Please shorten it.")
                continue

        # tokens still available for the reply
        remaining_tokens = CONTEXT_WINDOW - input_tokens
        max_output = min(MAX_TOKENS, remaining_tokens)

        if input_tokens + MAX_TOKENS > CONTEXT_WINDOW * WARNING_THRESHOLD:
            print("⚠️ Warning: Token usage is close to the limit!")

        try:
            # generate a response based on the current chat context
            response = api_request_with_retry(
                model.generate_content,
                context,
                generation_config={"max_output_tokens": max_output},
            )
            if not response:
                continue  # the retry helper has already reported the failure

            response_text = response.text.strip()
            if not response_text:
                print("⚠️ Warning: Model returned an empty response.")
                continue

            # STOP CHECK: a reply without sentence-ending punctuation is treated as truncated
            if not response_text.endswith((".", "!", "?")):
                continuation = api_request_with_retry(
                    model.generate_content,
                    "Continue from: " + response_text[-50:],  # last 50 characters mark the cut-off point
                    generation_config={"max_output_tokens": max_output},
                )
                if continuation:
                    response_text += " " + continuation.text.strip()

            # add AI response to chat history
            chat_history.append(response_text)

            # calculate and display token usage for the input and response
            input_tokens, output_tokens, total_tokens = get_token_usage(context, response_text)
            print(f"📌 Input Tokens: {input_tokens}, Output Tokens: {output_tokens}, Total Tokens: {total_tokens}")

            # print AI-generated response
            print(f"AI: {response_text}")

        except Exception as e:
            # keep the chat loop alive despite unexpected errors (e.g. reading response.text)
            print(f"❌ Error generating response: {e}")

In [None]:
# Start an interactive chat session; type "exit" or "quit" to end it.
chat_mode()

You: hello
📌 Input Tokens: 1, Output Tokens: 10, Total Tokens: 11
AI: Hello there! How can I help you today?
You: can you give me an information about API
📌 Input Tokens: 21, Output Tokens: 803, Total Tokens: 824
AI: API stands for **Application Programming Interface**.  Think of it as a messenger that allows different software systems to talk to each other and exchange information.  It defines how software components should interact, allowing developers to leverage functionalities of existing applications without needing to know the complex inner workings.

Here's a breakdown of key information about APIs:

**What APIs do:**

* **Enable communication:** APIs act as intermediaries, facilitating the transfer of data and requests between different software systems.  They provide a standardized way for applications to communicate, regardless of their underlying technology.
* **Expose functionality:** APIs allow developers to access specific features or data of an application without needi

In [None]:
# Start another chat session (each call begins with an empty history).
chat_mode()

You: quit
Chat ended.


# API Request Testing (Text Generation)

In [None]:
# Rate-limit stress test: issue up to 50 generate_text() calls back to back.
# Each prompt is sent on its own; no history is carried from one call to the next.
for i in range(50):
    print(f"Test Message {i+1}")
    response = generate_text(f"This is message {i+1} in a long conversation.")

    # generate_text() returns None on failure: stop at the first request that yields nothing
    if not response:
        print("❌ Test Failed: No Response Generated!")
        break

Test Message 1




⚠ API Error Message: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
⚠️ Rate limit exceeded! Waiting for 2 seconds... (1/3)




⚠️ Rate limit exceeded! Waiting for 4 seconds... (2/3)




⚠️ Rate limit exceeded! Waiting for 8 seconds... (3/3)
❌ Maximum number of retries reached. API request failed.
❌ Test Failed: No Response Generated!
