In [1]:
# Install Groq SDK
!pip install groq

# Imports
from groq import Groq
import json


Collecting groq
  Downloading groq-0.31.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.31.1-py3-none-any.whl (134 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.31.1


In [13]:
# 🔑 Securely enter Groq API key (hidden input)
from getpass import getpass
from groq import Groq

# Prompt user instead of hardcoding
GROQ_API_KEY = getpass("🔑 Enter your Groq API key: ")

# Initialize client
client = Groq(api_key=GROQ_API_KEY)

# Test connection
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",  # fast summarization model
    messages=[{"role": "user", "content": "Hello Groq, are you working?"}],
    max_tokens=50
)

print("✅ Groq connected:", response.choices[0].message.content)


🔑 Enter your Groq API key: ··········
✅ Groq connected: I'm not Groq, I am an AI assistant.


In [24]:
class ConversationManager:
    """
    A utility class to manage running chat history between user and assistant.
    Supports:
      - Appending new messages
      - Tracking total conversation length
      - Automatic summarization triggers (by message count or char length)
      - Replacing long history with concise summaries
    """

    def __init__(self, summarize_every_k=3, max_chars=500, keep_recent_after_summary=2):
        """
        Initialize conversation manager.

        Args:
            summarize_every_k (int): Number of new messages before forcing a summary.
            max_chars (int): Maximum allowed character count before forcing a summary.
            keep_recent_after_summary (int): Number of most recent messages to preserve
                                             after summarization.
        """
        self.history = []                      # Stores all messages in order (list of dicts)
        self.msg_count_since_summary = 0       # Counter for messages since last summary
        self.summarize_every_k = summarize_every_k
        self.max_chars = max_chars
        self.keep_recent_after_summary = keep_recent_after_summary

    def append(self, role, content):
        """
        Add a new message to the conversation history.

        Args:
            role (str): "user" or "assistant"
            content (str): The text content of the message
        """
        self.history.append({"role": role, "content": content})
        self.msg_count_since_summary += 1      # Increment counter for summary checks

    def total_chars(self):
        """
        Calculate the total number of characters in the conversation history.

        Returns:
            int: Total characters across all stored messages.
        """
        return sum(len(m["content"]) for m in self.history)

    def needs_summary(self):
        """
        Check if summarization should be triggered.

        Returns:
            bool: True if:
                  - Number of messages since last summary >= summarize_every_k
                  OR
                  - Total character count > max_chars
        """
        return (self.msg_count_since_summary >= self.summarize_every_k) or \
               (self.total_chars() > self.max_chars)

    def apply_summary(self, summary_text):
        """
        Replace long conversation history with a concise summary + last few preserved messages.

        Args:
            summary_text (str): Summarized version of the conversation.
        """
        preserved = self.history[-self.keep_recent_after_summary:]  # Keep last N messages
        self.history = [{"role": "assistant", "content": f"SUMMARY: {summary_text}"}] + preserved
        self.msg_count_since_summary = 0  # Reset message counter

    def get_history(self):
        """
        Get the current conversation history.

        Returns:
            list: Conversation history as a list of message dicts.
        """
        return self.history


In [15]:
def summarize_conversation(history):
    """Use Groq LLM to summarize the conversation history"""
    conv_text = "\n".join([f"{m['role']}: {m['content']}" for m in history])

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": "You are a helpful summarization assistant."},
            {"role": "user", "content": f"Summarize this conversation:\n{conv_text}"}
        ],
        max_tokens=150
    )

    return response.choices[0].message.content


In [16]:
cm = ConversationManager(summarize_every_k=3, max_chars=500)

sample_messages = [
    "Hi, I need help booking a flight to New York.",
    "Can you also find hotels nearby?",
    "Oh, and I’ll need a rental car once I land.",
    "I prefer staying near Central Park."
]

for msg in sample_messages:
    cm.append("user", msg)
    print(f"User: {msg}")

    if cm.needs_summary():
        summary = summarize_conversation(cm.get_history())
        cm.apply_summary(summary)
        print("\n--- ✅ Summary Applied ---")
        print(json.dumps(cm.get_history(), indent=2))


User: Hi, I need help booking a flight to New York.
User: Can you also find hotels nearby?
User: Oh, and I’ll need a rental car once I land.

--- ✅ Summary Applied ---
[
  {
    "role": "assistant",
    "content": "SUMMARY: You need assistance with booking a flight to New York, and also require help finding nearby hotels and arranging a rental car upon arrival."
  },
  {
    "role": "user",
    "content": "Can you also find hotels nearby?"
  },
  {
    "role": "user",
    "content": "Oh, and I\u2019ll need a rental car once I land."
  }
]
User: I prefer staying near Central Park.


In [17]:
!pip install jsonschema




In [18]:
import json
import re
from jsonschema import validate, ValidationError, FormatChecker

# JSON schema for extracted user info
user_info_schema = {
    "title": "UserInfo",
    "type": "object",
    "properties": {
        "name": {"type": ["string", "null"], "description": "Full name of the user."},
        "email": {
            "type": ["string", "null"],
            "format": "email",
            "description": "Email address of the user."
        },
        "phone": {
            "type": ["string", "null"],
            # allows digits, +, -, spaces, parens
            "pattern": r"^[0-9\-\+\s\(\)]{7,25}$",
            "description": "Phone number of the user."
        },
        "location": {"type": ["string", "null"], "description": "City or location of the user."},
        "age": {
            "type": ["integer", "null"],
            "minimum": 0,
            "description": "Age of the user in numeric form only (e.g., 24)."
        }
    },
    "required": ["name", "email", "phone", "location", "age"]
}


# 🔧 Cleaning function to normalize extracted JSON before validation
def clean_user_info(extracted: dict) -> dict:
    age_val = extracted.get("age")

    if isinstance(age_val, str):
        # Extract digits from string like "24 years old" → 24
        digits = re.findall(r"\d+", age_val)
        extracted["age"] = int(digits[0]) if digits else None

    elif isinstance(age_val, (int, float)):
        extracted["age"] = int(age_val)

    else:
        extracted["age"] = None

    return extracted


In [19]:
def conversation_to_text(history):
    """Build a human-readable conversation string from history (list of {role,content})."""
    return "\n".join([f"{m['role'].upper()}: {m['content']}" for m in history])


In [20]:
from jsonschema import FormatChecker

def extract_user_info_from_text(text, client, schema=user_info_schema, max_retries=1):
    """
    Extract structured user info JSON from `text` using the Groq client.
    Returns a dict conforming to `schema` (keys present; values may be None).
    """

    # System / user instructions: strict JSON only, null for missing fields
    system_prompt = (
        "You are a strict JSON extractor. Given a conversation, return a JSON object "
        "with exactly these keys: name, email, phone, location, age. "
        "If a value is missing, put null. Do NOT add any extra keys or commentary. "
        "Ensure JSON is valid and follows the schema: "
        "name (string), email (email string or null), phone (string or null), "
        "location (string or null), age (integer or null)."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Extract user info from this conversation:\n\n{text}"}
    ]

    # Use function-style schema to bias structured output (OpenAI-compatible call)
    functions = [
        {
            "name": "extract_user_info",
            "description": "Return a JSON object matching the schema",
            "parameters": schema
        }
    ]

    # Single attempt + optional retry
    for attempt in range(max_retries + 1):
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=messages,
            functions=functions,
            function_call={"name": "extract_user_info"},
            max_tokens=300
        )

        # Grab the returned message
        choice = response.choices[0]
        message = choice.message

        # Preferred: function_call.arguments (stringified JSON)
        raw_json = None
        # try multiple access patterns for compatibility
        try:
            # many SDKs provide .message.function_call.arguments
            raw_json = message.function_call.arguments
        except Exception:
            # fallback: some SDKs give dict-like .message.get(...)
            try:
                raw_json = message.get("function_call", {}).get("arguments")
            except Exception:
                raw_json = None

        # second fallback: message content
        if not raw_json:
            try:
                raw_json = message.content
            except Exception:
                try:
                    raw_json = message.get("content", "")
                except Exception:
                    raw_json = ""

        # Try parse JSON
        parsed = None
        try:
            parsed = json.loads(raw_json)
        except Exception as e:
            # If parsing failed, attempt to extract a JSON substring (best-effort)
            m = re.search(r"(\{.*\})", raw_json, flags=re.DOTALL)
            if m:
                try:
                    parsed = json.loads(m.group(1))
                except Exception:
                    parsed = None

        if parsed is None:
            # If we couldn't parse JSON, ask model (next loop iteration) to produce ONLY JSON.
            if attempt < max_retries:
                messages.append({"role": "assistant", "content": "Invalid or unparsable JSON received. Please return ONLY the JSON object matching the schema."})
                continue
            else:
                raise ValueError(f"Could not parse JSON from model output. Raw output:\n{raw_json}")

        # Mild coercion to handle common model quirks (age as string, phone with extra chars)
        def coerce_types(d):
            out = {}
            for k in ["name", "email", "phone", "location", "age"]:
                v = d.get(k, None)
                if v is None:
                    out[k] = None
                    continue
                # Trim whitespace
                if isinstance(v, str):
                    v = v.strip()
                    # empty strings -> null
                    if v == "":
                        out[k] = None
                        continue
                # age: try to coerce numeric strings
                if k == "age" and v is not None:
                    if isinstance(v, str):
                        if v.isdigit():
                            out[k] = int(v)
                        else:
                            # try to extract digits
                            ds = re.search(r"(\d+)", v)
                            out[k] = int(ds.group(1)) if ds else None
                    elif isinstance(v, (int, float)):
                        out[k] = int(v)
                    else:
                        out[k] = None
                elif k == "phone" and v is not None:
                    # keep only digits and plus sign; keep short formatting
                    s = re.sub(r"[^\d\+\(\)\-\s]", "", str(v))
                    out[k] = s if s != "" else None
                else:
                    out[k] = v
            return out

        parsed_coerced = coerce_types(parsed)

        # Validate against schema
        try:
            validate(instance=parsed_coerced, schema=schema, format_checker=FormatChecker())
            # If valid, ensure all keys exist (fill missing with None)
            for k in ["name", "email", "phone", "location", "age"]:
                parsed_coerced.setdefault(k, None)
            return parsed_coerced
        except ValidationError as ve:
            # If validation fails and retry is available, ask model to fix only the JSON
            if attempt < max_retries:
                # Add a short assistant prompt asking to fix JSON to match schema
                messages.append({
                    "role": "assistant",
                    "content": "The previous JSON failed validation. Please return a corrected JSON object that strictly follows the schema (name, email, phone, location, age). Use null for any missing values."
                })
                continue
            else:
                # Final failure: surface helpful debug info
                raise ValidationError(f"Validation failed after coercion. Error: {ve}\nParsed object (post-coercion): {parsed_coerced}")


In [21]:
sample_chats = [
    # full self-introductions
    "User: Hi, I'm Alice Johnson. My email is alice.j@example.com and my phone number is +91 98765 43210. I live in Mumbai and I'm 24 years old.",
    # slightly informal and missing punctuation
    "User: Hello, I'm Rakesh. Email rakesh.mail@gmail.com Phone: 9123456789 Based in Hyderabad Age 30",
    # partial info (age missing), checks null handling
    "User: Hey, I'm Priya from Bangalore. Contact: priya_27@outlook.com, mobile 9988776655."
]

results = []
for i, chat in enumerate(sample_chats, start=1):
    print(f"\n--- Sample chat #{i} ---")
    print(chat)
    try:
        extracted = extract_user_info_from_text(chat, client=client, schema=user_info_schema, max_retries=1)
        print("Extracted JSON:")
        print(json.dumps(extracted, indent=2))
        results.append(extracted)
    except Exception as e:
        print("Extraction failed:", str(e))



--- Sample chat #1 ---
User: Hi, I'm Alice Johnson. My email is alice.j@example.com and my phone number is +91 98765 43210. I live in Mumbai and I'm 24 years old.
Extracted JSON:
{
  "name": "Alice Johnson",
  "email": "alice.j@example.com",
  "phone": "+91 98765 43210",
  "location": "Mumbai",
  "age": 24
}

--- Sample chat #2 ---
User: Hello, I'm Rakesh. Email rakesh.mail@gmail.com Phone: 9123456789 Based in Hyderabad Age 30
Extracted JSON:
{
  "name": "Rakesh",
  "email": "rakesh.mail@gmail.com",
  "phone": "9123456789",
  "location": "Hyderabad",
  "age": 30
}

--- Sample chat #3 ---
User: Hey, I'm Priya from Bangalore. Contact: priya_27@outlook.com, mobile 9988776655.
Extracted JSON:
{
  "name": "Priya",
  "email": "priya_27@outlook.com",
  "phone": "9988776655",
  "location": "Bangalore",
  "age": null
}


In [22]:

from jsonschema import validate, ValidationError

final_results = []

for extracted in results:
    # Step 1: Clean the extracted dictionary (normalize age, etc.)
    cleaned = clean_user_info(extracted)

    # Step 2: Validate against schema
    try:
        validate(instance=cleaned, schema=user_info_schema, format_checker=FormatChecker())
        print("\n✅ Validation passed for:", cleaned["name"])
    except ValidationError as ve:
        print("\n❌ Validation failed:", ve.message)

    # Step 3: Append cleaned result
    final_results.append(cleaned)

# Step 4: Display final structured results
print("\n--- Final Cleaned & Validated Results ---")
print(json.dumps(final_results, indent=2))



✅ Validation passed for: Alice Johnson

✅ Validation passed for: Rakesh

✅ Validation passed for: Priya

--- Final Cleaned & Validated Results ---
[
  {
    "name": "Alice Johnson",
    "email": "alice.j@example.com",
    "phone": "+91 98765 43210",
    "location": "Mumbai",
    "age": 24
  },
  {
    "name": "Rakesh",
    "email": "rakesh.mail@gmail.com",
    "phone": "9123456789",
    "location": "Hyderabad",
    "age": 30
  },
  {
    "name": "Priya",
    "email": "priya_27@outlook.com",
    "phone": "9988776655",
    "location": "Bangalore",
    "age": null
  }
]


In [23]:
import pandas as pd

# Convert final structured results into a DataFrame
df = pd.DataFrame(final_results)

print("\n--- Final Results as DataFrame ---")
print(df)

# Optional: save results to CSV for reuse
df.to_csv("extracted_users.csv", index=False)
print("\n✅ Results saved to extracted_users.csv")



--- Final Results as DataFrame ---
            name                  email            phone   location   age
0  Alice Johnson    alice.j@example.com  +91 98765 43210     Mumbai  24.0
1         Rakesh  rakesh.mail@gmail.com       9123456789  Hyderabad  30.0
2          Priya   priya_27@outlook.com       9988776655  Bangalore   NaN

✅ Results saved to extracted_users.csv
