In [10]:
# Install if not already
!pip install google-generativeai pydantic --quiet



[0m

In [11]:
import os
import re
import ast
import json
from typing import List
from pydantic import BaseModel
from google import generativeai as genai

# Option 1: Set manually (RECOMMENDED for Jupyter)
os.environ["GEMINI_API_KEY"] = "AIzaSyDtEU5NR-019IdaTWDQznKNxlbUvRo7bUY"  # üîÅ Replace with your actual API key

# Configure Gemini
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))



In [12]:
class Permutation(BaseModel):
    name: str
    phone: str
    paytm_link: str
    country: str
    age: str
    email: str


In [14]:
def generate_permutations(user_data: dict, num: int) -> List[Permutation]:
    model = genai.GenerativeModel("gemini-2.0-flash")
    prompt = f"""
You are an expert data variation generator. Generate exactly {num} realistic permutations of this user data:

{json.dumps(user_data)}

Each permutation must:
- Vary the name (typos, initials, case, swapped, spacing, etc.)
- Vary at least one other field: phone, email, paytm_link, country, or age.

Output MUST be a JSON array, like:
[
  {{
    "name": "string",
    "phone": "string",
    "paytm_link": "string",
    "country": "string",
    "age": "string",
    "email": "string"
  }},
  ...
]
Use only double quotes and no extra text or markdown.
"""
    response = model.generate_content(prompt)
    raw = response.text  # ‚úÖ correct attribute

    raw = re.sub(r"^```json|```$", "", raw.strip(), flags=re.IGNORECASE).strip()
    obj = ast.literal_eval(raw)
    return [Permutation(**d) for d in obj]


In [17]:
input_user = {
  "name": "John Doe",
  "phone": "+1234567890",
  "paytm_link": "paytm://1234567890",
  "country": "USA",
  "age": "30",
  "email": "john.doe@example.com"
}

res = generate_permutations(input_user, num=5)
for i, p in enumerate(res, 1):
    print(f"\n‚úÖ Permutation {i}:\n{p.model_dump_json(indent=2)}")



‚úÖ Permutation 1:
{
  "name": "Jon Doe",
  "phone": "+1234567890",
  "paytm_link": "paytm://1234567890",
  "country": "USA",
  "age": "30",
  "email": "jon.doe@example.com"
}

‚úÖ Permutation 2:
{
  "name": "J. Doe",
  "phone": "+1987654321",
  "paytm_link": "paytm://1234567890",
  "country": "USA",
  "age": "30",
  "email": "john.doe@example.com"
}

‚úÖ Permutation 3:
{
  "name": "Doe, John",
  "phone": "+1234567890",
  "paytm_link": "paytm://1234567890",
  "country": "Canada",
  "age": "30",
  "email": "john.doe@example.com"
}

‚úÖ Permutation 4:
{
  "name": "John D",
  "phone": "+1234567890",
  "paytm_link": "paytm://1234567890",
  "country": "USA",
  "age": "35",
  "email": "john.doe@example.com"
}

‚úÖ Permutation 5:
{
  "name": "Johnn Doe",
  "phone": "+1234567890",
  "paytm_link": "paytm://0987654321",
  "country": "USA",
  "age": "30",
  "email": "john.doe@example.com"
}


In [27]:
import os
import re
import json
from typing import List
from pydantic import BaseModel
from google import generativeai as genai

# ==== Step 1: Configure Gemini API ====
# Optionally set key here or use dotenv
os.environ["GEMINI_API_KEY"] = "AIzaSyDtEU5NR-019IdaTWDQznKNxlbUvRo7bUY"  # Replace with your actual key
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# ==== Step 2: Pydantic Model ====
class Permutation(BaseModel):
    name: str
    phone: str
    paytm_link: str
    country: str
    age: str
    email: str

# ==== Step 3: Generate Permutations ====
def generate_permutations(user_data: dict, num: int) -> List[Permutation]:
    model = genai.GenerativeModel("gemini-2.0-flash")

    prompt = f"""
You are an expert data variation generator. Generate exactly {num} realistic permutations of this user data:

{json.dumps(user_data)}

Each permutation must:
- Vary the name (typos, initials, case, swapped, spacing, etc.)
- Vary at least one other field: phone, email, paytm_link, country, or age.

Output MUST be a JSON array, like:
[
  {{
    "name": "string",
    "phone": "string",
    "paytm_link": "string",
    "country": "string",
    "age": "string",
    "email": "string"
  }},
  ...
]
Use only double quotes and no extra text or markdown.
"""

    response = model.generate_content(prompt)
    raw = response.text

    # Cleanup: remove markdown or backticks if present
#     raw = re.sub(r"```[\s\S]*?```", "", raw).strip()
    raw = re.sub(r"^```json|```$", "", raw.strip(), flags=re.IGNORECASE).strip()
    # Parse response
    try:
        json_data = json.loads(raw)
        permutations = [Permutation(**entry) for entry in json_data]
    except Exception as e:
        print("‚ùå Error parsing Gemini output:", e)
        print("Raw response:\n", raw)
        return []

    # ==== Step 4: Save to File ====
    output_path = "siriusAI/scrap_tools-main/scrap_tools-main/final/output/permuted_user_data.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w") as f:
        json.dump([p.model_dump() for p in permutations], f, indent=2)
        print(f"[‚úÖ] Saved {len(permutations)} permutations to: {output_path}")
        
    return permutations

# ==== Step 5: Example Run ====
if __name__ == "__main__":
    input_user = {
        "name": "John Doe",
        "phone": "+1234567890",
        "paytm_link": "paytm://1234567890",
        "country": "USA",
        "age": "30",
        "email": "john.doe@example.com"
    }

    res = generate_permutations(input_user, num=5)
    print("PRINT THE RES :")
    print(res)



[‚úÖ] Saved 5 permutations to: siriusAI/scrap_tools-main/scrap_tools-main/final/output/permuted_user_data1.json
PRINT THE RES :
[Permutation(name='Jon Doe', phone='+1234567890', paytm_link='paytm://1234567890', country='USA', age='30', email='jon.doe@example.com'), Permutation(name='J. Doe', phone='+1987654321', paytm_link='paytm://1234567890', country='USA', age='30', email='john.doe@example.com'), Permutation(name='Doe, John', phone='+1234567890', paytm_link='paytm://1234567890', country='Canada', age='30', email='john.doe@example.com'), Permutation(name='John D', phone='+1234567890', paytm_link='paytm://1234567890', country='USA', age='35', email='john.doe@example.com'), Permutation(name='Johnn Doe', phone='+1234567890', paytm_link='paytm://9876543210', country='USA', age='30', email='john.doe@example.com')]


In [31]:
import os
import re
import json
from typing import List
from pydantic import BaseModel
from google import generativeai as genai

# ==== Step 1: Configure Gemini API ====
os.environ["GEMINI_API_KEY"] = "AIzaSyDtEU5NR-019IdaTWDQznKNxlbUvRo7bUY"
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# ==== Step 2: Pydantic Model ====
class Permutation(BaseModel):
    name: str
    phone: str
    paytm_link: str
    country: str
    age: str
    email: str

# ==== Step 3: Generate Permutations ====
def generate_permutations(user_data: dict, num: int) -> List[Permutation]:
    model = genai.GenerativeModel("gemini-2.0-flash")

    prompt = f"""
You are an expert data variation generator. Generate exactly {num} realistic permutations of this user data:

{json.dumps(user_data)}

Each permutation must:
- Vary the name (typos, initials, case, swapped, spacing, etc.)
- Vary at least one other field: phone, email, paytm_link, country, or age.

Output MUST be a JSON array, like:
[
  {{
    "name": "string",
    "phone": "string",
    "paytm_link": "string",
    "country": "string",
    "age": "string",
    "email": "string"
  }},
  ...
]
Use only double quotes and no extra text or markdown.
"""

    response = model.generate_content(prompt)
    raw = response.text

    raw = re.sub(r"^```json|```$", "", raw.strip(), flags=re.IGNORECASE).strip()

    try:
        json_data = json.loads(raw)
        permutations = [Permutation(**entry) for entry in json_data]
    except Exception as e:
        print("‚ùå Error parsing Gemini output:", e)
        print("Raw response:\n", raw)
        return []

    # ==== Step 4: Save to File ====
    output_path = "/workspace/siriusAI/scrap_tools-main/scrap_tools-main/final/output/permuted_user_data.json"

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w") as f:
        json.dump([p.model_dump() for p in permutations], f, indent=2)
        print(f"[‚úÖ] Saved {len(permutations)} permutations to: {output_path}")

    return permutations

# ==== Step 5: Example Run ====
if __name__ == "__main__":
    input_user = {
        "name": "John Doe",
        "phone": "+1234567890",
        "paytm_link": "paytm://1234567890",
        "country": "USA",
        "age": "30",
        "email": "john.doe@example.com"
    }

    res = generate_permutations(input_user, num=5)
    print("PRINT THE RES :")
    print(res)


[‚úÖ] Saved 5 permutations to: /workspace/siriusAI/scrap_tools-main/scrap_tools-main/final/output/permuted_user_data1.json
PRINT THE RES :
[Permutation(name='Jon Doe', phone='+1234567890', paytm_link='paytm://1234567890', country='USA', age='31', email='john.doe@example.com'), Permutation(name='J. Doe', phone='+1987654321', paytm_link='paytm://1234567890', country='USA', age='30', email='john.doe@example.com'), Permutation(name='Doe, John', phone='+1234567890', paytm_link='paytm://1234567890', country='Canada', age='30', email='john.doe@example.com'), Permutation(name='John D', phone='+1234567890', paytm_link='paytm://1234567890', country='USA', age='30', email='johndoe@example.com'), Permutation(name='Johnn Doe', phone='+1234567890', paytm_link='paytm://0987654321', country='USA', age='30', email='john.doe@example.com')]


In [32]:
import os
import json
from pathlib import Path
from serpapi import GoogleSearch
from dotenv import load_dotenv

# Absolute path to your .env file
env_path = Path("/workspace/siriusAI/temp1/project-root/src/latest_ai_development/.env")

# Load environment variables
load_dotenv(dotenv_path=env_path)

# Assign SERP API key
SERP_API_KEY = os.getenv("SERP_API_KEY")

# Check if keys loaded
print("OPENROUTER_API_KEY:", os.getenv("OPENROUTER_API_KEY"))
print("SERP_API_KEY:", SERP_API_KEY)

def run_search(query, limit=15):
    from dotenv import load_dotenv
    load_dotenv(dotenv_path="/workspace/siriusAI/temp1/project-root/src/latest_ai_development/.env")
    SERP_API_KEY = os.getenv("SERP_API_KEY")

    if not SERP_API_KEY:
        raise ValueError("‚ùå SERP_API_KEY not found in .env file")


    print(f"\nüîç Starting search for query: {query}")
    all_urls = set()
    results_data = []

    for search_type in ["search", "news"]:
        print(f"‚û°Ô∏è Running {search_type} search...")

        params = {
            "api_key": SERP_API_KEY,
            "engine": "google",
            "q": query,
            "type": search_type,
            "google_domain": "google.com",
            "gl": "us",
            "hl": "en"
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            print(f"‚úÖ Got response for {search_type} search.")

            organic_results = results.get("organic_results", [])
            print(f"üîé Found {len(organic_results)} organic results in {search_type} search.")

            for result in organic_results:
                link = result.get("link")
                title = result.get("title", "No Title")
                snippet = result.get("snippet", "No Snippet")

                if link and link not in all_urls:
                    all_urls.add(link)
                    results_data.append({"title": title, "link": link, "snippet": snippet})
                    print(f"üîó Added: {link}")

                if len(all_urls) >= limit:
                    print("üö¶ Reached URL limit. Stopping early.")
                    break

        except Exception as e:
            print(f"‚ùå Error during {search_type} search: {e}")

        if len(all_urls) >= limit:
            break

    # Save to file
    os.makedirs("output", exist_ok=True)
    with open("output/search_results.json", "w") as f:
        json.dump(results_data, f, indent=2)

    print(f"‚úÖ Saved {len(results_data)} search results to output/search_results.json")
    return results_data
query= "Vjay India money laundering"
run_search(query, limit=15)


OPENROUTER_API_KEY: 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
SERP_API_KEY: 3e9990b0cbe2c42aab17c3d7d1732f83bdad666dac47c4075a2b597dfeaaa5fc

üîç Starting search for query: Vjay India money laundering
‚û°Ô∏è Running search search...
‚úÖ Got response for search search.
üîé Found 9 organic results in search search.
üîó Added: https://en.wikipedia.org/wiki/Vijay_Mallya
üîó Added: https://www.occrp.org/en/news/uk-indias-king-of-the-good-times-re-arrested-for-money-laundering
üîó Added: https://www.business-standard.com/about/who-is-vijay-mallya
üîó Added: https://www.hindustantimes.com/trending/indian-billionaire-publicly-backs-vijay-mallya-why-is-he-still-a-political-punching-bag-101749180494246.html
üîó Added: https://www.fortuneindia.com/business-news/call-me-a-fugitive-but-wheres-the-theft-vijay-mallya-on-debts-extradition-and-more/123859
üîó Added: https://m.economictimes.com/news/india/thief-or-no-thief-what-cases-say-about-vijay-mallya/article

[{'title': 'Vijay Mallya',
  'link': 'https://en.wikipedia.org/wiki/Vijay_Mallya',
  'snippet': 'The Enforcement Directorate of India also filed a money laundering case against him in March 2016 for allegedly sending abroad some ‚Çπ9 billion (US$110 million) ...'},
 {'title': "UK: India's 'King of the Good Times' Re-Arrested for Money ...",
  'link': 'https://www.occrp.org/en/news/uk-indias-king-of-the-good-times-re-arrested-for-money-laundering',
  'snippet': 'Indian tycoon Vijay Mallya was arrested in the UK on Tuesday for the second time this year on behalf of Indian authorities who allege that he diverted a state ...'},
 {'title': 'Who is Vijay Mallya, Vijay Mallya Case ...',
  'link': 'https://www.business-standard.com/about/who-is-vijay-mallya',
  'snippet': 'Mallya, who owes 17 Indian banks an estimated Rs 9,000 crore, is accused of fraud and money laundering in the country. Also a former Rajya Sabha member, Mallya ...'},
 {'title': "Indian billionaire publicly backs Vijay Mally

In [34]:
import os
import json
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List
from google import genai

# Load .env
load_dotenv(dotenv_path="/workspace/siriusAI/temp1/project-root/src/latest_ai_development/.env")

# Initialize Gemini client
client = genai.Client()

# Define output schema
class SummaryItem(BaseModel):
    url: str
    summary: str

def summarize(topic: str):
    """
    Summarize AML-related content from scraped data using Gemini (structured).
    Outputs JSON AML report.
    """
    print(f"[UTIL] Summarizing research on topic: {topic}")

    scraped_file = "output/scraped_data.json"
    if not os.path.exists(scraped_file):
        raise FileNotFoundError("‚ùå scraped_data.json not found. Run scraping first.")

    with open(scraped_file, "r") as f:
        scraped_results = json.load(f)

    summaries: List[SummaryItem] = []

    for idx, item in enumerate(scraped_results, start=1):
        url = item.get("url")
        content = item.get("content", "")

        if not content.strip():
            print(f"‚ö†Ô∏è Skipping {url}, no content found.")
            continue

        print(f"‚û°Ô∏è Summarizing content from URL {idx}/{len(scraped_results)}: {url}")

        prompt = f"""
You are a helpful assistant for Anti-Money Laundering (AML) investigators.

Your job is to summarize the following webpage content, focusing only on:
- Money laundering
- Financial fraud
- Suspicious financial activity

Return exactly two fields:
1. `url`: The page URL.
2. `summary`: The extracted summary related to AML. If nothing is relevant, return "No relevant AML content found."

### URL:
{url}

### Page Content:
{content[:2000]}
"""

        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt,
                config={
                "response_mime_type": "application/json",
                "response_schema": SummaryItem,
                }
            )


            summary_item: SummaryItem = response.parsed
            summaries.append(summary_item)
            print(f"‚úÖ Summary generated for {url}")

        except Exception as e:
            print(f"‚ùå Error summarizing {url}: {e}")
            summaries.append(SummaryItem(url=url, summary=f"Error: {e}"))

    # Save final AML report
    aml_report = {
        "topic": topic,
        "total_urls": len(scraped_results),
        "summaries": [item.model_dump() for item in summaries]

    }

    os.makedirs("output", exist_ok=True)
    with open("output/aml_report.json", "w") as f:
        json.dump(aml_report, f, indent=2)

    print("[UTIL] ‚úÖ AML report saved to output/aml_report.json")
    return aml_report

topic="Anti Money Laundering"
summarize(topic)


[UTIL] Summarizing research on topic: Anti Money Laundering
‚û°Ô∏è Summarizing content from URL 1/8: https://www.occrp.org/en/news/uk-indias-king-of-the-good-times-re-arrested-for-money-laundering
‚úÖ Summary generated for https://www.occrp.org/en/news/uk-indias-king-of-the-good-times-re-arrested-for-money-laundering
‚û°Ô∏è Summarizing content from URL 2/8: https://www.hindustantimes.com/trending/indian-billionaire-publicly-backs-vijay-mallya-why-is-he-still-a-political-punching-bag-101749180494246.html
‚úÖ Summary generated for https://www.hindustantimes.com/trending/indian-billionaire-publicly-backs-vijay-mallya-why-is-he-still-a-political-punching-bag-101749180494246.html
‚û°Ô∏è Summarizing content from URL 3/8: https://www.fortuneindia.com/business-news/call-me-a-fugitive-but-wheres-the-theft-vijay-mallya-on-debts-extradition-and-more/123859
‚úÖ Summary generated for https://www.fortuneindia.com/business-news/call-me-a-fugitive-but-wheres-the-theft-vijay-mallya-on-debts-extradition

{'topic': 'Anti Money Laundering',
 'total_urls': 8,
 'summaries': [{'url': 'https://www.occrp.org/en/news/uk-indias-king-of-the-good-times-re-arrested-for-money-laundering',
   'summary': 'Indian tycoon Vijay Mallya was re-arrested in the UK on charges of money laundering. Authorities allege he diverted a state loan intended for his airline to fund his Formula 1 team, indicating suspicious financial activity and potential financial fraud.'},
  {'url': 'https://www.hindustantimes.com/trending/indian-billionaire-publicly-backs-vijay-mallya-why-is-he-still-a-political-punching-bag-101749180494246.html',
   'summary': 'No relevant AML content found.'},
  {'url': 'https://www.fortuneindia.com/business-news/call-me-a-fugitive-but-wheres-the-theft-vijay-mallya-on-debts-extradition-and-more/123859',
   'summary': 'Vijay Mallya, a fugitive businessman, has responded to allegations of theft and money laundering. He is facing extradition to India and has addressed the various charges leveled aga