In [None]:
import openai
import os
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))


In [None]:
import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from openai import OpenAI
from collections import Counter
from numpy.linalg import norm

# NOTE: the OpenAI client (`client`) is created in the previous cell; run that cell first.

# Example grant descriptions (REPLACE WITH YOUR DATA)
# One free-text description per grant. Later steps refer to grants by their
# position in this list (embeddings and cluster labels follow the same order),
# so keep the order stable once embeddings have been computed.
grant_descriptions = [
    "This grant supports innovative research in renewable energy with a focus on community-driven sustainability projects.",
    "Funding opportunity for digital education innovation and the development of cutting-edge online learning platforms.",
    "This grant is dedicated to advancing healthcare research, focusing on improving patient outcomes and innovative medical technologies.",
    "Supports community development projects that promote local economic growth and social entrepreneurship.",
    "Grants for research in artificial intelligence, machine learning, and their applications in real-world problem solving.",
    # ... more grant descriptions
]

def get_openai_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for a single text and return its vector.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = client.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return first_item.embedding

# 1. Compute Embeddings
# One embedding request per description, stacked into a float32 array whose
# rows follow the order of grant_descriptions.
embeddings = np.array(
    [get_openai_embedding(description) for description in grant_descriptions]
).astype("float32")

# 2. Determine Optimal k and Cluster
def find_optimal_clusters(embeddings, min_k=2, max_k=10):
    """Pick the number of KMeans clusters with the highest silhouette score.

    Every k from ``min_k`` up to ``min(max_k, n_samples - 1)`` is tried: the
    silhouette score needs fewer clusters than samples. The score of each
    candidate is printed.

    Args:
        embeddings: Sequence or array with one row per sample.
        min_k: Smallest number of clusters to try.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        The k with the best silhouette score (the smallest such k on ties).
        When there are too few samples to evaluate any candidate, ``min_k``
        limited to at most ``n_samples`` (and at least 1) is returned, so the
        result never asks ``KMeans`` for more clusters than there are samples.
    """
    n_samples = len(embeddings)
    upper_k = min(max_k, n_samples - 1)

    best_k = None
    best_score = -1
    for k in range(min_k, upper_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        print(f"k={k}, silhouette score={score:.4f}")
        if best_k is None or score > best_score:
            best_score = score
            best_k = k

    if best_k is None:
        # Nothing could be scored; fall back to a k that KMeans can still fit.
        return max(1, min(min_k, n_samples))
    return best_k

# Fit the final model with the selected k and assign every grant to a cluster.
optimal_k = find_optimal_clusters(embeddings)
kmeans = KMeans(n_clusters=optimal_k, n_init='auto', random_state=42)
labels = kmeans.fit_predict(embeddings)

# Group the descriptions by cluster id; labels follow the order of grant_descriptions.
clusters = {cluster_index: [] for cluster_index in range(optimal_k)}
for label, description in zip(labels, grant_descriptions):
    clusters[label].append(description)

# 3. Generate Categories Dynamically
def generate_categories(cluster_texts, max_words=5, model="gpt-4"):
    """Ask a chat model for key themes that summarize a cluster of grants.

    Args:
        cluster_texts: Iterable of grant description strings in one cluster.
        max_words: Number of key concepts/themes requested in the prompt.
        model: Chat model to query (for example "gpt-4" or "gpt-3.5-turbo").

    Returns:
        A list of category strings parsed from the comma-separated reply.
        Surrounding whitespace is stripped and empty entries are dropped, so
        an empty reply yields an empty list.
    """
    # Joined outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    joined_descriptions = "\n\n".join(cluster_texts)
    prompt = f"""Summarize the following grant descriptions and extract {max_words} key concepts or themes that best represent the overall topic of these grants. Return them as a comma-separated list.

    Grant Descriptions:
    {joined_descriptions}

    Key Concepts/Themes:"""

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )
    # NOTE(review): the message content appears to be optional in the API
    # response; treat a missing reply as an empty one instead of crashing.
    categories_string = (completion.choices[0].message.content or "").strip()
    # Skip blanks produced by trailing or doubled commas.
    return [cat.strip() for cat in categories_string.split(",") if cat.strip()]

# 4. Label Clusters & Generate Categories
# For each cluster, ask the model for its themes, remember them by cluster id,
# and print the themes followed by the grants that belong to the cluster.
cluster_labels = {}
for cluster_id in clusters:
    texts = clusters[cluster_id]
    categories = cluster_labels[cluster_id] = generate_categories(texts)

    print(f"\nCluster {cluster_id} - Categories: {categories}")
    for text in texts:
        print(f" - {text}")

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    The dot product is divided by the product of the two vector norms plus a
    small constant (1e-8), which keeps the division defined when either
    vector has zero length.
    """
    numerator = np.dot(vec1, vec2)
    denominator = norm(vec1) * norm(vec2) + 1e-8
    return numerator / denominator

# 5. Extract Features Based on Generated Categories
def extract_features_dynamic(description, categories, model="gpt-4"):
    """Rate how relevant ``description`` is to each category using a chat model.

    One chat-completion request is made per category. The reply is parsed
    into a number and limited to the 0-1 range that the prompt asks for.

    Args:
        description: Text to be rated.
        categories: Iterable of category names; each becomes a key of the
            result (a repeated name keeps the rating of its last occurrence).
        model: Chat model to query.

    Returns:
        A dict mapping each category to a float in [0, 1]. A category whose
        reply cannot be interpreted gets 0.0 and a warning is printed.
    """
    features = {}
    for category in categories:
        prompt = f"""Rate the following description from 0 to 1 on how relevant it is to the category: {category}.

        Description:
        {description}

        Rating (0-1):"""

        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
        )
        try:
            features[category] = _parse_rating(completion.choices[0].message.content)
        except (ValueError, TypeError):  # Reply is not an interpretable number
            print(f"Warning: Could not convert rating for {category}. Setting to 0.0")
            features[category] = 0.0
        except Exception as e:
            # Best effort: an unexpected response shape (e.g. missing content)
            # must not abort the remaining categories.
            print(f"Warning: Error getting rating for {category}: {e}. Setting to 0.0")
            features[category] = 0.0
    return features


def _parse_rating(reply):
    """Turn a model reply into a rating in [0, 1], or raise ValueError."""
    import re

    text = reply.strip()
    try:
        value = float(text)
    except ValueError:
        # Tolerate decoration such as "0.8." or "Rating: 0.8", but only when
        # the reply holds exactly one number and it already lies in range;
        # anything else (e.g. "8/10", "80%") is too ambiguous to guess at.
        numbers = re.findall(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)", text)
        if len(numbers) != 1:
            raise ValueError(f"no unambiguous number in reply: {reply!r}") from None
        value = float(numbers[0])
        if not 0.0 <= value <= 1.0:
            raise ValueError(f"rating out of range in reply: {reply!r}") from None
    if value != value:  # NaN never equals itself
        raise ValueError("rating is NaN")
    # A bare numeric reply outside the requested range is clamped into it.
    return min(1.0, max(0.0, value))


# 6. Project Proposal and Feature Extraction
project_proposal = """
Our project aims to develop an AI-powered system for early disease detection using cutting-edge machine learning algorithms.  We focus on improving patient outcomes in underserved communities.
"""

# Find the most similar cluster (for dynamic category selection)
project_embedding = get_openai_embedding(project_proposal)
best_cluster_id = -1
highest_similarity = -1

for cluster_id in clusters:
    # Average the embeddings of the grants in each cluster. The rows computed
    # in step 1 are reused (selected through the KMeans labels) instead of
    # requesting a fresh embedding for every grant again.
    cluster_embeddings = embeddings[labels == cluster_id]
    if len(cluster_embeddings) == 0:
        # An empty cluster has no mean vector to compare against.
        continue
    cluster_embedding = cluster_embeddings.mean(axis=0)
    similarity = cosine_similarity(project_embedding, cluster_embedding)

    if similarity > highest_similarity:
        highest_similarity = similarity
        best_cluster_id = cluster_id

if best_cluster_id != -1:
    relevant_categories = cluster_labels[best_cluster_id]
    project_features = extract_features_dynamic(project_proposal, relevant_categories)

    # 7. Calculate Similarity (example with one grant)
    grant_index = 0  # Choose which grant to compare to
    # Rate the grant on the SAME categories as the project. Rating it on its
    # own cluster's categories would compare unrelated axes whenever the grant
    # sits in a different cluster, and fails outright if the lists differ in
    # length.
    grant_categories = relevant_categories
    grant_features = extract_features_dynamic(grant_descriptions[grant_index], grant_categories)

    # Build both vectors in one shared category order.
    feature_names = list(project_features)
    project_vector = np.array([project_features[name] for name in feature_names])
    grant_vector = np.array([grant_features[name] for name in feature_names])
    similarity_score = cosine_similarity(project_vector, grant_vector)

    print("\nProject Features:", project_features)
    print("Similarity Score:", similarity_score)

else:
    print("No suitable cluster found for the project proposal.")


def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    This repeats the ``cosine_similarity`` definition that appears earlier in
    the file; from this point on the name is bound to this copy. A small
    constant (1e-8) is added to the denominator so that zero-length vectors
    do not cause a division by zero.
    """
    magnitude_product = norm(vec1) * norm(vec2)
    return np.dot(vec1, vec2) / (magnitude_product + 1e-8)


# Persist the results: the themes generated for each cluster and the grant
# descriptions assigned to it, written as indented JSON (adapt as needed).
output = dict(cluster_labels=cluster_labels, clusters=clusters)

with open("grant_clusters.json", "w") as f:
    json.dump(output, f, default=list, indent=2)

k=2, silhouette score=0.0683
k=3, silhouette score=0.0162
k=4, silhouette score=0.0124

Cluster 0 - Categories: ['Digital Education Innovation', 'Online Learning Platforms', 'Healthcare Research', 'Artificial Intelligence and Machine Learning Research', 'Real-World Problem Solving Applications']
 - Funding opportunity for digital education innovation and the development of cutting-edge online learning platforms.
 - This grant is dedicated to advancing healthcare research, focusing on improving patient outcomes and innovative medical technologies.
 - Grants for research in artificial intelligence, machine learning, and their applications in real-world problem solving.

Cluster 1 - Categories: ['Innovative Research', 'Renewable Energy', 'Community-Driven Sustainability', 'Local Economic Growth', 'Social Entrepreneurship']
 - This grant supports innovative research in renewable energy with a focus on community-driven sustainability projects.
 - Supports community development projects that