In [None]:
import requests

class MissingTokenError(ValueError):
    """Custom exception for missing GitHub token."""
    pass

def fetch_repos_issues_by_name(repo_name: str, owner: str, issue_count=5, token=None):
    """
    Fetch issues from a GitHub repository, sorted by creation date in descending order.

    Args:
        repo_name (str): The name of the repository.
        owner (str): The owner of the repository.
        issue_count (int): Number of issues to fetch. Default is 10.
        token (str): GitHub personal access token.

    Returns:
        dict: JSON response containing the issues.

    Raises:
        MissingTokenError: If the token is not provided.
        Exception: If the request fails for any other reason.
    """
    if token is None:
        raise MissingTokenError("GitHub token is required to authenticate the request.")

    query = f"""
    query {{
      repository(owner: "{owner}", name: "{repo_name}") {{
        issues(first: {issue_count}, orderBy: {{ field: CREATED_AT, direction: DESC }}) {{
          edges {{
            node {{
              title
              number
              createdAt
              url
              bodyText
              comments(first: 10) {{
                edges {{
                  node {{
                    author {{
                      login
                    }}
                    bodyText
                    createdAt
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    response = requests.post(
        "https://api.github.com/graphql",
        json={"query": query},
        headers=headers
    )

    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Request failed with status code {response.status_code}: {response.json()}")


In [38]:
results = fetch_repos_issues_by_name('scikit-learn', 'scikit-learn', token=None)
print(results)



In [13]:
!pip install openai langchain pinecone-client


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0

In [17]:
from pinecone import Pinecone,ServerlessSpec

# Initialize Pinecone
pc = Pinecone(api_key=None,  # Replace with your Pinecone API key
)

# # Create or connect to an index

# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(index_name, dimension=1536)  # Use 1536 for OpenAI embeddings
# index = pinecone.Index(index_name)

index_name = "github-issues"
pc.create_index(
    name=index_name,
    dimension=1536, # Replace with your model dimensions
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index(index_name)

In [19]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.15-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain-0.3.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.31 (from langchain-community)
  Downloading langchain_core-0.3.31-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.0-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [24]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [80]:
from langchain.embeddings import OpenAIEmbeddings
from pinecone import Index

def process_issues_and_store_hybrid(issues, index: Index):
    """
    Process GitHub issues and comments, and store their embeddings in Pinecone using a hybrid approach.

    Args:
        repo_name (str): The name of the GitHub repository.
        owner (str): The owner of the GitHub repository.
        token (str): GitHub personal access token.
        index (Index): Pinecone index instance to store embeddings.

    Returns:
        None
    """

    # Initialize the OpenAI Embeddings model
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",openai_api_key=None)

    for issue in issues["data"]["repository"]["issues"]["edges"]:
        issue_node = issue["node"]
        issue_title = issue_node["title"]
        issue_body = issue_node["bodyText"]
        issue_url = issue_node["url"]
        issue_id = issue_node["number"]

        # Embed issue (title + body)
        issue_text = f"Title: {issue_title}\n\nBody: {issue_body}"
        issue_vector = embeddings.embed_query(issue_text)

        # Store issue embedding
        issue_metadata = {
            "type": "issue",
            "title": issue_title,
            "url": issue_url,
            "body": issue_body,
            "createdAt": issue_node["createdAt"],
            "number": issue_id
        }
        index.upsert([(str(issue_id), issue_vector, issue_metadata)])

        # Process and embed comments
        comments = issue_node["comments"]["edges"]
        comment_vectors = []
        comment_texts = []

        for comment in comments:
            comment_node = comment["node"]
            comment_text = comment_node["bodyText"]
            comment_author = comment_node["author"]["login"]
            comment_created_at = comment_node["createdAt"]

            # Skip empty or invalid comments
            if not comment_text or len(comment_text) < 10:
                continue

            # Embed comment
            comment_vector = embeddings.embed_query(comment_text)

            # Prepare comment metadata
            comment_metadata = {
                "type": "comment",
                "comment_text": comment_text,
                "author": comment_author,
                "createdAt": comment_created_at,
                "issue_id": issue_id,
                "issue_title": issue_title
            }
            comment_vectors.append((f"comment-{comment_created_at}", comment_vector, comment_metadata))
            comment_texts.append(f"- {comment_author}: {comment_text}")

        # Store comment embeddings
        if comment_vectors:
            index.upsert(comment_vectors)

        # Embed combined text (issue + comments)
        combined_text = f"Title: {issue_title}\n\nBody: {issue_body}\n\nComments:\n" + "\n".join(comment_texts)
        combined_vector = embeddings.embed_query(combined_text)

        # Store combined embedding
        combined_metadata = {
            "type": "combined",
            "title": issue_title,
            "url": issue_url,
            "body": issue_body,
            "createdAt": issue_node["createdAt"],
            "number": issue_id
        }
        index.upsert([(f"combined-{issue_id}", combined_vector, combined_metadata)])

    print("Issues, comments, and combined embeddings stored successfully!")


In [81]:
process_issues_and_store_hybrid(results, index=index)

Issues, comments, and combined embeddings stored successfully!


In [118]:
def hybrid_search(query_text, index, top_k=5):
    """
    Perform a hybrid search across issues, comments, and combined embeddings.

    Args:
        query_text (str): The search query.
        index (Index): Pinecone index instance to query.
        top_k (int): Number of top results to retrieve.

    Returns:
        dict: Results grouped by type (issues, comments, combined).
    """
    # Embed query
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",openai_api_key=None)
    query_vector = embeddings.embed_query(query_text)

    # Query the index for each type
    results = {}
    for type_filter in ["issue","comment"]:
        # results[type_filter] = index.query(
        #     query_vector,
        #     top_k=top_k,
        #     include_metadata=True,
        #     filter={"type": type_filter}
        # )["matches"]
        results[type_filter] = index.query(
            vector=query_vector,
            top_k=1,
            include_metadata=True,
            filter={"type": type_filter}
        )["matches"]
    # results = index.query(
    # vector=query_vector,
    # top_k=3,
    # include_values=True,
    # filter={"type": "comment"}
    # )

    return results

In [94]:
outputs = hybrid_search(query_text="for this issue about Make binary display method parameters, have it resolved from the comment?", index=index)
# Display results
for category, matches in outputs.items():
    print(f"Results for {category}:")
    for m in matches:
        print(f" - Title: {m['metadata'].get('title')}")
        print(f"   URL: {m['metadata'].get('url')}")
        print(f"   Score: {m['score']}\n")


Results for issue:
 - Title: MNT Make binary display method parameters' order consistent
   URL: https://github.com/scikit-learn/scikit-learn/issues/30717
   Score: 0.757070601

Results for comment:
 - Title: None
   URL: None
   Score: 0.767722905

Results for combined:
 - Title: MNT Make binary display method parameters' order consistent
   URL: https://github.com/scikit-learn/scikit-learn/issues/30717
   Score: 0.736943364



In [119]:
from openai import OpenAI

def generate_response_from_retrieval(query, retrieval_results):
    """
    Generate a user-friendly response based on retrieval results.

    Args:
        query (str): User's query.
        retrieval_results (dict): Grouped retrieval results by categories.

    Returns:
        str: Generated response.
    """
    # Start building the context
    context = f"User Query: {query}\n\n"

    # Process each category in the retrieval results
    for category, results in retrieval_results.items():
        if results:  # Only include non-empty categories
            context += f"### {category.capitalize()}s:\n"
            for idx, match in enumerate(results, start=1):
                metadata = match.get("metadata", {})
                title = metadata.get("title", "N/A")
                body = metadata.get("body", "N/A")
                comment_text = metadata.get("comment_text", "No comments available")
                url = metadata.get("url", "No URL provided")
                created_at = metadata.get("createdAt", "Unknown date")

                # Add details for each result
                context += f"{idx}. Title: {title}\n"
                context += f"   Created At: {created_at}\n"
                if category == "comment":
                    context += f"   Comment: {comment_text}\n"
                else:
                    context += f"   Description: {body}\n"
                context += f"   URL: {url}\n\n"

    # Combine context into a prompt
    prompt = f"""
    Context:
    {context}

    Based on the above information, provide a helpful response to the user's query.
    """

    # Generate response using OpenAI GPT
    # response = openai.ChatCompletion.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "system", "content": "You are a helpful assistant."},
    #         {"role": "user", "content": prompt}
    #     ]
    # )
    client = OpenAI(api_key=None)

    completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ])

    return completion.choices[0].message.content



In [123]:
query = "for this issue about OpenML, have it resolved from the comment? could you also provide me the issue link?"
retrieval_results=hybrid_search(query, index)
grouped_results = generate_response_from_retrieval(query,retrieval_results)
print(grouped_results)
# response = generate_response_from_grouped_results(query, grouped_results)
# print(response)




It appears that the issue regarding making scikit-learn's OpenML integration more generic for data download URLs has not yet been resolved based on the available comment. The comment suggests that the URLs in the dataset description should correctly point to the respective files (ARFF or Parquet), but doesn't confirm if there are any adjustments needed in scikit-learn's code or if it's purely an issue with OpenML's dataset links.

For more information or to check the current status of the issue, you can visit the issue page on GitHub using the following link: [Scikit-learn Issue #30699](https://github.com/scikit-learn/scikit-learn/issues/30699). If you have specific concerns or need more assistance, it may be helpful to engage directly on that issue page or in the related discussion.
