In [13]:
import github3
from dotenv import load_dotenv, dotenv_values
import libhoney
import os

load_dotenv()

github = github3.login(token=os.getenv('GITHUB_TOKEN'))
libhoney.init(writekey=os.getenv('HONEYCOMB_API_KEY'), dataset='otel-github-issues')

In [14]:
js_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-js', state='open')
js_issues_filtered = [issue for issue in js_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
js_issues_json = [issue.as_json() for issue in js_issues_filtered]

In [15]:
js_contrib_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-js-contrib', state='open')
js_contrib_issues_filtered = [issue for issue in js_contrib_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
js_contrib_issues_json = [issue.as_json() for issue in js_contrib_issues_filtered]

In [16]:
py_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-python', state='open')
py_issues_json = [issue.as_json() for issue in py_issues]

In [17]:
py_contrib_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-python-contrib', state='open')
py_contrib_issues_json = [issue.as_json() for issue in py_contrib_issues]

In [18]:
java_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-java', state='open')
java_issues_filtered = [issue for issue in java_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
java_issues_json = [issue.as_json() for issue in java_issues_filtered]

In [19]:
java_contrib_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-java-contrib', state='open')
java_contrib_issues_filtered = [issue for issue in java_contrib_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
java_contrib_issues_json = [issue.as_json() for issue in java_contrib_issues_filtered]

In [20]:
java_instrumentation_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-java-instrumentation', state='open')
java_instrumentation_issues_filtered = [issue for issue in java_instrumentation_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
java_instrumentation_issues_json = [issue.as_json() for issue in java_instrumentation_issues_filtered]

In [21]:
go_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-go', state='open')
go_issues_filtered = [issue for issue in go_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
go_issues_json = [issue.as_json() for issue in go_issues_filtered]

In [22]:
go_contrib_issues = github.issues_on(username='open-telemetry', repository='opentelemetry-go-contrib', state='open')
go_contrib_issues_filtered = [issue for issue in go_contrib_issues if "renovate-bot" not in issue.user.login and "forking-renovate[bot]" not in issue.user.login]
go_contrib_issues_json = [issue.as_json() for issue in go_contrib_issues_filtered]

In [78]:
from openai import OpenAI
from pydantic import BaseModel, Field, model_validator, FieldValidationInfo
from typing import List
import instructor
import concurrent.futures
from tqdm import tqdm
import re
from loguru import logger

client = instructor.from_openai(OpenAI())

class Library(BaseModel):
    name: str = Field(..., description="A single library mentioned in the issue title or body.")
    library_source: list[str] = Field(..., description="A quoted string from the issue title or body that mentions the library.")

    @model_validator(mode="after")
    def validate_library_source(self, info: FieldValidationInfo) -> "Library":
        if info.context is None:
            raise ValueError("text_chunk is required in the validation context.")
        chunks = info.context.get("text_chunk", None)
        spans = list(self.get_spans(chunks))
        logger.info(f"Found {len(spans)} spans for {len(self.library_source)} library sources.")
        self.library_source = [chunks[span[0]:span[1]] for span in spans]
        return self
    
    def get_spans(self, context):
        for quote in self.library_source:
            yield from self._get_span(quote, context)
    
    def _get_span(self, quote, context):
        for match in re.finditer(re.escape(quote), context):
            yield match.span()


class Issue(BaseModel):
    title: str = Field(..., description="The title of the issue.")
    id: int = Field(..., description="The ID of the issue.")
    url: str = Field(..., description="The URL of the issue.")
    libraries: list[Library] = Field(..., description="A list of libraries mentioned in the issue title or body, in lower case. Each library has a name and a list of quoted strings from the issue title or body that mention the library.")
    detected_cloud_providers: List[str] = Field(..., description="A list of cloud providers mentioned in the issue, or inferred from the issue body.")
    body: str = Field(..., description="The body of the issue.")
    body_summary: str = Field(..., description="A 500 character summary of the issue body.")
    comment_count: int = Field(..., description="The number of comments on the issue.")
    updated_at: str = Field(..., description="The date and time the issue was last updated.")
    positive_reactions: int = Field(..., description="The number of positive reactions on the issue.")
    negative_reactions: int = Field(..., description="The number of negative reactions on the issue.")
    inferred_sentiment: str = Field(..., description="The sentiment of the issue body (one of 'positive', 'negative', or 'neutral').")
    source_repo: str = Field(..., description="The name of the repository where the issue was opened.")
    labels: List[str] = Field(..., description="A list of labels on the issue.")

    @model_validator(mode="after")
    def validate_libraries(self) -> "Issue":
        logger.info(f"Validating {len(self.libraries)} libraries.")
        self.libraries = [library for library in self.libraries if len(library.library_source) > 0]
        logger.info(f"Filtered down to {len(self.libraries)} libraries.")
        return self

responses = []

def process_issue(issue):
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": "You are a system that parses GitHub issues in JSON format and extracts data from them."
            },
            {
                "role": "user",
                "content": issue
            }
        ],
        response_model=Issue,
        validation_context={"text_chunk": issue},
    )
    return resp

# Expects a list of JSON objects, each representing a GitHub issue
def run_issue_process(issues):
    responses = []
    total_tasks = len(issues)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue, json_content): json_content for json_content in issues}
        completed_tasks = 0

        for future in tqdm(concurrent.futures.as_completed(futures), total=total_tasks):
            try:
                response = future.result()
                responses.append(response)
            except Exception as exc:
                print(f'Generated an exception: {exc}')
            completed_tasks += 1
    return responses

In [None]:
for issue in py_contrib_issues_json:
    resp = process_issue(issue)
    print(resp)

In [None]:
#js_responses = run_issue_process(js_issues_json)
#js_contrib_responses = run_issue_process(js_contrib_issues_json)
#py_responses = run_issue_process(py_issues_json)
#py_contrib_responses = run_issue_process(py_contrib_issues_json)
#java_responses = run_issue_process(java_issues_json)
java_contrib_responses = run_issue_process(java_contrib_issues_json)
#java_instrumentation_responses = run_issue_process(java_instrumentation_issues_json)
#go_responses = run_issue_process(go_issues_json)
#go_contrib_responses = run_issue_process(go_contrib_issues_json)


In [64]:
import libhoney

def send_to_honeycomb(response):
    evt = libhoney.Event()
    for field, value in response.dict().items():
        if isinstance(value, list):
            for item in value:
                if isinstance(item, list):
                    for library in item:
                        for library_field, library_value in library.dict().items():
                            evt.add_field(library_field, library_value)
                else:
                    evt.add_field(field, item)
        else:
            evt.add_field(field, value)
    evt.send()

for response in java_contrib_responses:
    send_to_honeycomb(response)

    

In [79]:
import pprint

issue_id = 1071554210
issue = next((issue for issue in java_contrib_issues_filtered if issue.id == issue_id), None)
pprint.pp(issue.as_dict())
issue_json = issue.as_json()
res = process_issue(issue_json)
pprint.pp(res.dict())
send_to_honeycomb(res)

{'url': 'https://api.github.com/repos/open-telemetry/opentelemetry-java-contrib/issues/145',
 'repository_url': 'https://api.github.com/repos/open-telemetry/opentelemetry-java-contrib',
 'labels_url': 'https://api.github.com/repos/open-telemetry/opentelemetry-java-contrib/issues/145/labels{/name}',
 'comments_url': 'https://api.github.com/repos/open-telemetry/opentelemetry-java-contrib/issues/145/comments',
 'events_url': 'https://api.github.com/repos/open-telemetry/opentelemetry-java-contrib/issues/145/events',
 'html_url': 'https://github.com/open-telemetry/opentelemetry-java-contrib/issues/145',
 'id': 1071554210,
 'node_id': 'MDU6SXNzdWUxMDcxNTU0MjEw',
 'number': 145,
 'title': 'Support export via kafka',
 'user': {'login': 'ypatent',
          'id': 19263644,
          'node_id': 'MDQ6VXNlcjE5MjYzNjQ0',
          'avatar_url': 'https://avatars.githubusercontent.com/u/19263644?v=4',
          'gravatar_id': '',
          'url': 'https://api.github.com/users/ypatent',
          'htm

2024-07-26 15:44:09.958 | INFO     | __main__:validate_library_source:22 - Found 7 spans for 2 library sources.
2024-07-26 15:44:09.958 | INFO     | __main__:validate_library_source:22 - Found 6 spans for 2 library sources.
2024-07-26 15:44:09.959 | INFO     | __main__:validate_libraries:58 - Validating 2 libraries.
2024-07-26 15:44:09.959 | INFO     | __main__:validate_libraries:60 - Filtered down to 2 libraries.


['export via kafka', 'publish messages to kafka']
['export via kafka', 'publish messages to kafka']
['open telemetry collector project', 'open telemetry collector']
['open telemetry collector project', 'open telemetry collector']
{'title': 'Support export via kafka',
 'id': 1071554210,
 'url': 'https://github.com/open-telemetry/opentelemetry-java-contrib/issues/145',
 'libraries': [{'name': 'kafka',
                'library_source': ['publish messages to kafka',
                                   'publish messages to kafka',
                                   'publish messages to kafka',
                                   'publish messages to kafka',
                                   'publish messages to kafka',
                                   'publish messages to kafka',
                                   'export via kafka']},
               {'name': 'open telemetry',
                'library_source': ['open telemetry collector',
                                   'open telemetry 

In [74]:
send_to_honeycomb(res)