In [14]:
from openai import OpenAI
import json

client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# model = "phi-4"
# model = "qwen2.5-14b-instruct-mlx"
model = "meta-llama-3.1-8b-instruct"
comment = """
* Azure Blob File System implementation of AbstractFileSystem.
* This impl delegates to the old FileSystem
"""
format = {
    "type": "json_schema",
    "json_schema": {
        "name": "Segmented and Classified Code Comment",
        "schema": {
            "type": "object",
            "properties": {
    "summary": {
      "type": "array",
      "description": "Comment describes a brief description of the code. It answers the 'what' of the code.",
      "items": {
        "type": "string"
      }
    },
    "expand": {
      "type": "array",
      "description": "Comment provides more details about the code's behavior. It answers the 'how' of the code.",
      "items": {
        "type": "string"
      }
    },
    "rationale": {
      "type": "array",
      "description": "Comment explains the reasoning behind certain choices, patterns, or options in the code. It answers the 'why' of the code.",
      "items": {
        "type": "string"
      }
    },
    "deprecation": {
      "type": "array",
      "description": "Comment contains explicit warnings regarding deprecated artifacts, alternative suggestions, or future deprecation notes (including tags like @deprecated, @version, or @since).",
      "items": {
        "type": "string"
      }
    },
    "usage": {
      "type": "array",
      "description": "Comment includes explicit suggestions, use cases, examples, or code snippets aimed at the user (often marked with metadata such as @usage, @param, or @return).",
      "items": {
        "type": "string"
      }
    },
    "ownership": {
      "type": "array",
      "description": "Comment details authorship, credentials, or external references about the developers (e.g., using the @author tag).",
      "items": {
        "type": "string"
      }
    },
    "pointer": {
      "type": "array",
      "description": "Comment contains references to linked resources, external references, or tags such as @see, @link, @url, or even identifiers like FIX #2611.",
      "items": {
        "type": "string"
      }
    },
    "other": {
      "type": "array",
      "description": "for any comment that doesn't fit any other comment type.",
      "items": {
        "type": "string"
      }
    }
  },
            "required": []
        },
    }
}

# Define the conversation with the AI
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": f"""Segment and classify the following code comment into a given taxonomy of code comment categories (summary, expand, rationale, deprecation, usage, ownership, pointer, other).
Here's a brief description of each category:

- **summary**: The comment describes a brief description of the code. It answers the "what" of the code.
- **expand**: Similar to the summary category, this label indicates that the comment provides a more detailed description of the code. It answers the "how" of the code.
- **rationale**: The comment explains the reasoning behind certain choices, patterns, or options in the code. It answers the "why" of the code.
- **deprecation**: The comment contains explicit warnings regarding deprecated interface artifacts. It includes information about alternative methods or classes (e.g., “do not use [this]”, “is it safe to use?” or “refer to: [ref]”), future deprecation plans, or scheduled changes. Tags like @version, @deprecated, or @since may also be present.
- **usage**: The comment offers explicit suggestions, examples, or use cases for users planning to use a functionality. It might include code snippets or metadata marks such as @usage, @param, or @return.
- **ownership**: The comment identifies the authors or ownership details, possibly including external references or credentials (commonly marked with @author).
- **pointer**: The comment contains references to linked resources, using tags like @see, @link, or @url, or even identifiers such as “FIX #2611” or “BUG #82100.”
- **other**: Use this category for any comment that doesn't fit into any of the above types.

The comment could in its entirety (or parts of it), belong to none, one, multiple, or every category. Any segment of the given text should not be classified to more than one category. If no category fits, use the 'other' category.
Here's the comment:
\"\"\"
{comment}
\"\"\""""}
]

# Get response from AI
response = client.chat.completions.create(
    model="phi-4",
    messages=messages,
    response_format=format,
)

# Parse and display the results
content = response.choices[0].message.content

try:
    results = json.loads(content)
except json.JSONDecodeError as e:
    results = {
        "error": str(e),
        "response": content
    }

print(json.dumps(results, indent=2))

{
  "summary": [
    "Azure Blob File System implementation of AbstractFileSystem."
  ],
  "expand": [],
  "rationale": [
    "This impl delegates to the old FileSystem"
  ],
  "deprecation": [],
  "usage": [],
  "ownership": [],
  "pointer": [],
  "other": []
}


In [15]:
from openai import OpenAI
import json

client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

def classify(comment: str, model_id: str):
    messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": f"""Segment and classify the following code comment into a given taxonomy of code comment categories (summary, expand, rationale, deprecation, usage, ownership, pointer, other).
Here's a brief description of each category:

- **summary**: The comment describes a brief description of the code. It answers the "what" of the code.
- **expand**: Similar to the summary category, this label indicates that the comment provides a more detailed description of the code. It answers the "how" of the code.
- **rationale**: The comment explains the reasoning behind certain choices, patterns, or options in the code. It answers the "why" of the code.
- **deprecation**: The comment contains explicit warnings regarding deprecated interface artifacts. It includes information about alternative methods or classes (e.g., “do not use [this]”, “is it safe to use?” or “refer to: [ref]”), future deprecation plans, or scheduled changes. Tags like @version, @deprecated, or @since may also be present.
- **usage**: The comment offers explicit suggestions, examples, or use cases for users planning to use a functionality. It might include code snippets or metadata marks such as @usage, @param, or @return.
- **ownership**: The comment identifies the authors or ownership details, possibly including external references or credentials (commonly marked with @author).
- **pointer**: The comment contains references to linked resources, using tags like @see, @link, or @url, or even identifiers such as “FIX #2611” or “BUG #82100.”
- **other**: Use this category for any comment that doesn't fit into any of the above types.

The comment could in its entirety (or parts of it), belong to none, one, multiple, or every category. Any segment of the given text should not be classified to more than one category. If no category fits, use the 'other' category.
Here's the comment:
\"\"\"
{comment}
\"\"\""""}
]

    # Get response from AI
    response = client.chat.completions.create(
        model=model_id,
        messages=messages,
        response_format=format,
    )

    # Parse and display the results
    content = response.choices[0].message.content

    try:
        results = json.loads(content)
    except json.JSONDecodeError as e:
        results = {
            "error": str(e),
            "response": content
        }
    return results

In [16]:
c = """* Test getFileStatus and related listing operations.
| the tree parameters. Kept small to avoid killing object store test| runs too much.|
* Accept everything.
| the tree parameters. Kept small to avoid killing object store test| runs too much.|
* Accept nothing.
| the tree parameters. Kept small to avoid killing object store test| runs too much.|
* Path filter which only expects paths whose final name element
* equals the {@code match} field.
| the tree parameters. Kept small to avoid killing object store test| runs too much.|
* A filesystem filter which exposes the protected method
* {@link #listLocatedStatus(Path, PathFilter)}."""
r = classify(comment=c, model_id="phi-4")


In [17]:
print(json.dumps(r, indent=2))

{
  "summary": [
    "* Test getFileStatus and related listing operations."
  ],
  "rationale": [
    "| the tree parameters. Kept small to avoid killing object store test| runs too much.|",
    "* Path filter which only expects paths whose final name element",
    "| the tree parameters. Kept small to avoid killing object store test| runs too much.|"
  ],
  "pointer": [
    "* A filesystem filter which exposes the protected method",
    "* {@link #listLocatedStatus(Path, PathFilter)}."
  ],
  "other": [
    "* Accept everything.",
    "* Accept nothing."
  ]
}


In [15]:
from java import java_types
import pandas as pd
import json
from tqdm import tqdm
from similarity import find_most_similar

data_path = "java_0_raw.csv"

print(f"reading {data_path}..")
df = pd.read_csv(data_path)
df.columns = df.columns.str.lower()
df.rename(columns={'rational': 'rationale'}, inplace=True)
not_found = []
for type in java_types:
    found = False
    for col in df.columns:
        if type == col:
            found = True
            break
    if not found:
        not_found.append(type)
print(f"not found: {not_found}")


for index, row in tqdm(df.iterrows(), total=len(df)):
    class_name = row["class"]
    comment = row["comment"]
    comment_lines = comment.split("\n")
    # trying to associate each line with each category
    associations_by_type = {}
    lines_found_at_least_once = set()
    all_candidate_lines = set()
    for type in java_types:
        if type == "other":
            continue
        associations_by_type[type] = []
    for type in java_types:
        if type == "other":
            continue
        if not isinstance(row[type], str):
            continue
        type_comments = row[type].split("\n")
        for type_line in type_comments:
            all_candidate_lines.add(type_line)
            for line in comment_lines:
                if line.strip() == type_line.strip():
                    associations_by_type[type].append(line)
                    lines_found_at_least_once.add(line)
    missing_lines = set(comment_lines) - lines_found_at_least_once
    missing_info = []
    all_scores = []
    if missing_lines:
        # print(f"{len(missing_lines)} lines missing!")
        for missing_line in missing_lines:
            # print(f"{missing_line}")
            most_similar_line, score = find_most_similar(missing_line, [str(s) for s in all_candidate_lines])
            # print(f"most similar: {most_similar_line} [{score:.4f}]")
            all_scores.append(score)
            missing_info.append({
                "source": missing_line,
                "similar": most_similar_line,
                "score": score,
            })





reading java_0_raw.csv..
not found: ['other']


 24%|██▍       | 91/376 [06:31<20:24,  4.30s/it]  


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x768 and 0x1)

In [16]:
from bench2 import get_json_format
from java import java_types
import json

f = get_json_format(java_types)
print(json.dumps(f, indent=2))

{
  "type": "json_schema",
  "json_schema": {
    "name": "line_classifications",
    "schema": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "id": {
            "type": "integer",
            "description": "A unique identifier for the line."
          },
          "category": {
            "type": "string",
            "description": "The classification category for the line.",
            "enum": [
              "summary",
              "expand",
              "rationale",
              "deprecation",
              "usage",
              "ownership",
              "pointer",
              "other"
            ]
          }
        },
        "required": [
          "id",
          "category"
        ],
        "additionalProperties": false
      }
    }
  }
}
