In [22]:
import pandas as pd
from tqdm import tqdm

In [23]:
# Load the predictions CSV for the n=0 run and preview its first rows.
# NOTE(review): the variable is called df15 although the file is the n=0 run;
# later cells read df15['patch'] / df15['predicted_msg'] — confirm the name is intentional.
df15 = pd.read_csv("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/n=0_pred_validdf.csv")
df15.head()

Unnamed: 0,index,patch,patch_context,original_msg,predicted_msg
0,0,"@@ -231,4 +231,8 @@ def setup_app(app):\n ...",@app.template_test('list')\n def _is_li...,Should we call it `is_list`?,The suggested code change defines a custom tem...
1,1,"@@ -44,7 +44,7 @@ namespace OpenTelemetry.Trac...",builder.AddSource(new Source(SqlCl...,"in the instrumentation example, should we use ...",The suggested code change adds a source to the...
2,2,"@@ -25,7 +25,7 @@ from scapy.modules.six.moves...",class Field(object):,Why this change ? Is it useful ?,"Sure, please provide the suggested code change..."
3,3,"@@ -0,0 +1,4 @@\n+const titleNode = virtualNod...",const titleNode = virtualNode.children.find(({...,"I know this is a nitpick, but don't we always ...",The suggested code change is intended to find ...
4,4,"@@ -37,6 +37,11 @@ public class EMailValidator...",/*\n Add tests for 4601\n ...,We should reformat this emails in the test to ...,The suggested code change adds test cases for ...


In [24]:
# Print the predicted review stored under index label 0 in full
# (the head() preview above truncates long cells).
print(df15['predicted_msg'][0])

The suggested code change defines a custom template filter named `_is_list` that checks if a given value is a list. This can be useful in templates where you need to conditionally render content based on whether a variable is a list or not.

**Review:**
- **Functionality:** The function correctly uses `isinstance()` to determine if the input `value` is a list.
- **Naming Convention:** The function name `_is_list` is a good choice as it clearly indicates its purpose.
- **Docstring:** A docstring is included, which is helpful for understanding the function's behavior.
- **Consistency:** The use of underscores in the function name and variable names is consistent with Python conventions.

Overall, this change enhances the functionality of the Flask application by providing a reusable test for checking list types in templates.


In [25]:
import json
import os
import re
from collections import defaultdict
from typing import List, Dict, Union, Any, Optional

import torch
from tqdm import tqdm

# Choose one of these based on your needs
from openai import OpenAI  # For GPT models
# For Magicoder or other transformer models
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Constants from original code

# System-level instruction for the judge. LLM_as_a_Judge sends it as the
# "system" chat message for GPT models and prepends it (plus a newline) to the
# prompt for text-generation pipeline models.
CODE_CHANGE_AND_REVIEW_SYSTEM_PROMPT = """You are a highly skilled software engineer who has a lot of experience reviewing code changes. Your task is to rate the relevance of any given code change"""

# Judge prompt template. It is filled with str.format and has exactly two
# placeholders: {code_change} and {review}. The template ends with
# "Your score: " so the judge is expected to answer with a rating from 1 to 5.
CODE_CHANGE_AND_REVIEW_JUDGE_PROMPT = """You will be asked to rate the relevance of reviews for given Python, Java or Javascript code changes. A relevant review is one which is both concise and comprehensive. A concise review contains very little text not related to the code change. A comprehensive review contains all the information about a code change that should be covered by a review. A relevant review is comprehensive while being concise.

Now look at the code change and review below and score the relevance of the review on a scale of 1 to 5

Code Change:
{code_change}

Review:
{review}

Your score: """

# Short language tag -> display name.
# NOTE(review): in the code visible here this map is only referenced from
# commented-out `lang=LANG_MAP[lang]` arguments, i.e. it is currently unused.
LANG_MAP = {
    "py": "Python",
    "js": "Javascript",
    "java": "Java",
}

class LLM_as_a_Judge:
    """Rate the relevance of a code review with an LLM acting as judge.

    Model names starting with ``"gpt"`` are served through the OpenAI chat
    completions client; any other name is loaded as a Hugging Face
    ``text-generation`` pipeline.

    Calling the instance returns ``(score / 5, prompt)`` where ``score`` is
    the 1-5 rating parsed from the model's reply and ``prompt`` is the filled
    judge prompt that was sent.
    """

    def __init__(self, model: str, api_key: Optional[str]=None):
        """Create the backing client for ``model``.

        Args:
            model: Judge model name. A ``"gpt"`` prefix selects the OpenAI
                client; anything else is passed to ``transformers.pipeline``.
            api_key: Forwarded to ``OpenAI(api_key=...)``; not used for
                pipeline models.
        """
        self.model = model
        if model.startswith("gpt"):
            self.client = OpenAI(api_key=api_key)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model)
            self.client = pipeline(
                model=model,
                task="text-generation",
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )

    @staticmethod
    def _parse_score(response: str) -> int:
        """Extract a 1-5 rating from the judge's reply.

        A reply that starts with a digit 1-5 is taken at face value. Otherwise
        the first standalone digit 1-5 anywhere in the reply is used, so
        answers such as ``"Score: 4"`` or ``"**4**"`` are no longer silently
        scored as 1. Replies with no usable digit fall back to 1.
        """
        text = response.strip()
        if text[:1] in ("1", "2", "3", "4", "5"):
            return int(text[0])
        # Heuristic fallback: a lone digit 1-5 that is not part of a longer
        # number or a decimal fraction (e.g. not the "4" in "0.4" or "14").
        found = re.search(r"(?<![\d.])[1-5](?!\d)", text)
        return int(found.group()) if found else 1

    def __call__(self, code_change: str, review: str):
        """Judge one (code change, review) pair.

        Args:
            code_change: The diff under review. For pipeline models only the
                first 5000 characters are put into the prompt.
            review: The review text to be rated.

        Returns:
            Tuple ``(score, prompt)``: ``score`` is the parsed rating divided
            by 5 (0.2 - 1.0) and ``prompt`` is the filled judge prompt.
        """
        if self.model.startswith("gpt"):
            inst_to_be_judged = CODE_CHANGE_AND_REVIEW_JUDGE_PROMPT.format(
                code_change=code_change,
                review=review
            )
            messages = [
                {"role": "system", "content": CODE_CHANGE_AND_REVIEW_SYSTEM_PROMPT},
                {"role": "user", "content": inst_to_be_judged}
            ]
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=messages
            )
            response = str(completion.choices[0].message.content).strip()
        else:
            inst_to_be_judged = CODE_CHANGE_AND_REVIEW_JUDGE_PROMPT.format(
                code_change=code_change[:5000],
                review=review
            )
            prompt = CODE_CHANGE_AND_REVIEW_SYSTEM_PROMPT + "\n" + inst_to_be_judged
            # Greedy decoding is requested with do_sample=False; a temperature
            # of 0.0 is not a valid sampling temperature for generation.
            result = self.client(
                prompt, max_new_tokens=5,
                num_return_sequences=1, do_sample=False
            )
            # The pipeline output echoes the prompt; keep only the first line
            # of the newly generated continuation.
            response = result[0]["generated_text"].replace(prompt,'').split("\n")[0].strip()

        score = self._parse_score(response)

        return score/5, inst_to_be_judged


def score_code_reviews(
    code_diffs: List[str], 
    code_reviews: List[str], 
    indices: Optional[List[int]] = None,
    system_names: Optional[List[str]] = None,
    model: str = "gpt-4o",
    api_key: Optional[str] = None,
    output_file: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Evaluate the relevance of code reviews for given code differences.

    Each (diff, review) pair is scored by an ``LLM_as_a_Judge`` instance.

    Args:
        code_diffs (List[str]): List of code changes/diffs
        code_reviews (List[str]): List of reviews to evaluate, aligned with ``code_diffs``
        indices (Optional[List[int]]): Optional index for each sample; defaults to 0..n-1
        system_names (Optional[List[str]]): Optional name of the system that generated
            each review; defaults to "model" for every sample
        model (str): Model to use as judge ('gpt-4o' or a HuggingFace model name)
        api_key (Optional[str]): API key forwarded to the judge (used for GPT models)
        output_file (Optional[str]): If given, each result is appended to this file
            as one JSON line. The file is opened in append mode, so re-running
            with the same path adds to the existing records instead of replacing them.

    Returns:
        List[Dict[str, Any]]: One dict per sample with keys
        "index", "system", "diff", "review", "score" and "prompt".

    Raises:
        ValueError: If ``code_reviews``, ``indices`` or ``system_names`` do not have
            the same length as ``code_diffs``.
    """
    n_samples = len(code_diffs)
    if len(code_reviews) != n_samples:
        raise ValueError("code_diffs, code_reviews must have the same length")

    # Fill in defaults, and reject mismatched lengths up front: zip() below
    # would otherwise silently drop the trailing samples.
    if indices is None:
        indices = list(range(n_samples))
    elif len(indices) != n_samples:
        raise ValueError("indices must have the same length as code_diffs")
    if system_names is None:
        system_names = ["model"] * n_samples
    elif len(system_names) != n_samples:
        raise ValueError("system_names must have the same length as code_diffs")

    # Initialize the judge only after the inputs are known to be consistent.
    judge = LLM_as_a_Judge(model=model, api_key=api_key)

    results = []

    for code_diff, review, idx, system in tqdm(
        zip(code_diffs, code_reviews, indices, system_names),
        total=n_samples
    ):
        # Get score from judge
        score, prompt = judge(code_change=code_diff, review=review)

        result = {
            "index": idx,
            "system": system,
            "diff": code_diff,
            "review": review,
            "score": score,
            "prompt": prompt
        }

        results.append(result)

        # Append immediately so records already scored are on disk even if a
        # later judge call fails.
        if output_file:
            with open(output_file, "a") as f:
                f.write(json.dumps(result) + "\n")

    return results


# Example usage
if __name__ == "__main__":

    # One diff and one generated review per row of the predictions frame.
    diffs = df15['patch']
    reviews = df15['predicted_msg']

    # Read the OpenAI key from the environment; never commit a literal key.
    # NOTE: a literal key used to be hardcoded at this spot. It must be treated
    # as exposed and revoked/rotated.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before running this cell"
        )

    # Score the reviews; each result is also appended to the JSONL file below.
    results = score_code_reviews(
        code_diffs=diffs,
        code_reviews=reviews,
        model="gpt-4o",  # or "ise-uiuc/Magicoder-S-DS-6.7B"
        api_key=api_key,
        output_file="review_scores_N_0.jsonl"
    )

    print(f"Evaluated {len(results)} reviews")
    for i, result in enumerate(results):
        print(f"Review {i+1} score: {result['score']:.2f}")

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [03:42<00:00,  2.23s/it]

Evaluated 100 reviews
Review 1 score: 0.80
Review 2 score: 0.80
Review 3 score: 0.20
Review 4 score: 0.80
Review 5 score: 0.80
Review 6 score: 0.40
Review 7 score: 0.80
Review 8 score: 0.80
Review 9 score: 0.80
Review 10 score: 0.80
Review 11 score: 0.40
Review 12 score: 0.80
Review 13 score: 0.60
Review 14 score: 0.80
Review 15 score: 0.40
Review 16 score: 0.80
Review 17 score: 0.20
Review 18 score: 0.80
Review 19 score: 0.80
Review 20 score: 0.80
Review 21 score: 0.60
Review 22 score: 0.20
Review 23 score: 0.80
Review 24 score: 0.60
Review 25 score: 0.80
Review 26 score: 0.80
Review 27 score: 0.80
Review 28 score: 1.00
Review 29 score: 0.40
Review 30 score: 0.80
Review 31 score: 0.80
Review 32 score: 0.80
Review 33 score: 0.60
Review 34 score: 0.80
Review 35 score: 0.80
Review 36 score: 0.60
Review 37 score: 0.80
Review 38 score: 0.80
Review 39 score: 0.80
Review 40 score: 0.80
Review 41 score: 0.80
Review 42 score: 0.60
Review 43 score: 0.80
Review 44 score: 0.60
Review 45 score: 0.




In [26]:
# Load the judge results of the N=15 run (JSON Lines, one record per line)
# and preview the first rows.
score15 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_15.jsonl", lines=True)
score15.head()

Unnamed: 0,index,system,diff,review,score,prompt
0,0,model,"@@ -231,4 +231,8 @@ def setup_app(app):\n ...",The suggested code changes appear to be relate...,0.4,You will be asked to rate the relevance of rev...
1,1,model,"@@ -44,7 +44,7 @@ namespace OpenTelemetry.Trac...",The suggested code change adds SQL client inst...,0.4,You will be asked to rate the relevance of rev...
2,2,model,"@@ -25,7 +25,7 @@ from scapy.modules.six.moves...",The suggested code change introduces a new cla...,0.2,You will be asked to rate the relevance of rev...
3,3,model,"@@ -0,0 +1,4 @@\n+const titleNode = virtualNod...",The suggested code change is intended to find ...,0.8,You will be asked to rate the relevance of rev...
4,4,model,"@@ -37,6 +37,11 @@ public class EMailValidator...",The provided test cases for `EMailValidator` a...,0.4,You will be asked to rate the relevance of rev...


In [27]:
# Load the judge results for the remaining N settings (N=15 is loaded in its
# own cell). Each file is JSON Lines, one scored review per line.
# NOTE(review): scores are multiples of 0.2, so a 100-row file must average to a
# multiple of 0.002; the N_1000 mean printed below (0.40776...) is not, which
# suggests that file holds extra rows — check for duplicates from append-mode writes.
score0= pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_0.jsonl", lines=True)
score5 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_5.jsonl", lines=True)
score10 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_10.jsonl", lines=True)
score50 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_50.jsonl", lines=True)
score100 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_100.jsonl", lines=True)
score200 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_200.jsonl", lines=True)
score500 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_500.jsonl", lines=True)
score1000 = pd.read_json("/home/mkapadni/work/crscore_plus_plus/sft/checking_patch_length/review_scores_N_1000.jsonl", lines=True)

In [28]:
# Print the average judge score of every dataset, in increasing N.
labelled_scores = [
    ("0", score0),
    ("5", score5),
    ("10", score10),
    ("15", score15),
    ("50", score50),
    ("100", score100),
    ("200", score200),
    ("500", score500),
    ("1000", score1000),
]
for label, frame in labelled_scores:
    # Header line first, then the mean of the 'score' column on its own line.
    print("Average score for " + label)
    print(frame['score'].mean())


Average score for 0
0.7079999999999997
Average score for 5
0.54
Average score for 10
0.51
Average score for 15
0.4720000000000001
Average score for 50
0.442
Average score for 100
0.426
Average score for 200
0.41400000000000015
Average score for 500
0.4020000000000001
Average score for 1000
0.40776699029126223
