In [27]:
# Install dependencies
!pip install transformers torch torchvision torchaudio
!pip install accelerate
!pip install -U bitsandbytes  # For model optimization (we are gonna use 4 bit quantization)
!pip install datasets  # fine-tune later

# For better GPU utilization
!pip install flash-attn --no-build-isolation



In [25]:
# Check if we have GPU
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU: Tesla T4


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-3B-Instruct"  # good balance for Colab; swap to 7B if GPU allows

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config, torch_dtype="auto"
)  # 4-bit auto placement

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [2]:
sample_input = {
    "code_snippet": """
    def get_active_users(users):
    results = []
    for u in users:
        if u.is_active == True and u.profile_complete == True:
            results.append(u)
    return results""",

    "review_comments": [
        "This is inefficient. Don't loop twice conceptually.",
        "Variable 'u' is a bad name.",
        "Boolean comparison '== True' is redundant."
    ]
}

print("📝 Sample data ready:")
print(f"Code: {sample_input['code_snippet'][:50]}...")
print(f"Comments: {len(sample_input['review_comments'])} comments")

📝 Sample data ready:
Code: 
    def get_active_users(users):
    results = []...
Comments: 3 comments


In [3]:
SYSTEM = (
  "Act as an empathetic senior developer mentor who rewrites blunt code review comments "
  "into supportive, educational guidance with clear 'why' and a concrete code example; "
  "respond ONLY with a single well-formatted Markdown report following the sections described."  # structure control
)

In [36]:
def generate_report(code_snippet: str, review_comments: list[str]) -> str:
    # Build one composite prompt so the model can reason holistically about tone and consistency
    user_content = f"""
    You are given a code snippet and a list of direct, critical review comments.

    Task:
    - For each original comment, produce a JSON object with the following keys:
      1) "original_comment"
      2) "positive_rephrasing"
      3) "why"
      4) "suggested_improvement" (concrete code)
      5) "references" (to articles pertaining to the code)

    Constraints:
    - Output a single JSON object containing a list of these objects.
    - Keep tone empathetic and specific.
    - Where relevant to Python, follow PEP 8 conventions (e.g., naming, boolean checks) and note the principle briefly.

    Code:
    {code_snippet}

    Review comments:
    {chr(10).join(f"- {{c}}" for c in review_comments)}
    """

    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user_content.strip()},
    ]

    text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok([text], return_tensors="pt").to(model.device)

    out_ids = model.generate(
        **inputs,
        max_new_tokens=800,
        temperature=0.2,     # low temp for factual, stable outputs
        top_p=0.9,
        do_sample=False,     # deterministic for reliability
        repetition_penalty=1.05,
    )
    gen = tok.batch_decode(out_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return gen

In [None]:
import json

# Quick sanity check
code_snippet = """def get_active_users(users):
    results = []
    for u in users:
        if u.is_active == True and u.profile_complete == True:
            results.append(u)
    return results
"""
review_comments = [
    "This is inefficient. Don't loop twice conceptually.",
    "Variable 'u' is a bad name.",
    "Boolean comparison '== True' is redundant.",
]
generated_reports_json_string = generate_report(code_snippet, review_comments)[0] # generate_report returns a list with one JSON string

In [39]:
print(generated_reports_json_string)

```json
[
  {
    "original_comment": "- The condition should be `u.is_active` instead of `u.is_active == True`.",
    "positive_rephrasing": "- Consider using `u.is_active` directly without the equality check.",
    "why": "- Using `u.is_active` directly is more concise and avoids unnecessary type conversion.",
    "suggested_improvement": "- Change the condition to: `if u.is_active and u.profile_complete:`",
    "references": "- [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)"
  },
  {
    "original_comment": "- The condition should be `u.profile_complete` instead of `u.profile_complete == True`.",
    "positive_rephrasing": "- Use `u.profile_complete` directly without the equality check.",
    "why": "- Directly using `u.profile_complete` is more readable and avoids unnecessary type conversion.",
    "suggested_improvement": "- Change the condition to: `if u.is_active and u.profile_complete:`",
    "references": "- [PEP 8: Naming Conventions](https:

In [40]:
import json, re

def parse_json_report(s: str):
    # 1) Try direct parse
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        pass

    # 2) Strip Markdown code fences like ``````
    s2 = re.sub(r"^\s*```json", "", s) # Added "" as replacement and s as string
    s2 = re.sub(r"\s*```\s*$", "", s2) # Added "" as replacement and s2 as string
    s2 = s2.strip()
    try:
        return json.loads(s2)
    except json.JSONDecodeError:
        pass

    # 3) Fallback: extract the largest JSON object
    start = s2.find("{")
    end = s2.rfind("}")
    if start != -1 and end != -1 and end > start:
        try:
            return json.loads(s2[start:end+1])
        except json.JSONDecodeError:
            pass


    # If still failing, raise the original error
    raise ValueError("Could not locate a valid JSON object in the string.")
generated_reports_data = parse_json_report(generated_reports_json_string)

In [43]:
print(generated_reports_data, sep='\n')

[{'original_comment': '- The condition should be `u.is_active` instead of `u.is_active == True`.', 'positive_rephrasing': '- Consider using `u.is_active` directly without the equality check.', 'why': '- Using `u.is_active` directly is more concise and avoids unnecessary type conversion.', 'suggested_improvement': '- Change the condition to: `if u.is_active and u.profile_complete:`', 'references': '- [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)'}, {'original_comment': '- The condition should be `u.profile_complete` instead of `u.profile_complete == True`.', 'positive_rephrasing': '- Use `u.profile_complete` directly without the equality check.', 'why': '- Directly using `u.profile_complete` is more readable and avoids unnecessary type conversion.', 'suggested_improvement': '- Change the condition to: `if u.is_active and u.profile_complete:`', 'references': '- [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)'}, {'origi

In [45]:
for report in generated_reports_data:
  print("Original Commment")
  print(report['original_comment'])
  print("positive rephrasing")
  print(report['positive_rephrasing'])
  print("Why")
  print(report['why'])
  print("Suggested Improvement")
  print(report['suggested_improvement'])
  print("references")
  print(report['references'])
  print("__________________________________________________")
  print()


Original Commment
- The condition should be `u.is_active` instead of `u.is_active == True`.
positive rephrasing
- Consider using `u.is_active` directly without the equality check.
Why
- Using `u.is_active` directly is more concise and avoids unnecessary type conversion.
Suggested Improvement
- Change the condition to: `if u.is_active and u.profile_complete:`
references
- [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)
__________________________________________________

Original Commment
- The condition should be `u.profile_complete` instead of `u.profile_complete == True`.
positive rephrasing
- Use `u.profile_complete` directly without the equality check.
Why
- Directly using `u.profile_complete` is more readable and avoids unnecessary type conversion.
Suggested Improvement
- Change the condition to: `if u.is_active and u.profile_complete:`
references
- [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)
__________________

In [50]:
from IPython.display import display, Markdown
import json

markdown_output = ""
for item in generated_reports_data: # Iterate directly over the list
    markdown_output += f"**Original Comment:** {item.get('original_comment', 'N/A')}\n\n"
    markdown_output += f"**Positive Rephrasing:** {item.get('positive_rephrasing', 'N/A')}\n\n"
    markdown_output += f"**Why:** {item.get('why', 'N/A')}\n\n"
    markdown_output += f"**Suggested Improvement:**\n```python\n{item.get('suggested_improvement', 'N/A')}\n```\n\n"
    markdown_output += f"**References:** {item.get('references', 'N/A')}\n\n---\n\n"
    markdown_output += f"\n\n"


display(Markdown(markdown_output))

**Original Comment:** - The condition should be `u.is_active` instead of `u.is_active == True`.

**Positive Rephrasing:** - Consider using `u.is_active` directly without the equality check.

**Why:** - Using `u.is_active` directly is more concise and avoids unnecessary type conversion.

**Suggested Improvement:**
```python
- Change the condition to: `if u.is_active and u.profile_complete:`
```

**References:** - [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)

---



**Original Comment:** - The condition should be `u.profile_complete` instead of `u.profile_complete == True`.

**Positive Rephrasing:** - Use `u.profile_complete` directly without the equality check.

**Why:** - Directly using `u.profile_complete` is more readable and avoids unnecessary type conversion.

**Suggested Improvement:**
```python
- Change the condition to: `if u.is_active and u.profile_complete:`
```

**References:** - [PEP 8: Naming Conventions](https://peps.python.org/pep-0008/#naming-conventions)

---



**Original Comment:** - The function can be simplified by removing the redundant check.

**Positive Rephrasing:** - Simplify the function by removing the redundant check.

**Why:** - Removing the redundant check makes the logic clearer and reduces complexity.

**Suggested Improvement:**
```python
- Change the function to: `def get_active_users(users): results = [u for u in users if u.is_active and u.profile_complete] return results`
```

**References:** - [Python Enhancement Proposals (PEPs)](https://www.python.org/dev/peps/pep-0008/#id3)

---



