In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
name = "bigcode/starcoder2-15b_16k"
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.bfloat16).cuda()
tokenizer = AutoTokenizer.from_pretrained(name)

In [4]:
def make_prompt(ins, code, title="Changes to exercise1"):
    header = f"""<pr>Title: {title}
username_0: This PR resolves the following request.
{ins}<pr_status>opened<repo_name>code-editing/python-exercises<pr_base><pr_file>/problems/exercise1.py<pr_base_code>"""
    base_code = f"""<pr_base_code>{code}<pr_diff><pr_file>/problems/exercise1.py<pr_diff_hunk>"""
    prompt = header + base_code
    return prompt

code = """def add(x, y):
    return x + y"""

prompt = make_prompt("Add a function called `substract` that subtracts two numbers", code)
toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

In [5]:
import datasets
ds = datasets.load_dataset("nuprl/CanItEdit", split="test")

Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.49k/2.49k [00:00<00:00, 21.0MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 136k/136k [00:00<00:00, 483kB/s]
Generating test split: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 2626.52 examples/s]


In [6]:
# 43 is two hunks
ex = ds[22]
before = ex["before"]
ins = ex["instruction_descriptive"]
prompt = make_prompt(ins, before)
toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

In [7]:
outs = model.generate(toks, max_new_tokens=2000, temperature=0, top_p=0.95, do_sample=False)[0]
print(len(outs))
print(len(toks))
outs = outs[len(toks[0]):]
print(len(outs))
dec = tokenizer.decode(outs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2181
1
2000


In [8]:
pr_end_toks = [
    "<pr_review>", "<pr_comment>", "<pr>", "<pr_review_comment>"
]
# <pr_review>, <pr_comment>, <pr>


In [9]:
for tok in pr_end_toks:
    found = dec.find(tok)
    if found != -1:
        dec = dec[:found]
print(dec)

@@ -1,9 +1,18 @@
 import numpy as np
 
 class MarkovChain:
 
     def create_transition_matrix(self, matrix):
         
         matrix = np.array(matrix)
         column_sums = np.sum(matrix, axis=0)
         normalized_matrix = matrix / column_sums
-        return normalized_matrix.tolist()
+        return normalized_matrix.tolist() 
+
+    def translate_from_list(self, adj_list: Dict[int, List[int]]) -> List[List[float]]:
+        matrix = np.zeros((len(adj_list), len(adj_list)))
+        for i in adj_list:
+            for j in adj_list[i]:
+                matrix[i][j] = 1
+        column_sums = np.sum(matrix, axis=0)
+        normalized_matrix = matrix / column_sums
+        return normalized_matrix.tolist()



In [10]:
hunks = dec.split("<pr_diff_hunk>")

In [11]:
for hunk in hunks:
    print(hunk)

@@ -1,9 +1,18 @@
 import numpy as np
 
 class MarkovChain:
 
     def create_transition_matrix(self, matrix):
         
         matrix = np.array(matrix)
         column_sums = np.sum(matrix, axis=0)
         normalized_matrix = matrix / column_sums
-        return normalized_matrix.tolist()
+        return normalized_matrix.tolist() 
+
+    def translate_from_list(self, adj_list: Dict[int, List[int]]) -> List[List[float]]:
+        matrix = np.zeros((len(adj_list), len(adj_list)))
+        for i in adj_list:
+            for j in adj_list[i]:
+                matrix[i][j] = 1
+        column_sums = np.sum(matrix, axis=0)
+        normalized_matrix = matrix / column_sums
+        return normalized_matrix.tolist()



In [12]:
print(before)

import numpy as np

class MarkovChain:

    def create_transition_matrix(self, matrix):
        
        matrix = np.array(matrix)
        column_sums = np.sum(matrix, axis=0)
        normalized_matrix = matrix / column_sums
        return normalized_matrix.tolist()


In [None]:
prompt = f"""<issue_start>username_0: I have a program in Python that I'd like to change.

Here is the code for the program:
```py
def add(a, b):
    return a + b
```

Add a "sub" function that subtracts two numbers. Also write docstrings for both functions and change a,b to x,y.

Please someone help me. Can you also provide the full code with the change?<issue_comment>username_1: Sure, no problem. I will be able to help. I am an expert in editing Python code.

Here is the full code with the change:
```py
def add(x, y):
    \"\"\"Adds two numbers.\"\"\"
    return x + y

    def sub(x, y):
    \"\"\"Subtracts two numbers.\"\"\"
    return x - y
```
Upvotes: 200<issue_comment>username_0: Thank you so much! I have another program in Python that I'd like to change.

Here is the code for the program:
```py
{before}
```

{ins}

Please someone help me. Can you also provide the full code with the change?
Upvotes: 100<issue_comment>username_1: Sure, no problem. I will be able to help. I am an expert in editing Python code.

Here is the full code with the change:
```py"""

toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
gen = model.generate(toks, max_new_tokens=1200)[0]
dec = tokenizer.decode(gen[len(toks[0]):])
print(dec.split("```")[0])
# print(dec)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
