In [1]:
from transformers import GemmaTokenizer, AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
gpu = torch.device('cuda:0')

In [3]:
model_id = "google/codegemma-1.1-7b-it"
#tokenizer = GemmaTokenizer.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=gpu, torch_dtype=torch.bfloat16)


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu_pytorch_tanh`, edit the `model.config` to set `hidden_activation=gelu_pytorch_tanh`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:

new_prompt = '''
Objective: Identify lines of code that might contain bugs.

Context: I have a codebase written in c++. I need to find lines that might contain bugs based on specific criteria.

Criteria for Identifying Potential Bugs:

    Syntax Errors: Look for lines with syntax errors.
    Logical Errors: Identify lines where the logic might be flawed (e.g., incorrect conditions, improper use of operators).
    Runtime Errors: Find lines that might cause runtime errors (e.g., null pointer dereferences, out-of-bound array access).
    Common Pitfalls: Check for common pitfalls in the language (e.g., off-by-one errors, improper resource management).
    Code Quality Issues: Look for lines that deviate from standard coding practices (e.g., poor variable naming, lack of comments, complex expressions).


Instructions:

    Analyze the provided codebase and identify lines that match the above criteria.
    Make sure to check for edge cases and scenarios where the input might cause unexpected behavior.
    Pay attention to lines with complex logic or multiple operations as they are more prone to errors.
    For each identified line, provide a json object with the following fields:
        problematic_code: A snippet of the problematic code. 
        description: A description of the potential bug.
        suggestion: A suggestion for how to fix or further investigate the issue.

Here is an example of how the code could look like and what your response should be:

Example1 - Code:
c++

double calculate_average(std::vector<int> numbers) {
    int total = std::accumulate(numbers.begin(), numbers.end(), 0);
    int count = numbers.size();
    return total / static_cast<double>(count);
}

int get_element(std::vector<int> array, int index) {
    return array.at(index);
}

Example1 - Response:

json

[{  problematic_code: "return total / static_cast<double>(count);",
    description: "Potential division by zero if numbers vector is empty",
    suggestion: "Add a check to ensure count is not zero before performing the division." },
    { problematic_code: "return array.at(index);",
    description: "Possible out-of-bound array access",
    suggestion: "Add a check to ensure the index is within the bounds of the array." }]

Please review the code below and use the formatting of the example to provide your response.

Code to analyze:

c++

#include <iostream>
#include <vector>
using namespace std;

int sum_of_elements(vector<int> arr) {
    int sum = 0;
    for (int i = 0; i <= arr.size(); i++) {
        sum += arr[i];
    }
    return sum;
}

int main() {
    int n;
    cout << "Enter the number of elements: ";
    cin >> n;

    vector<int> arr(n);
    cout << "Enter the elements:" << endl;
    for (int i = 0; i < n; i++) {
        cin >> arr[i];
    }

    int result = sum_of_elements();
    cout << "Sum of elements: " << result << endl;

    return 0;
}

'''

In [13]:


chat = [
    { "role": "user", "content": new_prompt},
]

chat_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

In [14]:

prompt = '''\
You are debugging some code. Can you help me identify the issues in this piece of code? Can you give me for every line a probability if this line contains a bug:
def sumElementsOfList(in_list):
    sum = 3
    for i in list:
        sum -= i
    return sum
'''

prompt = """
Objective: The objective of this task is to identify potentially non-functional or suspicious code segments. Your task is to assign probabilities to each line of code based on how likely it is to be non-functional or contain errors.

Instructions:

1. Identify Non-Functional Code: Your primary goal is to identify code segments that are likely to be non-functional, contain errors, or deviate significantly from expected programming conventions.
   
2. Assign Probability Scores: For each code segment provided, assign a probability score ranging from 0 to 1, where 0 indicates high confidence that the code is functional and error-free, and 1 indicates high suspicion of non-functionality or errors.

3. Consider Context and Syntax: Take into account the context in which the code appears and the syntactical correctness. Code segments that exhibit unusual or unexpected patterns, syntax errors, or inconsistencies are more likely to be flagged as suspicious.

4. Highlight Uncommon Constructs: Pay particular attention to code constructs, functions, or methods that are rarely used or appear in unconventional contexts. These are often indicators of potential errors or unintended behavior.

The output format should look like this:
LOC    probability
eg.
1    0.2
2    0.1
3    0.9

Do this for the following code snippet:
```
def sumElementsOfList(in_list):
    sum = 3
    for i in list:
        sum -= i
    return sum
```
"""


#model.generate(**inputs,  max_new_tokens=200)

#new_prompt = '''Write code for fibonacci series in python'''
inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
# print number of prompt tokes
print(inputs["input_ids"].shape[-1])


#output = model.generate(**inputs, max_new_tokens=1000)
#print(tokenizer.decode(output[0]))
#prediction = output[0][-1]
#softmaxed = torch.softmax(prediction,0)
#print(tokenizer.convert_ids_to_tokens([torch.argmax(softmaxed)]))
#print(inputs)
prompt_len = inputs["input_ids"].shape[-1]
outputs = model.generate(**inputs, max_new_tokens=1000)
print(tokenizer.decode(outputs[0][prompt_len:]))

744
```json
[{
  "problematic_code": "int result = sum_of_elements();",
  "description": "Missing argument to the function call",
  "suggestion": "Pass the vector of elements to the function as an argument."
},
{
  "problematic_code": "for (int i = 0; i <= arr.size(); i++) {",
  "description": "Loop condition will result in accessing beyond the array bounds",
  "suggestion": "Change the condition to i < arr.size() to avoid out-of-bounds access."
}]
```<eos>


In [27]:

prompt = '''\
class HNSW {
'''

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_len = inputs["input_ids"].shape[-1]
outputs = model.generate(**inputs, max_new_tokens=2000)
print(tokenizer.decode(outputs[0]))

<bos>class HNSW {
    constructor(
        dimension,
        num_trees,
        num_neighbors,
        num_threads,
        max_memory_size,
        max_memory_usage,
        max_memory_usage_per_node,
        max_memory_usage_per_node_per_thread,
        max_memory_usage_per_node_per_thread_per_tree,
        max_memory_usage_per_node_per_thread_per_tree_per_thread,
        max_memory_usage_per_node_per_thread_per_tree_per_thread_per_tree,
        max_memory_usage_per_node_per_thread_per_tree_per_thread_per_tree_per_tree,
        max_memory_usage_per_node_per_thread_per_tree_per_thread_per_tree_per_tree_per_tree,
        max_memory_usage_per_node_per_thread_per_tree_per_thread_per_tree_per_tree_per_tree_per_tree,
        max_memory_usage_per_node_per_thread_per_tree_per_thread_per_tree_per_tree_per_tree_per_tree_per_tree,
        max_memory_usage_per_node_per_thread_per_tree_per_thread_per_tree_per_tree_per_tree_per_tree_per_tree_per_tree,
        max_memory_usage_per_node_per_thread_

In [28]:
print(tokenizer.tokenize("Sohn"))
print(tokenizer.tokenize(chr(0x1FFA0)))
print(tokenizer.tokenize("Merkel"))
print(tokenizer.tokenize("🍿"))
print(tokenizer.tokenize("🪠"))
print(tokenizer.tokenize("🤌"))


['S', 'ohn']
['<0xF0>', '<0x9F>', '<0xBE>', '<0xA0>']
['Mer', 'kel']
['🍿']
['<0xF0>', '<0x9F>', '<0xAA>', '<0xA0>']
['🤌']


In [13]:
print(len("🪠"))

1


In [19]:
prompt = '''\
//use hnswlib to build hnsw
'''

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_len = inputs["input_ids"].shape[-1]
outputs = model.generate(**inputs, max_new_tokens=2000)
print(tokenizer.decode(outputs[0]))

<bos>//use hnswlib to build hnsw
//use hnswlib to build hnsw
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <chrono>
#include <random>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <map>
#include <queue>
#include <stack>
#include <list>
#include <deque>
#include <bitset>
#include <sstream>
#include <iomanip>
#include <numeric>
#include <functional>
#include <utility>
#include <tuple>
#include <climits>
#include <cfloat>
#include <cstdint>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <cassert>
#include <complex>
#include <valarray>
#include <array>
#include <random>
#include <limits>
#include <locale>
#include <codecvt>
#include <regex>
#include <filesystem>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <atomic>
#include <chrono>
#include <random>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <