# ANOMALIST TEST

## Dependencies

In [None]:
!python --version

In [None]:
!pip install GitPython
!pip install transformers
!pip install tokenizers
!pip install pickle5
!pip install tree-sitter

In [None]:
#!rm *.*

In [None]:
!ls -la

In [None]:
from google.colab import files
files.upload()

In [None]:
files.upload()

In [1]:
import json
from pathlib import Path
from tree_sitter import Language, Parser


import utils
from code_item import FragmentType
from code_corpus import CodeCorpus
from representator import Representator
from code_representation import CodeRepresentation
from anomalist import Anomalist



## Settings

In [2]:
# Sizes handed to the Anomalist constructor below; presumably they have to
# match the architecture the VAE file was trained with -- TODO confirm.
aux_size = 78
hidden_size = 8

# Pre-trained artifacts. Fail early with an error that names the missing file.
# An explicit raise is used rather than `assert`, because assertions are
# skipped when Python runs with -O and carry no message here.
scaler_path = Path('anomalist_data/scaler_214_995415.pickle')
if not scaler_path.is_file():
    raise FileNotFoundError(f'Scaler file not found: {scaler_path}')
vae_path = Path('anomalist_data/vae_214_995415.pth')
if not vae_path.is_file():
    raise FileNotFoundError(f'VAE file not found: {vae_path}')

device_id = 'cpu'
verbose = False

# Files to analyse, the fragment granularity, and the model name given to
# the Representator.
paths = ['sample_working_but_syntactically_incorrect.py']
fragment_type = FragmentType.FUNCTION
model_name = 'microsoft/graphcodebert-base'

## Init

In [3]:
# Compiled tree-sitter grammar library used to parse Python sources.
# NOTE: this is a machine-specific absolute path (Windows drive D:); point it
# at your own build, e.g. './langs_py_java_csharp.so'.
tree_sitter_lib = Path('D://tree-sitter-python//build//my-languages.so')
if not tree_sitter_lib.is_file():
    # Checked up front so a wrong path is reported by name instead of
    # surfacing as a library-loading error from inside Language().
    raise FileNotFoundError(f'tree-sitter grammar library not found: {tree_sitter_lib}')

parser = Parser()
parser.set_language(Language(str(tree_sitter_lib), 'python'))

In [4]:
# Representator configured with `model_name` and `device_id`; its run() method
# is called once per code fragment inside run() further down.
representator = Representator(model_name=model_name, device_id=device_id)

In [5]:
# Anomaly scorer configured with the sizes, scaler file and VAE file from the
# Settings cell; its run_repr() is what produces the reconstruction loss.
anomalist = Anomalist(
    aux_size=aux_size,
    hidden_size=hidden_size,
    scaler_path=scaler_path,
    vae_path=vae_path,
    device_id=device_id,
    verbose=verbose,
)

## Inference

In [6]:
def run(anomalist, representator, paths):
    """Score every code fragment found in ``paths``.

    Works in two passes: first the text of each fragment is sent through
    ``representator.run``; afterwards every collected representation is sent
    through ``anomalist.run_repr``.

    The fragment granularity and the tree-sitter parser are taken from the
    module-level ``fragment_type`` and ``parser`` variables.

    Failures while extracting fragments or while running the representator
    are printed and the affected file/fragment is skipped. Exceptions raised
    by ``fragment.get_text()`` or ``anomalist.run_repr`` are not caught.

    Returns:
        A list of ``(path, fragment_index, fragment, loss)`` tuples, where
        ``loss`` is element ``[0][0]`` of the reconstruction loss returned by
        ``anomalist.run_repr``.
    """
    # Pass 1: fragment extraction + representation.
    embedded = []
    for item, src_path in CodeCorpus(lang='py', paths=paths, with_path=True):
        try:
            pieces = item.get_fragments(
                fragment_type=fragment_type,
                parser=parser,
                window_size=None,
                overlap=None,
            )
        except Exception as err:
            print(f'ERROR in parsing: {src_path}, message: {err}')
            continue

        # A falsy result (no fragments) simply produces no iterations.
        for idx, piece in enumerate(pieces or ()):
            source = piece.get_text()
            try:
                vector, _ = representator.run([source], padding=True, truncation=True)
            except Exception as err:
                print(f'ERROR in model: {src_path}, message: {err}')
            else:
                embedded.append((src_path, idx, piece, vector))

    # Pass 2: anomaly scoring of every representation gathered above.
    scored = []
    for src_path, idx, piece, vector in embedded:
        _, losses = anomalist.run_repr(vector)
        scored.append((src_path, idx, piece, losses[0][0]))

    return scored

In [7]:
def result_to_json_string(result):
    """Serialize the tuples produced by ``run`` into a JSON array string.

    Each ``(path, fragment_index, fragment, loss)`` entry becomes one object
    with the keys ``a-index`` (the loss formatted with two decimals, as a
    string), ``path``, ``first_line`` (first element of ``fragment.lines()``),
    ``fragment_range`` (``beg``/``end`` from the fragment) and
    ``fragment_index``.
    """
    records = [
        {
            'a-index': f'{score:.2f}',
            'path': src_path,
            'first_line': piece.lines()[0],
            'fragment_range': {'beg': piece.beg(), 'end': piece.end()},
            'fragment_index': idx,
        }
        for src_path, idx, piece, score in result
    ]
    return json.dumps(records)

In [8]:
# Score every fragment of the configured `paths` and print the scores as a
# JSON array; `result` is reused by the plain-text report in the next cell.
result = run(anomalist, representator, paths)
print(result_to_json_string(result))

[{"a-index": "87.63", "path": "sample_working_but_syntactically_incorrect.py", "first_line": "def func(lines):", "fragment_range": {"beg": 0, "end": 5}, "fragment_index": 0}]


In [9]:
# Plain-text view of the same results, one line per fragment:
# path, beg:end range, fragment index, loss (two decimals), first line.
for path, fragment_index, fragment, loss in result:
    print(f'{path} {fragment.beg()}:{fragment.end()}  {fragment_index} {loss:.2f} {fragment.lines()[0]}')

sample_working_but_syntactically_incorrect.py 0:5  0 87.63 def func(lines):
