# Model Testing

### Imports

In [53]:
import sys
import ast
import git
import numpy as np
import random
from tqdm import tqdm
import os

In [22]:
# Runtime knobs exported through the process environment in a single step:
# the TensorFlow C++ log-level variable plus the OpenMP / KMP threading
# variables. The TensorFlow import lives in the following cell.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'OMP_NUM_THREADS': '1',
    'KMP_BLOCKTIME': '1',
})

In [26]:
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Attention, Concatenate, Layer, Embedding, Dot, Softmax, TimeDistributed
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.callbacks import LambdaCallback
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer

### Setup of metadata

In [37]:
# Names of every public class exposed by the `ast` module that derives from
# ast.AST, in the order dir() reports them.
ALL_NODE_TYPES = [
    name
    for name in dir(ast)
    if not name.startswith('_')
    and isinstance(getattr(ast, name), type)
    and issubclass(getattr(ast, name), ast.AST)
]

# 0-based position of each entry of ALL_NODE_TYPES.
ALL_NODE_INDEXES = list(range(len(ALL_NODE_TYPES)))

# Number of known node types.
MAX_NODE_LOOKUP_NUM = len(ALL_NODE_TYPES)

### Helper Functions

In [28]:
def process_commit(commit_sha, repo_path, onehot=True, padded=True):
    """Encode the AST paths that differ between a commit and its first parent.

    Every ``.py`` file reported by the diff between the commit and its first
    parent is parsed on both sides. All root-to-leaf paths of node type names
    are collected per side, and the paths present on exactly one side
    (symmetric difference across all files) are kept. Each node type name is
    mapped to ``ALL_NODE_TYPES.index(name) + 1`` so that ``0`` stays free for
    padding.

    Args:
        commit_sha: Revision passed to ``git.Repo.commit``.
        repo_path: Path of the local git repository.
        onehot: When true, every node id is expanded into a one-hot list of
            length ``MAX_NODE_LOOKUP_NUM + 1``.
        padded: When true, every path is left-padded with zeros (or all-zero
            one-hot vectors) and truncated to its last 32 nodes.

    Returns:
        A list of encoded paths, ordered by their node type names so repeated
        calls give the same order. The list is empty when the commit has no
        parent or when no changed ``.py`` file could be parsed on both sides.
    """
    SET_PATH_LENGTH = 32
    one_hot_width = MAX_NODE_LOOKUP_NUM + 1

    def get_file_contents(commit, file_path):
        # Read the blob stored at file_path in the commit's tree as text.
        return commit.tree[file_path].data_stream.read().decode('utf-8')

    def get_paths(tree):
        # Collect every root-to-leaf path as a tuple of node type names.
        paths = set()

        def explore(node, path):
            path.append(type(node).__name__)
            children = list(ast.iter_child_nodes(node))
            if children:
                for child in children:
                    explore(child, path)
            else:
                # No children: the path accumulated so far is complete.
                paths.add(tuple(path))
            path.pop()

        explore(tree, [])
        return paths

    def one_hot(index):
        vector = [0] * one_hot_width
        vector[index] = 1
        return vector

    ########################################## Processing and setup ##########################################

    repo = git.Repo(repo_path)
    commit = repo.commit(commit_sha)

    # A root commit has nothing to diff against.
    if not commit.parents:
        return []
    parent = commit.parents[0]

    changed_py_files = [
        diff.a_path
        for diff in commit.diff(parent)
        if diff.a_path and diff.a_path.endswith('.py')
    ]

    ########################################## Compute Abstract Syntax Trees ##########################################

    pre_commit_trees = []
    post_commit_trees = []

    for file_path in changed_py_files:
        try:
            pre_commit_tree = ast.parse(get_file_contents(parent, file_path))
            post_commit_tree = ast.parse(get_file_contents(commit, file_path))
        except Exception:
            # Best effort: a file that is missing, undecodable or unparsable
            # on either side is skipped as a whole, so a half-processed file
            # never contributes only its pre-commit paths. KeyboardInterrupt
            # and SystemExit are deliberately not swallowed.
            continue
        pre_commit_trees.append(pre_commit_tree)
        post_commit_trees.append(post_commit_tree)

    ########################################## Compute Bag of Contexts (paths) ##########################################

    pre_commit_paths = set()
    for tree in pre_commit_trees:
        pre_commit_paths |= get_paths(tree)

    post_commit_paths = set()
    for tree in post_commit_trees:
        post_commit_paths |= get_paths(tree)

    unique_paths = pre_commit_paths.symmetric_difference(post_commit_paths)

    ########################################## Map Symbols to a number/index/id ##########################################

    # Ids start at 1; 0 is reserved for padding.
    node_ids = {name: position + 1 for position, name in enumerate(ALL_NODE_TYPES)}

    # Sets of strings iterate in a hash-dependent order; sort for reproducibility.
    mapped_paths = [[node_ids[node] for node in path] for path in sorted(unique_paths)]

    ########################################## Convert to One-Hot Encoding ##########################################

    if onehot:
        mapped_paths = [[one_hot(node_id) for node_id in path] for path in mapped_paths]

    ########################################## Pad to a fixed length ##########################################

    if not padded:
        return mapped_paths

    padded_paths = []
    for path in mapped_paths:
        missing = max(SET_PATH_LENGTH - len(path), 0)
        if onehot:
            # Build a fresh zero vector per slot so the padding rows are not
            # aliases of one shared list.
            filler = [[0] * one_hot_width for _ in range(missing)]
        else:
            filler = [0] * missing
        padded_paths.append(filler + path[-SET_PATH_LENGTH:])

    return padded_paths

In [29]:
def get_random_commit_shas(directory, n, branch='main'):
    """Pick up to ``n`` distinct random commit SHAs from a branch.

    Candidates are the commits reachable from ``branch`` that have at least
    one parent and are not the commit currently checked out at ``HEAD``.
    Parentless (root) commits are excluded because there is nothing to diff
    them against.

    Args:
        directory: Path of the local git repository.
        n: Number of commits requested.
        branch: Revision whose history is sampled; defaults to ``'main'``.

    Returns:
        A list of distinct commit SHAs in random order. It holds fewer than
        ``n`` entries when the branch does not have that many candidates.
    """
    repo = git.Repo(directory)

    # The most recent commit (HEAD) is never offered as a candidate.
    latest_commit_sha = repo.head.commit.hexsha

    possible_commit_shas = [
        commit.hexsha
        for commit in repo.iter_commits(branch)
        if commit.parents and commit.hexsha != latest_commit_sha
    ]

    # Clamp so that asking for more commits than exist returns what is
    # available instead of failing on an empty range.
    sample_size = max(0, min(n, len(possible_commit_shas)))

    # random.sample draws without replacement, so no duplicates are returned.
    return random.sample(possible_commit_shas, sample_size)


In [51]:

def create_dataset(path, size=32, threshold=50, onehot=True):
    """Build a toy dataset of encoded AST-path changes from random commits.

    Args:
        path: Path of the local git repository.
        size: Number of random commits requested from get_random_commit_shas.
        threshold: A commit is labelled 1 when it yields more than this many
            paths, otherwise 0.
        onehot: Forwarded to process_commit; also selects whether ``X`` holds
            one-hot vectors or plain indexes.

    Returns:
        A tuple ``(X_train, y_train, X, P, d, Y, input_dim, output_dim)``:
        X_train is the per-commit list of encoded paths (commits with no
        paths are dropped); y_train is the 0/1 label per kept commit; X has
        one entry per index in ALL_NODE_INDEXES; P is every distinct path in
        first-seen order; d is the constant 150; Y is ``[0, 1]``; input_dim
        is the length of the first path and output_dim the length of its
        first element, each falling back to 1 when unavailable.
    """
    def freeze(value):
        # Nested lists are unhashable; turn them into nested tuples so paths
        # can be tracked in a set.
        if isinstance(value, (list, tuple)):
            return tuple(freeze(item) for item in value)
        return value

    commits = get_random_commit_shas(path, size)

    X_train = []
    for commit in tqdm(commits, desc='Processing commits'):
        encoded = process_commit(commit, path, onehot)
        if len(encoded) > 0:
            X_train.append(encoded)
    y_train = [(1 if len(paths) > threshold else 0) for paths in X_train]

    # Determine the input and output dimensions. IndexError covers an empty
    # dataset; TypeError covers elements that have no length (plain ints).
    try:
        input_dim = len(X_train[0][0])
    except (IndexError, TypeError):
        input_dim = 1
    try:
        output_dim = len(X_train[0][0][0])
    except (IndexError, TypeError):
        output_dim = 1

    # NOTE(review): these entries use the 0-based ALL_NODE_INDEXES, whereas
    # process_commit encodes node ids as index + 1 -- confirm this is intended.
    X = []
    for num in ALL_NODE_INDEXES:
        if onehot:
            num_one_hot = [0] * (MAX_NODE_LOOKUP_NUM + 1)
            num_one_hot[int(num)] = 1
            X.append(num_one_hot)
        else:
            X.append(num)

    # Order-preserving de-duplication of all paths across commits.
    all_paths = [item for sublist in X_train for item in sublist]
    seen = set()
    P = []
    for arr in tqdm(all_paths, desc="Generating path vocab"):
        key = freeze(arr)
        if key not in seen:
            seen.add(key)
            P.append(arr)

    Y = [0, 1]

    d = 150

    return X_train, y_train, X, P, d, Y, input_dim, output_dim


### Data Configuration and Generation

In [52]:
# Set this to any git repo on your local machine. The REPO_PATH environment
# variable, when set, overrides the default below so the notebook can run on
# another machine without editing this cell.
REPO_PATH = os.environ.get("REPO_PATH", "/home/brennan/bot-radio-tempName")
DATA_SIZE = 32
SEED = 42

# get_random_commit_shas draws from the `random` module; seed it so the same
# commits are sampled on every run, matching the fixed seed of the split.
random.seed(SEED)

X_train, y_train, X, P, d, Y, _, _ = create_dataset(REPO_PATH, DATA_SIZE)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

Processing commits: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 12.64it/s]
Generating path vocab: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 839/839 [00:00<00:00, 855.72it/s]


In [55]:
# Randomly initialised parameters, all drawn from a standard normal:
#   path_vocab       -- one d-wide row per entry of P
#   value_vocab      -- one d-wide row per entry of X
#   W                -- (d, 3*d) matrix applied to the concatenated context
#   attention_vector -- length-d vector used to score combined contexts
path_vocab = np.random.standard_normal(size=(len(P), d))
value_vocab = np.random.standard_normal(size=(len(X), d))
W = np.random.standard_normal(size=(d, 3 * d))
attention_vector = np.random.standard_normal(size=d)

In [54]:
def compute_attention_code_vector(path_contexts):
    """Aggregate path-contexts into a single code vector with attention.

    For every path-context the embedding of its first element, the embedding
    of the path itself and the embedding of its last element are
    concatenated. The concatenation is projected through ``W`` followed by
    ``tanh``, then scored against ``attention_vector``. The scores are
    soft-maxed across contexts and used to form a weighted sum.

    Args:
        path_contexts: Iterable of path-contexts. Each one must be an entry
            of ``P`` (it is looked up with ``P.index``), and its first and
            last elements are used to index ``value_vocab``.
            NOTE(review): this presumably expects integer-encoded paths
            (``onehot=False``) -- confirm against the caller. Also verify
            that ``value_vocab`` has a row for every id that can occur.

    Returns:
        The attention-weighted sum of the combined context vectors.

    Raises:
        ValueError: If ``path_contexts`` is empty, or (from ``P.index``) if a
            path-context is not present in ``P``.
    """
    # Map each path-context to its corresponding embedding
    context_vectors = []
    for path_context in path_contexts:
        xs = path_context[0]
        xt = path_context[-1]

        xs_embedding = value_vocab[xs]
        pj_embedding = path_vocab[P.index(path_context)]
        # Use the terminal element here, not the start element again.
        xt_embedding = value_vocab[xt]

        context_vector = tf.concat([xs_embedding, pj_embedding, xt_embedding], axis=0)
        context_vectors.append(context_vector)

    if not context_vectors:
        # Without any context there is nothing to attend over.
        raise ValueError("path_contexts must contain at least one path-context")

    # Combine context vectors using fully connected layer
    context_weights = tf.matmul(context_vectors, tf.transpose(W))
    combined_context_vectors = tf.nn.tanh(context_weights)

    # Compute attention weights: one score per context, normalised across contexts
    attention_scores = tf.matmul(combined_context_vectors, tf.expand_dims(attention_vector, axis=1))
    attention_weights = tf.nn.softmax(attention_scores, axis=0)

    # Aggregate into code vector using attention
    code_vector = tf.reduce_sum(tf.multiply(combined_context_vectors, attention_weights), axis=0)

    return code_vector