### GH Access Token

In [1]:
def load_token(filename):
    """Return the contents of the file at `filename` with surrounding whitespace removed."""
    with open(filename) as token_file:
        return token_file.read().strip()

# Personal access token read from a local text file (see the steps below for creating it).
# The path is relative, so the file must be in the directory the notebook is run from.
GH_ACCESS_TOKEN=load_token("gh_access_token.txt")

1. Log in to your GitHub account.
2. Click on your profile picture in the top-right corner of the screen and select "Settings".
3. In the left sidebar, click on "Developer settings".
4. Click on "Personal access tokens".
5. Click on "Generate new token".
6. Give your token a description that will help you remember what it is used for.
7. Select the permissions that your token requires. Note that you should only select the permissions that are necessary for your application to function.
8. Click on "Generate token".
9. Copy the generated access token.
10. Open a text editor and create a new file called `gh_access_token.txt` in the directory this notebook is run from.
11. Paste the access token into the file.
12. Save and close the file.


### Regular Imports

In [2]:
import ast
import base64
import json
import os
import random
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, ALL_COMPLETED
from datetime import datetime

import git
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
import tree_sitter
from github import Github
from tqdm.auto import tqdm, trange
from tree_sitter import Language

In [3]:
#Quiet TensorFlow's native logging and cap OpenMP at 3 threads.
#Set here, ahead of the TensorFlow import cell below.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'OMP_NUM_THREADS': '3',
})

### Machine Learning Imports

In [4]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import (
    Input, Dense, LSTM, Attention, Concatenate, Layer,
    Embedding, Dot, Softmax, TimeDistributed, Multiply,
    Lambda, LayerNormalization, MultiHeadAttention,
    Add, Masking, GlobalMaxPooling1D, GlobalMaxPooling2D, Reshape, MaxPooling1D, MaxPooling2D,
    Dropout, Conv1D, Conv2D, Bidirectional, GRU, ConvLSTM2D, Flatten, Permute
)

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import LambdaCallback, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.cluster import KMeans

from tensorflow.keras.models import Model
from tensorflow.keras.losses import MeanSquaredError, CosineSimilarity
from tensorflow.keras import backend as K
from keras.utils.vis_utils import plot_model
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

### Data Pre-Processing Parameters
These must be set before running the data preprocessing.
If any of these parameters are changed, all cells below **must** be re-run.

In [5]:
#Number of contexts (paths) each sample is padded or down-sampled to.
#This is the `k` number described in commit2seq
BAG_SIZE = 200

#Maximum length of a single context path, counted in AST nodes.
#For example, with a value of 16 a path can be at most 16 nodes long.
#Smaller paths mean a higher focus on local dependencies, while longer paths represent a more general
#representation of long-distance dependencies.
CONTEXT_SIZE = 16

#Internal fixed length representation size. This is the size of the fixed-length vector that the encoder
#will learn to produce. Experimentation will be needed.
OUTPUT_SIZE = 2048

#Toggle to determine if tokens should be one-hot encoded or not
ONE_HOT = False

#Toggle to use root -> terminal paths instead of terminal -> terminal paths.
#code2vec and commit2vec use terminal -> terminal paths, but generating them is significantly slower.
ONLY_ROOT_PATHS = False

#Toggle to drop paths longer than CONTEXT_SIZE. When False, such paths are instead cut down to their
#last CONTEXT_SIZE nodes. Likely should always be set to True.
IGNORE_DEEP_PATHS = True

#Generate full tuples of (terminal_token_a, (path...), terminal_token_b) instead of just paths.
#commit2seq describes using this strategy as being quite a bit more effective (like 30%), but currently I
#haven't figured out how to implement it.
USE_FULL_TUPLES = False    #NOT working yet

### Token Vocab and Meta Setup

In [6]:
#Load the tree-sitter grammars from the compiled shared library.
#The ast.so file must be re-generated to add support for more languages.
#To do this, see: https://github.com/tree-sitter/py-tree-sitter
_AST_LIBRARY_PATH = 'ast-bindings/build/ast.so'

JS_LANGUAGE = Language(_AST_LIBRARY_PATH, 'javascript')
PY_LANGUAGE = Language(_AST_LIBRARY_PATH, 'python')
JAVA_LANGUAGE = Language(_AST_LIBRARY_PATH, 'java')
C_LANGUAGE = Language(_AST_LIBRARY_PATH, 'c')
CPP_LANGUAGE = Language(_AST_LIBRARY_PATH, 'cpp')

In [7]:
def download_json_data(lang):
    """Download the tree-sitter node types for `lang` and return them as a sorted list.

    Fetches `src/node-types.json` from the `tree-sitter-{lang}` GitHub repository and
    collects the "type" of every top-level entry plus the "type" of every listed subtype.

    Args:
        lang: Grammar name used in the repository URL, e.g. "java" or "python".

    Returns:
        A list of unique node-type names in sorted order. The order is deterministic so
        that positions in the list (used as node IDs elsewhere in this notebook) are the
        same in every kernel session; iterating a plain set of strings is not.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    url = f"https://raw.githubusercontent.com/tree-sitter/tree-sitter-{lang}/master/src/node-types.json"
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, timeout=30)
    # Fail here with a clear HTTP error instead of a confusing JSON decode error below.
    response.raise_for_status()
    data = json.loads(response.text)

    node_types = set()
    for entry in data:
        node_types.add(entry["type"])
        for subtype in entry.get("subtypes", ()):
            node_types.add(subtype["type"])
    return sorted(node_types)

#Fetch the node types for every supported grammar (one HTTP request per language, in this order).
(
    PYTHON_NODE_TYPES,
    JAVA_NODE_TYPES,
    JAVASCRIPT_NODE_TYPES,
    C_NODE_TYPES,
    CPP_NODE_TYPES,
) = map(download_json_data, ("python", "java", "javascript", "c", "cpp"))

In [8]:
#For now only the Java node types are used. To enable support for other languages, use the union of
#the per-language lists instead (see the commented-out line below).
ALL_NODE_TYPES = JAVA_NODE_TYPES
#ALL_NODE_TYPES = sorted(set(PYTHON_NODE_TYPES + JAVA_NODE_TYPES + JAVASCRIPT_NODE_TYPES + C_NODE_TYPES + CPP_NODE_TYPES))

# FILE_FILTERS = (".c", ".cpp", ".java", ".js", ".py")
# Tuple of file extensions to keep, passed to str.endswith().
# The trailing comma is required: (".java") without it is just the string ".java", not a tuple.
FILE_FILTERS = (".java",)

#Number of distinct node types. Node IDs used in the encoded paths are 1-based (0 is the padding
#value), which is why encodings elsewhere are sized MAX_NODE_LOOKUP_NUM + 1.
MAX_NODE_LOOKUP_NUM = len(ALL_NODE_TYPES)
ALL_NODE_INDEXES = range(MAX_NODE_LOOKUP_NUM)

In [9]:
#Derived from the parameters above -- do not edit by hand.
#Each sample is BAG_SIZE paths; a path is CONTEXT_SIZE node IDs, or CONTEXT_SIZE flattened
#one-hot vectors of length MAX_NODE_LOOKUP_NUM + 1 when ONE_HOT is enabled.
INPUT_SHAPE = (
    BAG_SIZE,
    CONTEXT_SIZE * (MAX_NODE_LOOKUP_NUM + 1) if ONE_HOT else CONTEXT_SIZE,
)

### Load Raw dataset

In [10]:
# Commit pairs with columns fix_hash, bug_hash, Y, bug_repo, fix_repo (see the preview below).
# NOTE(review): Y looks like the 0/1 label for the pair -- confirm against the dataset description.
RAW_DATA = pd.read_csv("pairs_apache_small.csv")

In [11]:
# Preview the loaded pairs as a sanity check.
RAW_DATA

Unnamed: 0,fix_hash,bug_hash,Y,bug_repo,fix_repo
0,0b05d3b939c5ed37a4253e7c3614d824e76ed664,0b92cec1e07a1f2d9aa70f3ecd7d0fb12290d2e2,1,apache/kafka,apache/kafka
1,f84bf9cbe771f252d5624e30d27755c9e5225179,d46e5634a3bca248e00b4f44e5216cd6607c5a52,1,apache/hbase,apache/hbase
2,6c8f14b78afc0a6721433af6554b0ab45b8e163d,53989648276fc057d5ec7ab056a7d0a654d110b8,1,apache/hive,apache/hive
3,9623ae4de3e4c65ab2e90798843769795b8a00c6,505cf9d618c24f532e7b800d8b34a20a40c45feb,1,apache/kafka,apache/kafka
4,9fe8802cb83e05c0392b11b8dcfe354fecfda786,2e5c28f8c927f2caf6dcffc9070671089c5f771f,1,apache/hive,apache/hive
...,...,...,...,...,...
877,16c934812b2395da7fb3968965b03d2f6aa8c8a3,0a08525ad236f78df05c854dead62f300eae271d,0,apache/cassandra,apache/cassandra
878,b04bed022aec0f1f478a03383ab5184f048133b6,44f6c4b946511ce4663d41bf40f2960d2faee198,0,apache/kafka,apache/kafka
879,b04bed022aec0f1f478a03383ab5184f048133b6,0d4cf64af359d22749f7e865c4efaee773d64962,0,apache/kafka,apache/kafka
880,925599c72bc35edb5ffa25c6cc839932573b01aa,a8a73362bd9a3967c46011ded1ed831a586acd2e,0,apache/zookeeper,apache/zookeeper


### Load Rich Dataset from GH/git

In [13]:
# Every repository referenced by either side of a pair.
repos = set(list(RAW_DATA["bug_repo"]) + list(RAW_DATA["fix_repo"]))

#Create a "clones" directory to hold the local clones (no-op if it already exists)
os.makedirs("clones", exist_ok=True)

# Clone each "repo" into the "clones" folder, skipping repos that are already present
for repo in tqdm(repos):
    repo_folder = repo.replace("/", "-")
    clone_path = f"clones/{repo_folder}"
    if os.path.exists(clone_path):
        continue
    # Argument list, no shell: a repo name read from the CSV can never be interpreted as shell syntax.
    completed = subprocess.run(["git", "clone", f"https://github.com/{repo}.git", clone_path])
    if completed.returncode != 0:
        # Report the failure instead of ignoring it; later cells expect the clone to exist.
        print(f"WARNING: git clone failed for {repo} (exit code {completed.returncode})")

  0%|          | 0/13 [00:00<?, ?it/s]

In [14]:
class Commit:
    """One git commit together with the AST path contexts derived from its changes.

    Commit metadata is read from a local clone (GitPython) when the object is created.
    The `gh_*` / `_GH_*` methods are GitHub-API alternatives to the local-clone methods;
    they read `self.repo_name`, which must be supplied for them to work.
    """

    ###################################### SETUP ######################################

    def __init__(self, sha, repo_path, repo_name=None):
        """Create the commit and load its metadata from the local clone.

        Args:
            sha: Commit hash.
            repo_path: Filesystem path of the local git clone containing the commit.
            repo_name: Optional repository name passed to `Github.get_repo` by the
                GitHub-based methods. Only needed when those methods are used.
        """
        self.sha = sha
        self.repo_path = repo_path
        # Previously never assigned, so every GitHub-based method failed with AttributeError.
        self.repo_name = repo_name
        self.parent = None
        self.author = None
        self.datetime = None
        self.message = None
        self.bag_of_contexts = None
        self._populate_commit_info()

    def _populate_commit_info(self):
        """Fill parent, author, datetime and message from the local clone."""
        repo = git.Repo(self.repo_path)
        commit = repo.commit(self.sha)
        # Only the first parent is recorded; root commits have none.
        self.parent = commit.parents[0].hexsha if commit.parents else None
        self.author = commit.author.name if commit.author else None
        # fromtimestamp() without a tz argument yields a naive datetime in local time.
        self.datetime = datetime.fromtimestamp(commit.committed_date)
        self.message = commit.message if commit.message else None

    def _GH_populate_commit_info(self):
        """Fill parent, author, datetime and message via the GitHub API (needs repo_name)."""
        g = Github(GH_ACCESS_TOKEN)
        repo = g.get_repo(self.repo_name)
        commit = repo.get_commit(sha=self.sha)
        self.parent = commit.parents[0].sha if commit.parents else None
        self.author = commit.author.name if commit.author else None
        self.datetime = commit.commit.author.date
        self.message = commit.commit.message if commit.commit.message else None

    def _generate_bags_of_contexts(self):
        """Compute the padded bag of contexts and store it on `self.bag_of_contexts`."""
        self.bag_of_contexts = self.to_padded_consumable_data()

    ###################################### GIT and GitHub ######################################

    ################# GIT #################

    def get_files_changed(self):
        """Return paths that differ between this commit and its first parent and match FILE_FILTERS.

        Best effort: returns [] on any error (including a commit without parents).
        """
        try:
            repo = git.Repo(self.repo_path)
            commit = repo.commit(self.sha)
            return [
                diff.a_path
                for diff in commit.diff(commit.parents[0])
                # Guard against a missing path so one odd entry cannot discard the whole list.
                if diff.a_path and diff.a_path.endswith(FILE_FILTERS)
            ]
        except Exception:
            return []

    def get_source_at_commit(self, file_name):
        """Return the UTF-8 text of `file_name` at this commit, or '' if it cannot be read."""
        try:
            repo = git.Repo(self.repo_path)
            commit = repo.commit(self.sha)
            return commit.tree[file_name].data_stream.read().decode('utf-8')
        except Exception:
            return ''

    def get_source_at_parent(self, file_name):
        """Return the UTF-8 text of `file_name` at the first parent, or '' if it cannot be read."""
        try:
            repo = git.Repo(self.repo_path)
            commit = repo.commit(self.sha)
            return commit.parents[0].tree[file_name].data_stream.read().decode('utf-8')
        except Exception:
            return ''

    ################# GitHub #################

    def gh_get_files_changed(self):
        """GitHub-API variant of get_files_changed(); returns [] on any error."""
        try:
            g = Github(GH_ACCESS_TOKEN)
            repo = g.get_repo(self.repo_name)
            commit = repo.get_commit(sha=self.sha)
            return [f.filename for f in commit.files if f.filename.endswith(FILE_FILTERS)]
        except Exception:
            return []

    def gh_get_source_at_commit(self, file_name):
        """GitHub-API variant of get_source_at_commit(); returns '' on any error."""
        try:
            g = Github(GH_ACCESS_TOKEN)
            repo = g.get_repo(self.repo_name)
            contents = repo.get_contents(file_name, ref=self.sha)
            return base64.b64decode(contents.content).decode('utf-8')
        except Exception:
            return ''

    def gh_get_source_at_parent(self, file_name):
        """GitHub-API variant of get_source_at_parent(); returns '' on any error."""
        try:
            g = Github(GH_ACCESS_TOKEN)
            repo = g.get_repo(self.repo_name)
            commit = repo.get_commit(sha=self.sha)
            parent_commit = repo.get_commit(sha=commit.parents[0].sha)
            contents = repo.get_contents(file_name, ref=parent_commit.sha)
            return base64.b64decode(contents.content).decode('utf-8')
        except Exception:
            return ''

    ###################################### AST Processing ######################################

    def source_to_ast(self, source, file_name):
        """Parse `source` with the tree-sitter grammar selected by the extension of `file_name`.

        Returns the parsed tree, or None when the extension is not supported or parsing fails.
        """
        try:
            parser = tree_sitter.Parser()
            if file_name.endswith('.c'):
                parser.set_language(C_LANGUAGE)
            elif file_name.endswith('.cpp'):
                parser.set_language(CPP_LANGUAGE)
            elif file_name.endswith('.java'):
                parser.set_language(JAVA_LANGUAGE)
            elif file_name.endswith('.js'):
                parser.set_language(JS_LANGUAGE)
            elif file_name.endswith('.py'):
                parser.set_language(PY_LANGUAGE)
            else:
                print("UNKNOWN LANGUAGE")
                return None
            return parser.parse(bytes(source, 'utf8'))
        except Exception:
            return None

    ###################################### Bag of Contexts Processing ######################################

    def get_root_paths(self, tree):
        """Return the set of root -> leaf paths of `tree`, each a tuple of node type names.

        With USE_FULL_TUPLES each element is (root_text, path, leaf_text) instead.
        Returns an empty set on any error.
        """
        try:
            paths = set()

            # Depth-first walk; `path` holds the node types from the root down to `node`.
            def explore(node, path, terminalA=None):
                # The first node visited (the root) is remembered as the start terminal.
                if terminalA is None:
                    terminalA = node
                path.append(node.type)

                # A node without children is a leaf, so the path is complete.
                if not node.child_count:
                    if USE_FULL_TUPLES:
                        paths.add((terminalA.text, tuple(path), node.text))
                    else:
                        paths.add(tuple(path))
                else:
                    for child in node.children:
                        explore(child, path, terminalA)

                # Undo the append so siblings see the correct prefix.
                path.pop()

            explore(tree.root_node, [])

            return paths
        except Exception:
            return set([])

    def get_paths(self, ast):
        """Return a set of leaf -> leaf paths sampled from `ast`, each a tuple of node type names.

        Up to BAG_SIZE leaves are sampled, all pairs of those leaves are formed, and up to
        BAG_SIZE pairs are sampled from them (no sampling when BAG_SIZE is None). With
        USE_FULL_TUPLES each element is (leaf_a_text, path, leaf_b_text) instead.
        """
        graph = nx.Graph()
        node_id = 0
        leaf_nodes = []

        # Build an undirected copy of the AST. An explicit stack is used instead of recursion
        # so very deep trees cannot hit Python's recursion limit. Children are pushed in
        # reverse so nodes are still numbered in pre-order.
        stack = [(ast.root_node, None)]
        while stack:
            node, parent_id = stack.pop()
            node_id += 1
            graph.add_node(node_id, name=node.type, node=node, text=node.text)
            if parent_id is not None:
                graph.add_edge(parent_id, node_id)
            children = node.children
            if not children:
                leaf_nodes.append(node_id)
            else:
                stack.extend((child, node_id) for child in reversed(children))

        if BAG_SIZE is not None:
            leaf_nodes_sample = random.sample(leaf_nodes, min(BAG_SIZE, len(leaf_nodes)))
        else:
            leaf_nodes_sample = leaf_nodes
        leaf_node_pairs = [
            (leaf_nodes_sample[i], leaf_nodes_sample[j])
            for i in range(len(leaf_nodes_sample))
            for j in range(i + 1, len(leaf_nodes_sample))
        ]

        if BAG_SIZE is not None:
            leaf_node_pairs = random.sample(leaf_node_pairs, min(BAG_SIZE, len(leaf_node_pairs)))

        all_paths = []
        for source_id, target_id in leaf_node_pairs:
            # The graph is a tree, so the single simple path between two nodes is also
            # the shortest path.
            path = nx.shortest_path(graph, source=source_id, target=target_id)
            node_types = [graph.nodes[path_node_id]['name'] for path_node_id in path]
            if USE_FULL_TUPLES:
                all_paths.append((graph.nodes[source_id]['text'], tuple(node_types), graph.nodes[target_id]['text']))
            else:
                all_paths.append(tuple(node_types))

        return set(all_paths)

    def ast_to_bag_of_contexts(self, ast_trees):
        """Return the union of the path sets of every tree in `ast_trees`.

        Entries that are None (source_to_ast returns None on failure) are skipped.
        """
        paths = set()
        for tree in ast_trees:
            if tree is None:
                continue
            if ONLY_ROOT_PATHS:
                paths |= self.get_root_paths(tree)
            else:
                paths |= self.get_paths(tree)
        return paths

    ###################################### Padding and Encoding ######################################

    def map_bag_of_contexts_to_id(self, bag_of_contexts):
        """Convert each path of node type names into a list of 1-based node IDs.

        The ID is the position in ALL_NODE_TYPES plus one (0 is left free for padding).
        Raises ValueError for a node type that is not in ALL_NODE_TYPES.
        """
        mapped_paths = []
        for path in bag_of_contexts:
            mapped_path = []
            for node in path:
                index = ALL_NODE_TYPES.index(node)
                mapped_path.append(index + 1)
            mapped_paths.append(mapped_path)
        return mapped_paths

    def one_hot_encode(self, bag_of_contexts):
        """Replace every node ID with a one-hot list of length MAX_NODE_LOOKUP_NUM + 1."""
        one_hot_paths = []

        for row in bag_of_contexts:
            row_one_hot = []

            for num in row:
                num_one_hot = [0] * (MAX_NODE_LOOKUP_NUM + 1)
                num_one_hot[int(num)] = 1
                row_one_hot.append(num_one_hot)

            one_hot_paths.append(row_one_hot)

        return one_hot_paths

    def pad_each_context(self, bag_of_contexts):
        """Left-pad every path with zeros (or zero vectors) to exactly CONTEXT_SIZE entries.

        Paths longer than CONTEXT_SIZE are dropped when IGNORE_DEEP_PATHS is set and
        otherwise cut down to their last CONTEXT_SIZE entries.
        """
        padded_one_hot_paths = []
        for path in bag_of_contexts:
            if IGNORE_DEEP_PATHS and len(path) > CONTEXT_SIZE:
                continue
            if ONE_HOT:
                padded_path = [[0] * (MAX_NODE_LOOKUP_NUM + 1)] * max(CONTEXT_SIZE - len(path), 0) + path[-CONTEXT_SIZE:]
            else:
                padded_path = [0] * max(CONTEXT_SIZE - len(path), 0) + path[-CONTEXT_SIZE:]
            padded_one_hot_paths.append(padded_path)
        return padded_one_hot_paths

    ###################################### Utility ######################################

    def to_raw_consumable_data(self):
        """Return the encoded, per-path padded contexts that differ between commit and parent.

        Pipeline: changed files -> sources at commit and at parent -> ASTs -> path sets ->
        symmetric difference of the two sets -> node IDs -> optional one-hot -> per-path padding.
        The number of returned paths is not yet normalised to BAG_SIZE.
        """
        files_changed = self.get_files_changed()
        sources_at_commit = [self.get_source_at_commit(filename) for filename in files_changed]
        sources_at_parent = [self.get_source_at_parent(filename) for filename in files_changed]

        asts_commit = [self.source_to_ast(source, filename) for source, filename in zip(sources_at_commit, files_changed)]
        asts_parent = [self.source_to_ast(source, filename) for source, filename in zip(sources_at_parent, files_changed)]

        contexts_commit = self.ast_to_bag_of_contexts(asts_commit)
        contexts_parent = self.ast_to_bag_of_contexts(asts_parent)

        # Keep only the paths present on exactly one side of the change.
        contexts = list(contexts_commit.symmetric_difference(contexts_parent))

        contexts = self.map_bag_of_contexts_to_id(contexts)

        if ONE_HOT:
            contexts = self.one_hot_encode(contexts)

        contexts = self.pad_each_context(contexts)

        return contexts

    def raw_to_padded(self, consumable):
        """Return `consumable` normalised to exactly BAG_SIZE paths.

        Longer inputs are randomly down-sampled; shorter inputs are prefixed with blank
        (all-zero) paths.
        """
        if len(consumable) == BAG_SIZE:
            return consumable

        if len(consumable) > BAG_SIZE:
            return random.sample(consumable, BAG_SIZE)

        if ONE_HOT:
            blank_path = [[0] * (MAX_NODE_LOOKUP_NUM + 1)] * CONTEXT_SIZE
        else:
            blank_path = ([0] * CONTEXT_SIZE)

        return ([blank_path] * (BAG_SIZE - len(consumable)) + consumable)

    def to_padded_consumable_data(self):
        """Return the commit's contexts as BAG_SIZE paths.

        With ONE_HOT each path's one-hot vectors are flattened into a single list.
        """
        consumable = self.to_raw_consumable_data()
        padded = self.raw_to_padded(consumable)
        if ONE_HOT:
            padded = np.array(padded)
            # Merge the (path position, one-hot) axes so each path becomes one flat vector.
            padded = padded.reshape(padded.shape[0], padded.shape[1] * padded.shape[2])
            padded = padded.tolist()
        return padded

In [15]:
# Unique (commit sha, repo) pairs covering both the fix side and the bug side of every row.
# Columns are selected by name rather than by position so the pairing stays correct if the
# CSV column layout changes (for example when a saved index column is present).
TUPLES = list(
    set(zip(RAW_DATA["fix_hash"], RAW_DATA["fix_repo"]))
    | set(zip(RAW_DATA["bug_hash"], RAW_DATA["bug_repo"]))
)

In [None]:
#Creates a lookup dictionary where any commit SHA can be looked up to grab the Commit object with all the data, + bag of paths
#NOTE(review): this is a defaultdict(list), so looking up an unknown SHA yields (and stores) an
#empty list rather than raising KeyError -- confirm that is what downstream cells expect.
COMMIT_DATA_LOOKUP = defaultdict(list)

def _to_commit(pair):
    """Build the Commit for a (sha, repo) pair and compute its bag of contexts."""
    sha, repo = pair
    # Commit.__init__ already loads the commit metadata, so _populate_commit_info()
    # is not called a second time here.
    commit = Commit(sha, f"clones/{repo.replace('/', '-')}")
    commit._generate_bags_of_contexts()

    return commit

resolved = []
with ThreadPoolExecutor(max_workers=16) as executor:
    futures = [executor.submit(_to_commit, pair) for pair in TUPLES]
    # future.result() re-raises any exception from the worker, so one failing commit stops this loop.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing rich data from ommits"):
        resolved.append(future.result())

for commit in tqdm(resolved, desc="Adding to lookup dictionary"):
    # The original tested an undefined name `result` here, which raises NameError on a fresh kernel.
    if commit is not None:
        COMMIT_DATA_LOOKUP[commit.sha] = commit

Processing rich data from ommits:   0%|          | 0/763 [00:00<?, ?it/s]

In [136]:
#TODO: Preprocess each commit metadata

In [28]:
#TODO: Create X_Train and X_Test

In [29]:
#TODO: Declare params for ML Model

In [30]:
#TODO: Initialize Model

In [31]:
#TODO: Train SimSiam Encoder

In [32]:
#TODO: Train binary classification

In [33]:
#TODO: Evaluate