# Requirements

In [113]:
!pip install javalang

Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [1]:
import os
import hashlib
import ast       #Library to produce AST and analyze it
import astor     #Library to produce the actual code of an AST
import pandas as pd
#import javalang
#from javalang.tree import *

In [2]:
hash_dict = {}
filesname = []
file_directories = []

In [3]:
"""
In this part we make a list of the files' name and their own directory in the given directory.
A dictionary is also produced having a hash value of the files name as the key and their actual name as the value.
"""

directory = "/home/amirmahdi/projects/Adaptive-Logging-system-git/projects-to-investigate/allura-master/selected-files-to-investigate"
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f):
        filesname.append(filename)
        file_directories.append(f)

# Extracting function blocks from source code

In [4]:
"""
This function accepts the source code as a string. It first parse it to AST, then iterate through AST and selects 
nodes that are representing function definition (FunctionDef)
"""

def extract_functions(source_code):
    tree = ast.parse(source_code)
    functions = []
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            functions.append(node)
    return functions

In [5]:
"""
Creating the func_of_files list which each of its elements it is a list of function nodes of a file in the
directory.
These nodes (subtree of the AST) could be used to creat the actual functions in string by using following code:

source_code = astor.to_source(node)

Also, in this part, an ID is assigned to each function (based on its name) by using hashlib, then this ID is used as 
the key value of the "hash_dict" which holds a list containing function's name, its directory and list of
neighbour functions as the dictionary's value.
"""

func_nodes_of_files = []
for file_dir in file_directories:
    with open(file_dir, "r") as file:
        source_code = file.read()
    functions = extract_functions(source_code)
    for function in functions:
        functions_copy = functions.copy() # Make a copy of list of functions found in current file
        functions_copy.remove(function)    #creating a list of neighbours of current function (in itteratin) by removing itself from functions_copy
        hash_dict[ hashlib.sha1(str.encode(function.name)).hexdigest() ] = [file_dir.split("/")[-1], file_dir, 
                                                                           functions_copy] 
    func_nodes_of_files.extend(functions)

In [63]:
func_nodes_of_files[0].name

'dummy_oauths'

In [7]:
source = astor.to_source(func_nodes_of_files[0])
print(source)

def dummy_oauths():
    from allura.controllers.rest import Oauth1Validator
    dummy_cons_tok = OAuthConsumerToken(api_key=Oauth1Validator().
        dummy_client, name='dummy client, for oauthlib implementation',
        user_id=None)
    session(dummy_cons_tok).flush(dummy_cons_tok)
    dummy_req_tok = OAuthRequestToken(api_key=Oauth1Validator().
        dummy_request_token, user_id=None, validation_pin='dummy-pin')
    session(dummy_req_tok).flush(dummy_req_tok)
    dummy_access_tok = OAuthAccessToken(api_key=Oauth1Validator().
        dummy_access_token, user_id=None)
    session(dummy_access_tok).flush(dummy_access_tok)



# Structring Dataset by extracting functions' features

## 1. Input-dependent loops

* #### Defining needed functions:

In [6]:
def loop_finder(function_node):
    loops = []
    for node in ast.walk(function_node):
        if isinstance(node, ast.For) or isinstance(node, ast.While):
            loops.append(node)
    return loops

In [7]:
"""This function takes a source code as input and finds the name of all variables which hold the result of 
a function call"""

def dependent_variable_finder(source_code):
    tree = ast.parse(source_code)             # creating source code's AST
    d_variable_names = []
    # here it funds variable holding function calls' result
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            for node_child in ast.walk(node):
                if isinstance(node_child, ast.Call):
                    # Parse back the variable assignment node to source code and extract variable names
                    d_variable_names.extend(astor.to_source(node).split("=")[0].split(",")) 
                    
    return d_variable_names

* #### Creating Dataframe,then adding "ID" and "number_of_loops" of a fuction to it:

In [38]:
df = pd.DataFrame(columns = ['ID', 'number_of_loops'])
#tmp =0
for function_node in func_nodes_of_files:
    found_loops = loop_finder(function_node)
    #tmp += len(found_loops)
    ID = hashlib.sha1(str.encode(function_node.name)).hexdigest()
    number_of_loops = len(found_loops)
    
    new_row = pd.Series({"ID":ID, "number_of_loops":number_of_loops})
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
df = df.set_index("ID")
#df
#print(tmp)

* #### Adding "loop_input_dependent_level" to the dataframe:
    
    For all of the functions, first we extract all loops inside it (if there is any) by "using loop_finder" function. Then we creat a list of kewords representing input_dependent variables and neighbor functions. Doing these two steps, we then search for the keywords in loops' declaration part, and we increment the "input_dependent_level" of the given function if any keyword found.

In [23]:
"""This is a recursive function to find a nesting level of a given node representing a loop"""

def nested_loop_level_finder(loop_node):
    #nodes_list = ast.walk(loop_node)
    nodes_list = ast.iter_child_nodes(loop_node) # this returns a generator
    for child_node in nodes_list: # iterate over nodes_list which is a generator.
        if isinstance(child_node, ast.For) or isinstance(child_node, ast.While):
            return 1 + nested_loop_level_finder(child_node)
        elif child_node is not next(nodes_list, None): #checks if it is last item of the generator 
            return 0

In [36]:
"""This part is to test the nested_loop_level_finder function we defined above"""

s = "for loop in found_loops:\n for keyword in keywords:\n      if keyword in astor.to_source(loop):\n       input_dependent_level += 1"
tree = ast.parse(s)
nested_loop_level_finder(tree)

1

In [39]:
# this loops iterates over all of the extracted functions to find the input_dependent_level and nested_loop_level
for function_node in func_nodes_of_files:
    input_dependent_level = 0
    nested_loop_level = 0
    found_loops = loop_finder(function_node) # here we first find loops.
    if len(found_loops) != 0: #if any loop was found
        ID = hashlib.sha1(str.encode(function_node.name)).hexdigest()
        # use "has_dict" to get the neighbour functions of the curent function by using its ID (key):
        neighbour_functions_names = [n_function.name for n_function in hash_dict[ID][2]]
        
        with open(hash_dict[ID][1], "r") as file: # hash_dict[ID][1] gives the directory of current function's file
            source_code = file.read()
        dependent_variable_names = dependent_variable_finder(source_code)
        
        keywords = dependent_variable_names + neighbour_functions_names
        for loop in found_loops: # "loop" here is a node of a loop.
            for keyword in keywords:
                if keyword in astor.to_source(loop).split("\n")[0]:
                    input_dependent_level += 1
            
            nested_loop_level += nested_loop_level_finder(loop)
                
    df.at[ID, "nested_loop_level"] = int(nested_loop_level)
    df.at[ID, "loop_input_dependent_level"] = int(input_dependent_level)
# the functions without any loops inside will have Nan value for "loop_input_dependent_level", so we replace them:
df["loop_input_dependent_level"] = df["loop_input_dependent_level"].fillna(0)
df["nested_loop_level"] = df["loop_input_dependent_level"].fillna(0) 
#df

* #### Final version of the dataset having all features related to loops:
    The dataset so far includes folowing features:
    * **number_of_loops**: 
        It represents the number of loops (either for or while) inside each of the functions.
    * **nested_loop_level**: For the functions which have loop inside them, we check if those loop are nested or not; If a nested loop was found, we increse the nested_loop_level by 1. For example, if a function has the nesting level of 2, it means that this function either has a nested loop with depth of 2, or two nested loops with depth of 1.
    * **loop_input_dependent_level**: For this feature, we first creat a list of keywords which includes name of other functions (neighbor functions) and\or name of dependent variables (variables holding the return value of a function). Then for the functions which has loop(s), we check if any of these keywords has been used in the loop's declaration part or not. Each keyword match will cause an increment of 1 for loop_input_dependent_level.
    
  Next table shows the dataset so far:

In [40]:
df

Unnamed: 0_level_0,number_of_loops,nested_loop_level,loop_input_dependent_level
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ee2c971f20132061d7e749267b166730ff734a7c,0,0.0,0.0
d4f01649de54f7e70c368da6571ac5ccda9ddf79,0,0.0,0.0
bae8ac45f183613712be11aee11ec93b6e36e0d0,0,0.0,0.0
9aa1369d290e7732b7ca8db743f27d10c0ace093,0,0.0,0.0
9aa1369d290e7732b7ca8db743f27d10c0ace093,0,0.0,0.0
...,...,...,...
4553a4f2a833f635e06b21120fd95dbd7dc3fe09,0,0.0,0.0
05bb76f1e435db71754a65f9e8acec7dfdd4c65d,0,0.0,0.0
4fb170389e1b482ed3c7fae2967caca7629c84a6,0,0.0,0.0
ea00c9c71a474f84e3280537f04ede09ae70e44f,0,0.0,0.0


In [155]:
for i in df["loop_input_dependent_level"]:
    print(i)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


## 2. Synchronization

# Garbage:

In [45]:
fors = [] #this should be dataframe instead
for function_node in func_nodes_of_files:
    found_fors = loop_finder(function_node)
    if len(found_fors) != 0:
        fors.extend(found_fors)
print(astor.to_source(for_nodes_of_files[0][0]))

for node in ([obj, target] + (related_nodes or [])):
    if isinstance(node, Project):
        create_timelines.post(node.node_id)



In [None]:
functions = []
for file_dir in file_directories:
    with open(file_dir, "r") as file:
        source_code = file.read()
    functions.extend(dependent_variable_finder(source_code))
print(len(functions))
print(astor.to_source(functions[10]))
print(functions[10].func.id)