# init

In [1]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Any

# Seed NumPy's global random state so np.random-based results are repeatable.
seed = 18022004
np.random.seed(seed)

In [2]:
# Root directory for input files; the dataset and tmp.txt are read from here.
data_prefix: str = 'data'
# Path built from data_prefix; not referenced in the cells visible here.
repo_prefix: str = f'{data_prefix}/repos'

# File name of the parquet dataset to load.
data_name = 'no_comment_dataset.parquet'

# Load the dataset with the pyarrow engine. Later cells read the columns
# 'startCode', 'endCode', 'startCode_cleaned' and 'endCode_cleaned' from it.
data_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{data_name}', engine = 'pyarrow')

# init parser

In [3]:
from tree_sitter import Language, Parser
import tree_sitter_java as tsjava

# Wrap the tree-sitter Java grammar in a Language object and create a parser
# bound to it. `parser` is a module-level object that
# extract_methods_with_body reads as a global.
JAVA_LANGUAGE = Language(tsjava.language())
parser = Parser(JAVA_LANGUAGE)

In [4]:
def _node_text(source_bytes, node):
    """Return the UTF-8 decoded slice of ``source_bytes`` covered by ``node``."""
    return source_bytes[node.start_byte:node.end_byte].decode("utf-8")


def _describe_method(source_bytes, method_node):
    """Build the name/signature/body record for one ``method_declaration`` node."""
    # `modifiers` is a plain child (not a named field) of method_declaration.
    modifiers = []
    for part in method_node.children:
        if part.type == "modifiers":
            modifiers = [_node_text(source_bytes, m) for m in part.children]

    # Look the remaining pieces up by grammar field name instead of by node
    # type: void, primitive, generic and array return types each have their
    # own node type (void_type, integral_type, generic_type, array_type, ...),
    # so matching on a short list of type names drops them from the signature.
    type_node = method_node.child_by_field_name("type")
    name_node = method_node.child_by_field_name("name")
    params_node = method_node.child_by_field_name("parameters")

    method_name = _node_text(source_bytes, name_node) if name_node is not None else ""

    signature = ""
    if params_node is not None:
        return_type = _node_text(source_bytes, type_node) if type_node is not None else ""
        prefix = " ".join(piece for piece in (" ".join(modifiers), return_type) if piece)
        signature = f"{prefix} {method_name}{_node_text(source_bytes, params_node)}".strip()

    return {
        "name": method_name,
        "signature": signature,
        # NOTE: despite the key name this is the text of the whole method
        # declaration (modifiers, header and block), not just the block.
        "body": _node_text(source_bytes, method_node),
    }


def extract_methods_with_body(java_code):
    """Extract the methods declared directly in top-level Java classes.

    The source is parsed with the module-level tree-sitter ``parser``. Only
    ``method_declaration`` nodes that sit directly inside the ``class_body``
    of a top-level ``class_declaration`` are reported; a method is skipped
    when one of its direct children is an ``ERROR`` node.

    Args:
        java_code: Java source as ``str`` (encoded to UTF-8 before parsing)
            or as already-encoded ``bytes``.

    Returns:
        A list of dicts with the keys ``"name"``, ``"signature"`` (modifiers,
        return type, name and parameter list) and ``"body"`` (full text of
        the method declaration). Returns ``None`` if anything raises while
        parsing or extracting; the exception is printed, not propagated.
    """
    try:
        # Bytes are passed through untouched; only str needs encoding.
        if isinstance(java_code, str):
            source_bytes = java_code.encode("utf-8")
        else:
            source_bytes = java_code

        root_node = parser.parse(source_bytes).root_node

        methods = []
        for top_level in root_node.children:
            if top_level.type != "class_declaration":
                continue
            for class_part in top_level.children:
                if class_part.type != "class_body":
                    continue
                for member in class_part.children:
                    if member.type != "method_declaration":
                        continue
                    # Skip methods the parser could not fully recognise.
                    if any(c.type == "ERROR" for c in member.children):
                        continue
                    methods.append(_describe_method(source_bytes, member))
        return methods
    except Exception as e:
        # Best-effort contract: callers receive None instead of an exception.
        print('Loi :' ,e)
        return None

# observe

In [43]:
# Select the rows whose cleaned start code still contains '//'.
has_double_slash = data_df['startCode_cleaned'].str.contains('//')
edge_cases = data_df[has_double_slash]

# Show the cleaned start code of the fourth matching row.
print(edge_cases['startCode_cleaned'].iloc[3])

package org.mule.module.jersey;
public class RootServletTestCase extends AbstractServletTestCase 
{
    public RootServletTestCase() 
    {
        super("/*");
    }
    public void testBasic() throws Exception
    {
        testBasic("http://localhost:63088/base");
    }
    @Override
    protected String getConfigResources() 
    {
        return "servlet-conf.xml";
    }
}


In [46]:
# Read the integers stored one per line in tmp.txt; the loop further down
# uses them as positional row indices into data_df. The context manager
# closes the file handle, and blank lines are skipped so that a trailing
# newline does not make int() fail.
with open(f'{data_prefix}/tmp.txt', 'r', encoding='utf-8') as errors_file:
    errors = [int(line) for line in errors_file if line.strip()]

print(f'errors: {len(errors)}')

errors: 470


In [5]:
# Rebind `errors` to a single row position (210), replacing whatever list
# the name held before this cell ran.
errors = [210]

In [15]:
# For every row position in `errors`, parse the raw and the cleaned version
# of the start/end code and compare how many methods each yields. The first
# row whose counts differ is printed, recorded in `correct_errors`, and the
# scan stops there.
correct_errors = []
for row_id in errors:  # named row_id so the builtin id() is not shadowed
    try:
        row = data_df.iloc[row_id]
    except IndexError as e:
        # Position is outside the DataFrame; report it and move on.
        print(e)
        print('id :', row_id)
        continue

    start_code = row['startCode']
    end_code = row['endCode']
    start_code_cleaned = row['startCode_cleaned']
    end_code_cleaned = row['endCode_cleaned']

    parsed_methods_start = extract_methods_with_body(start_code)
    parsed_methods_end = extract_methods_with_body(end_code)

    parsed_methods_start_cleaned = extract_methods_with_body(start_code_cleaned)
    parsed_methods_end_cleaned = extract_methods_with_body(end_code_cleaned)

    # extract_methods_with_body returns None when parsing fails, so check for
    # that explicitly instead of relying on len(None) raising. The start pair
    # is examined before the end pair, stopping at the first decisive result.
    verdict = 'match'
    for raw_methods, cleaned_methods in (
        (parsed_methods_start, parsed_methods_start_cleaned),
        (parsed_methods_end, parsed_methods_end_cleaned),
    ):
        if raw_methods is None or cleaned_methods is None:
            verdict = 'unparsed'
            break
        if len(raw_methods) != len(cleaned_methods):
            verdict = 'mismatch'
            break

    if verdict == 'unparsed':
        print('parse failed; method counts cannot be compared')
        print('id :', row_id)
        continue

    if verdict == 'mismatch':
        print('id :', row_id)
        print('parsed_methods_start :', parsed_methods_start)
        print('parsed_methods_end :', parsed_methods_end)
        print('parsed_methods_start_cleaned :', parsed_methods_start_cleaned)
        print('parsed_methods_end_cleaned :', parsed_methods_end_cleaned)
        print('----------------------------------------')

        print(f'start_code: {start_code}')
        print('-' * 50)
        print(f'start_code_cleaned: {start_code_cleaned}')
        print('-' * 50)
        correct_errors.append(row_id)

        # Stop at the first mismatch so it can be inspected in isolation.
        break

child.type : block_comment
child.type : package_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : block_comment
child.type : block_comment
child.type : package_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : block_comment
child.type : package_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_declaration
child.type : package_declaration
child.type : import_declaration
child.type : import_declaration
child.type : import_decl

In [7]:
# Print the raw 'startCode' text held in `start_code`, the variable the
# comparison loop assigns for each row it processes.
print(start_code)

/*
 * $Id$
 * --------------------------------------------------------------------------------------
 * Copyright (c) MuleSource, Inc.  All rights reserved.  http://www.mulesource.com
 *
 * The software in this package is published under the terms of the CPAL v1.0
 * license, a copy of which has been included with this distribution in the
 * LICENSE.txt file.
 */

package org.mule.routing;

import org.mule.impl.MuleEvent;
import org.mule.umo.UMOMessage;
import org.mule.umo.UMOSession;
import org.mule.umo.endpoint.UMOEndpoint;
import org.mule.umo.routing.RoutingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * <code>LoggingCatchAllStrategy</code> is a simple strategy that only logs any
 * events not caught by the router associated with this strategy. This should <b>not</b>
 * be used in production unless it is acceptible for events to be disposing.
 */

public class LoggingCatchAllStrategy extends AbstractCatchAllStrategy
{
    priv