# init data

In [1]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Any

# Seed NumPy's legacy global random state with a fixed value.
seed = 18022004
np.random.seed(seed)

In [2]:
# Base directory for data files, and the repos sub-directory derived from it.
data_prefix: str = 'data'
repo_prefix: str = f'{data_prefix}/repos'

# Disabled code: builds the file names of three sampled datasets (log / test / others)
# from a template and reads each of them from `data_prefix`.
# data_name_template: str = '500_{}_sample_dataset.parquet'
# data_types = ['log', 'test', 'others']

# sampled_data_name: Dict[str, str] = {data_type: data_name_template.format(data_type) for data_type in data_types}

# log_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{sampled_data_name['log']}', engine = 'pyarrow')
# test_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{sampled_data_name['test']}', engine = 'pyarrow')
# others_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{sampled_data_name['others']}', engine = 'pyarrow')

# File name of the parquet dataset to load; the commented-out line is an alternative dataset.
# data_name = 'migration_others_method_no_code.parquet'
data_name = 'migration_others_class_code_no_import.parquet'

# load data

In [3]:
# Read the parquet file at `{data_prefix}/{data_name}` into a DataFrame using the pyarrow engine.
data_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{data_name}', engine = 'pyarrow')

# init parser

In [5]:
from tree_sitter import Language, Parser
import tree_sitter_java as tsjava
import difflib

from tqdm import tqdm

# Wrap the tree_sitter_java grammar in a Language object and build the module-level
# `parser`; extract_methods_with_body and remove_comments both call `parser.parse`.
JAVA_LANGUAGE = Language(tsjava.language())
parser = Parser(JAVA_LANGUAGE)

# preview

In [4]:
# Show the column names of the loaded dataset.
data_df.columns

Index(['id', 'fromLib', 'toLib', 'repoName', 'repoOwner', 'repoSplitName',
       'prevCommit', 'startCommit', 'endCommit', 'fileName', 'startCode',
       'endCode', 'diff', 'startCommitChanges', 'endCommitChanges',
       'total_added', 'total_removed', 'total_position', 'detailed_changes',
       'startCode_cleaned', 'endCode_cleaned', 'diff_cleaned',
       'lib_percentage'],
      dtype='object')

In [5]:
# Display the loaded DataFrame (the notebook renders a truncated view of rows and columns).
data_df

Unnamed: 0,id,fromLib,toLib,repoName,repoOwner,repoSplitName,prevCommit,startCommit,endCommit,fileName,...,startCommitChanges,endCommitChanges,total_added,total_removed,total_position,detailed_changes,startCode_cleaned,endCode_cleaned,diff_cleaned,lib_percentage
0,3,ant:ant,org.apache.ant:ant,bobmcwhirter_drools,bobmcwhirter,drools,2ba5d35fb486c4a16b5b8b15fc247e8759bfa54f,92384035a8651b675c82689b24837eb8adb81d66,0df25aead759b3b542a685ac21a3a009bcd22fe9,drools-compiler/src/main/java/org/drools/rule/...,...,+org.mockito:mockito-all,-org.jmock:jmock\n-org.jmock:jmock-legacy,1,3,3,"[{'added_count': 0, 'lib_changes': 0, 'line_ch...",\n\npackage org.drools.rule.builder.dialect.ja...,\n\npackage org.drools.rule.builder.dialect.ja...,"--- \n+++ \n@@ -234,8 +234,6 @@\n consequence....",0.000000
1,4,ant:ant,org.apache.ant:ant,bobmcwhirter_drools,bobmcwhirter,drools,2ba5d35fb486c4a16b5b8b15fc247e8759bfa54f,92384035a8651b675c82689b24837eb8adb81d66,0df25aead759b3b542a685ac21a3a009bcd22fe9,drools-compiler/src/test/java/org/drools/guvno...,...,+org.mockito:mockito-all,-org.jmock:jmock\n-org.jmock:jmock-legacy,16,20,3,"[{'added_count': 1, 'lib_changes': 3, 'line_ch...",package org.drools.guvnor.server.util;\n\nimpo...,package org.drools.guvnor.server.util;\n\nimpo...,"--- \n+++ \n@@ -5,12 +5,10 @@\n import org.dro...",0.083333
3,6,ant:ant,org.apache.ant:ant,bobmcwhirter_drools,bobmcwhirter,drools,2ba5d35fb486c4a16b5b8b15fc247e8759bfa54f,92384035a8651b675c82689b24837eb8adb81d66,0df25aead759b3b542a685ac21a3a009bcd22fe9,drools-compiler/src/test/java/org/drools/integ...,...,+org.mockito:mockito-all,-org.jmock:jmock\n-org.jmock:jmock-legacy,0,1,2,"[{'added_count': 0, 'lib_changes': 0, 'line_ch...",package org.drools.integrationtests;\n\nimport...,package org.drools.integrationtests;\n\nimport...,"--- \n+++ \n@@ -172,7 +172,6 @@\n final Packag...",0.000000
4,7,ant:ant,org.apache.ant:ant,bobmcwhirter_drools,bobmcwhirter,drools,2ba5d35fb486c4a16b5b8b15fc247e8759bfa54f,92384035a8651b675c82689b24837eb8adb81d66,0df25aead759b3b542a685ac21a3a009bcd22fe9,drools-compiler/src/test/java/org/drools/integ...,...,+org.mockito:mockito-all,-org.jmock:jmock\n-org.jmock:jmock-legacy,5,8,5,"[{'added_count': 1, 'lib_changes': 0, 'line_ch...",package org.drools.integrationtests;\n\nimport...,package org.drools.integrationtests;\n\nimport...,"--- \n+++ \n@@ -535,9 +535,8 @@\n session.getA...",0.000000
6,9,ant:ant,org.apache.ant:ant,bobmcwhirter_drools,bobmcwhirter,drools,2ba5d35fb486c4a16b5b8b15fc247e8759bfa54f,92384035a8651b675c82689b24837eb8adb81d66,0df25aead759b3b542a685ac21a3a009bcd22fe9,drools-compiler/src/test/java/org/drools/integ...,...,+org.mockito:mockito-all,-org.jmock:jmock\n-org.jmock:jmock-legacy,17,32,14,"[{'added_count': 5, 'lib_changes': 4, 'line_ch...",package org.drools.integrationtests;\n\n\n\nim...,package org.drools.integrationtests;\n\n\n\nim...,"--- \n+++ \n@@ -1,6 +1,11 @@\n package org.dro...",0.204082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80666,151702,ru.yandex.qatools.properties:properties-loader,ru.qatools.commons:properties,yandex-qatools_hamcrest-pojo-matcher-generator,yandex-qatools,hamcrest-pojo-matcher-generator,13ac1e25cc715b6855890070099a3cdbea36cf56,cc7a474d3ea53164d28b4a1b23c25989982e7d03,cc7a474d3ea53164d28b4a1b23c25989982e7d03,feature-matcher-generator/src/test/java/ru/yan...,...,+ru.qatools.commons:properties\n-ru.yandex.qat...,+ru.qatools.commons:properties\n-ru.yandex.qat...,1,11,4,"[{'added_count': 0, 'lib_changes': 1, 'line_ch...",package ru.yandex.qatools.processors.matcher.g...,package ru.yandex.qatools.processors.matcher.g...,"--- \n+++ \n@@ -4,7 +4,6 @@\n import org.junit...",0.166667
80667,151703,ru.yandex.qatools.properties:properties-loader,ru.qatools.commons:properties,yandex-qatools_hamcrest-pojo-matcher-generator,yandex-qatools,hamcrest-pojo-matcher-generator,13ac1e25cc715b6855890070099a3cdbea36cf56,cc7a474d3ea53164d28b4a1b23c25989982e7d03,cc7a474d3ea53164d28b4a1b23c25989982e7d03,feature-matcher-generator/src/test/java/ru/yan...,...,+ru.qatools.commons:properties\n-ru.yandex.qat...,+ru.qatools.commons:properties\n-ru.yandex.qat...,7,7,5,"[{'added_count': 2, 'lib_changes': 4, 'line_ch...",package ru.yandex.qatools.processors.matcher.g...,package ru.yandex.qatools.processors.matcher.g...,"--- \n+++ \n@@ -10,14 +10,14 @@\n import org.m...",0.285714
80668,151704,ru.yandex.qatools.properties:properties-loader,ru.qatools.commons:properties,yandex-qatools_hamcrest-pojo-matcher-generator,yandex-qatools,hamcrest-pojo-matcher-generator,13ac1e25cc715b6855890070099a3cdbea36cf56,cc7a474d3ea53164d28b4a1b23c25989982e7d03,cc7a474d3ea53164d28b4a1b23c25989982e7d03,feature-matcher-generator/src/test/java/ru/yan...,...,+ru.qatools.commons:properties\n-ru.yandex.qat...,+ru.qatools.commons:properties\n-ru.yandex.qat...,6,6,5,"[{'added_count': 0, 'lib_changes': 1, 'line_ch...",package ru.yandex.qatools.processors.matcher.g...,package ru.yandex.qatools.processors.matcher.g...,"--- \n+++ \n@@ -1,6 +1,5 @@\n package ru.yande...",0.333333
80669,151705,ru.yandex.qatools.properties:properties-loader,ru.qatools.commons:properties,yandex-qatools_hamcrest-pojo-matcher-generator,yandex-qatools,hamcrest-pojo-matcher-generator,13ac1e25cc715b6855890070099a3cdbea36cf56,cc7a474d3ea53164d28b4a1b23c25989982e7d03,cc7a474d3ea53164d28b4a1b23c25989982e7d03,feature-matcher-generator/src/test/java/ru/yan...,...,+ru.qatools.commons:properties\n-ru.yandex.qat...,+ru.qatools.commons:properties\n-ru.yandex.qat...,7,10,5,"[{'added_count': 2, 'lib_changes': 0, 'line_ch...",package ru.yandex.qatools.processors.matcher.g...,package ru.yandex.qatools.processors.matcher.g...,"--- \n+++ \n@@ -67,12 +67,9 @@\n \n \n when(he...",0.000000


In [12]:
# NOTE(review): 'method_after' is not among the columns listed by the `data_df.columns`
# cell above, and position 154550 is beyond the last row label (80669) shown in the
# preview. This cell looks like a leftover from a different dataset and would presumably
# fail on a fresh run - confirm, then fix or remove it.
print(data_df.iloc[154550]['method_after'])




In [108]:
def print_tree(node, indent = 0):
    """
    Print a tree-sitter node and all of its descendants, one node per line.

    Each line shows the node type followed by the list of non-blank lines of the
    node's text; children are indented two spaces deeper than their parent.

    :param node: The tree-sitter node to print (must expose type, text and children).
    :param indent: The nesting depth of ``node``; controls the leading padding.
    """
    padding = '  ' * indent

    # Keep only the lines of the node text that contain something besides whitespace.
    visible_lines = list(filter(str.strip, node.text.decode('utf-8').split('\n')))

    print(f"{padding}{node.type}: '{visible_lines}'")

    # Descend into the children one level deeper.
    deeper = indent + 1
    for sub_node in node.children:
        print_tree(sub_node, deeper)

def extract_method_details(node, class_name, source_code):
    '''
    Build a description of one method node.

    :param node: The tree-sitter node of the method (its direct children are inspected).
    :param class_name: Name of the enclosing class(es), stored unchanged in the result.
    :param source_code: The parsed source as bytes; the node's byte range is sliced from it.
    :return: Dict with the keys class_name, body, modifiers, return_type, name,
             parameters, signature and signature_no_mod. Parts that are not found
             among the node's children stay empty strings.
    '''
    def decoded(ts_node):
        return ts_node.text.decode('utf-8')

    # The "body" is the full source text covered by the node.
    body = source_code[node.start_byte:node.end_byte].decode('utf-8', errors = 'replace')

    modifiers = return_type = name = parameters = ''
    for part in node.children:
        kind = part.type
        if kind == 'modifiers':
            modifiers = ' '.join(map(decoded, part.children))
        elif 'type' in kind:
            # Any child whose type contains "type" is taken as the return type; a later match wins.
            return_type = decoded(part)
        elif kind == 'identifier':
            name = decoded(part)
        elif kind.endswith('parameters'):
            # Only children typed "...parameter" are kept, which skips punctuation tokens.
            parameters = ', '.join(decoded(param) for param in part.children if param.type.endswith('parameter'))

    signature_no_mod = f'{return_type} {name}({parameters})'

    return {
        'class_name': class_name,
        'body': body,
        'modifiers': modifiers,
        'return_type': return_type,
        'name': name,
        'parameters': parameters,
        'signature': f'{modifiers} {signature_no_mod}',
        'signature_no_mod': signature_no_mod,
    }

def extract_methods_with_body(java_code):
    '''
    Parse Java source with the module-level ``parser`` and describe every method in it.

    :param java_code: The Java source text.
    :return: A list with one ``extract_method_details`` dict per method_declaration
             node, in depth-first order, or ``None`` when the tree contains an ERROR
             node or any exception is raised (the reason is printed).
    '''
    def contains_error(node):
        # Depth-first search; the text of the first ERROR node found is printed.
        if node.type != 'ERROR':
            return any(map(contains_error, node.children))
        print(node.text.decode('utf-8'))
        return True

    def collect_methods(node: Any, encoded_code, class_context: List[str] = None) -> List[Dict[str, Any]]:
        '''Depth-first walk gathering method_declaration nodes with their enclosing class names.'''
        enclosing = [] if class_context is None else class_context

        if node.type == 'class_declaration':
            # The first identifier child of a class declaration is used as its name.
            declared_name = next(
                (child.text.decode('utf-8') for child in node.children if child.type == 'identifier'),
                None,
            )
            if declared_name:
                # Build a new list instead of appending, so the caller's list is never modified.
                enclosing = enclosing + [declared_name]

        found = []
        if node.type == 'method_declaration':
            # Enclosing class names are joined with dots, outermost first.
            found.append(extract_method_details(node = node, class_name = '.'.join(enclosing), source_code = encoded_code,))

        for child in node.children:
            found += collect_methods(child, encoded_code, enclosing)

        return found

    try:
        source_bytes = java_code.encode('utf-8')
        root = parser.parse(source_bytes).root_node

        if contains_error(root):
            raise Exception('Parsing errors found in the code')

        return collect_methods(root, source_bytes)
    except Exception as e:
        print('Loi :' ,e)
        return None

# Blank out comments in Java source
def remove_comments(java_code: str) -> str:
    '''
    Replace every line comment and block comment in ``java_code`` with spaces.

    The source is parsed with the module-level ``parser``. Each comment's byte range
    is overwritten with exactly as many spaces as it had bytes, so the byte offsets
    of all remaining code are unchanged. (Previously one extra space was inserted per
    comment, which grew the text and shifted everything after it.)

    :param java_code: The Java source text.
    :return: The source with comments blanked out, stripped of leading and trailing
             whitespace.
    '''
    source_bytes = java_code.encode('utf-8')
    root_node = parser.parse(source_bytes).root_node

    # Collect the byte ranges of all comment nodes with an explicit stack, so very
    # deep syntax trees cannot exhaust Python's recursion limit.
    comment_ranges = []
    pending = [root_node]
    while pending:
        node = pending.pop()
        if node.type in {'line_comment', 'block_comment'}:
            comment_ranges.append((node.start_byte, node.end_byte))
        pending.extend(node.children)

    # Same-length replacement keeps every index valid, so the ranges can be applied in any order.
    result_code = bytearray(source_bytes)
    for start, end in comment_ranges:
        result_code[start:end] = b' ' * (end - start)

    return result_code.decode('utf-8').strip()

def diff_methods(methods_start, methods_end):
    '''
    Compare two lists of method dicts field by field.

    Two methods count as the same only if all of class_name, name, body, modifiers,
    return_type, parameters, signature and signature_no_mod are equal after
    stripping surrounding whitespace. Other keys in the input dicts are ignored.

    The result lists keep the order in which methods first appear in their input
    list (duplicates are reported once). The previous set-based implementation
    returned them in hash order, which differs between interpreter runs.

    :param methods_start: Method dicts of the earlier version.
    :param methods_end: Method dicts of the later version.
    :return: Dict with 'removed' (only in start), 'added' (only in end) and
             'unchanged' (in both) lists of normalized method dicts.
    :raises KeyError: If a method dict lacks one of the compared fields.
    '''
    compared_fields = (
        'class_name', 'name', 'body', 'modifiers',
        'return_type', 'parameters', 'signature', 'signature_no_mod',
    )

    def normalize_methods(methods):
        # Map a hashable fingerprint of each normalized method to the method itself;
        # dict insertion order preserves first-occurrence order and drops duplicates.
        by_fingerprint = {}
        for method in methods:
            sub_method = {field: method[field].strip() for field in compared_fields}
            by_fingerprint.setdefault(frozenset(sub_method.items()), sub_method)
        return by_fingerprint

    normalized_start = normalize_methods(methods_start)
    normalized_end = normalize_methods(methods_end)

    return {
        'removed': [method for key, method in normalized_start.items() if key not in normalized_end],
        'added': [method for key, method in normalized_end.items() if key not in normalized_start],
        'unchanged': [method for key, method in normalized_start.items() if key in normalized_end],
    }

def check_same_methods(method1, method2):
    '''
    Tell whether two method dicts refer to the same method.

    Only the 'signature_no_mod' entries (return type, name and parameters) are
    compared; bodies, modifiers and class names play no part.
    '''
    first_signature, second_signature = (method['signature_no_mod'] for method in (method1, method2))
    return first_signature == second_signature

In [117]:
# Debug run on one hand-picked row: dump its raw and comment-stripped sources to text
# files for manual inspection, then extract the methods of both versions.
# (This used to be a `for id in range(len(data_df))` loop that overwrote its own loop
# variable and broke after the first pass; it also shadowed the builtin `id`.)
sample_id = 8219
sample = data_df.iloc[sample_id]

start_code = sample['startCode']
end_code = sample['endCode']

# Write as UTF-8 explicitly rather than relying on the platform default encoding,
# which can fail on non-ASCII characters in the Java sources.
for dump_name, dump_text in (('before.txt', start_code), ('after.txt', end_code)):
    with open(dump_name, 'w', encoding = 'utf-8') as f:
        f.write(dump_text)

start_code_cleaned = remove_comments(start_code)
end_code_cleaned = remove_comments(end_code)

for dump_name, dump_text in (
    ('before_no_comment.txt', start_code_cleaned),
    ('after_no_comment.txt', end_code_cleaned),
):
    with open(dump_name, 'w', encoding = 'utf-8') as f:
        f.write(dump_text)

# Methods are extracted from the raw (not comment-stripped) sources.
# extract_methods_with_body returns None when the source does not parse cleanly.
method_start = extract_methods_with_body(start_code)
method_end = extract_methods_with_body(end_code)

# start_code

_(Class<? extends SubView> cls) {
Loi : Parsing errors found in the code
_(Class<? extends SubView> cls) {
Loi : Parsing errors found in the code
