# Database Analysis

In [12]:
import lmdb
import pandas as pd
import matplotlib.pyplot as plt
import tempfile
import numpy as np
import seaborn as sns
from tqdm import tqdm

In [13]:
tqdm.pandas()

## Tokenize Code

We will use TreeSitter to tokenize our Java code.

In [None]:
!mkdir -p ./third_party
!cd third_party && git clone https://github.com/tree-sitter/tree-sitter-java.git

In [None]:
from tree_sitter import Language
Language.build_library('./language-parser.so', ["third_party/tree-sitter-java"])

In [None]:
from codegen.preprocessing.lang_processors.java_processor import JavaProcessor
jprocessor = JavaProcessor(root_folder='./third_party')

#### Load Dataset

In [15]:
from dataset_utils import get_or_create_df

df = get_or_create_df(df_path='/sailhome/abaveja/df.feather')
preserved_cols = ['focal_class.identifier', 'focal_method.cm_signature', 'focal_method.body', 'repository.url']

In [61]:
tests_per_method = df.groupby(['repository.url', 'focal_method.cm_signature']).size()

#### Tokenize Loaded Functions

In [77]:
methods = df[preserved_cols].groupby(['repository.url', 'focal_method.cm_signature']).first()

In [78]:
def tokenize(row):
    return jprocessor.tokenize_code(
        row['focal_method.body'],
        process_strings=True,
        keep_comments=False
    )
    
methods['focal_method.tokenized_body'] = methods.progress_apply(tokenize, axis=1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 403476/403476 [02:17<00:00, 2932.53it/s]


In [21]:
methods.head(2)

Unnamed: 0_level_0,focal_class.identifier,focal_method.body,repository.url,focal_method.tokenized_body
focal_method.cm_signature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A.main(final String[] args),A,public static void main(final String[] args) t...,https://github.com/chirauraNoSakusha/chiraura,"[public, static, void, main, (, final, String,..."
"A1.go(Integer[] arrays, Integer target)",A1,"public static Integer[] go(Integer[] arrays, I...",https://github.com/leishiguang/basic-of-java,"[public, static, Integer, [, ], go, (, Integer..."


## Remove Duplicates
We use the following algorithm to remove duplicates
> 1. Calculate pairwise *Jaccard similarity* between all programs in the dataset (make symmetric assumption)
2. Cluster programs based on their Jaccard similarity
3. Define a set of new programs
4. For each cluster, add the program with the highest number of unit tests to the program set. Ties are broken via uniform random sampling among the best-tested programs.
5. Add all programs with no cluster to the set

In [101]:
from dpu_utils.codeutils.deduplication import DuplicateDetector
import random

random.seed(42)

detector = DuplicateDetector(
    set_similarity_threshold=0.8,
    multiset_similarity_threshold=0.7,
    min_num_tokens_per_document=20
)

for method_key, method in tqdm(methods.iterrows(), total=len(methods)):
    detector.add_file(
        id='\0'.join(method_key), # use null ascii char because that will never be in signature or repo URL
        tokens=method['focal_method.tokenized_body'],
        language='java'
    )

clusters = detector.compute_duplicates()  # List[Set[example_id]]
detector.print_clone_set_stats(clusters)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 403476/403476 [00:40<00:00, 9991.81it/s]


Duplicated files: 18.57%
Avg num of files per clones 2.50
Median num of files per clones 2.0
Duplication Ratio 11.13%


#### Deduplication Heuristic
To Deduplicate Code Clusters, we will only preserve the cluster with the highest number of tests. Ties will be broken by taking a random sample from the maximum category.

In [133]:
from collections import defaultdict

centroids = set()
# Take a 
for c in clusters:
    counts = defaultdict(list)
    for ex_id in c:
        key = tuple(ex_id.split('\0'))
        counts[tests_per_method[key]].append(key)
        
    max_key = max(counts.keys())
    centroid = random.choice(list(counts[max_key]))
    centroids.add(centroid)
    
clustered_ex_ids = {ex_id for c in clusters for ex_id in c}
for method_key, method in tqdm(methods.iterrows(), total=len(methods)):
    joint_key = '\0'.join(method_key)
    if joint_key not in clustered_ex_ids:
        centroids.add(joint_key)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 403476/403476 [00:12<00:00, 33131.19it/s]


In [131]:
print("Statistics")
print("-"*30)
print("Number of methods", len(methods))
print("Number of clusters", len(centroids))
print("Duplicate Percentage", f"{round((1-(len(centroids)/len(methods)))*100, 2)}%")

Statistics
------------------------------
Number of methods 403476
Number of clusters 383975
Duplicate Percentage 4.83%


## Load Query Results

In [140]:
query_results = lmdb.open('/scr/abaveja/query_cache_matx1/', readonly=True)

In [142]:
import pickle

data = []

with query_results.begin() as txn:
    for key, value in txn.cursor():
        value = pickle.loads(value)

        print(key, value)
    

df = pd.DataFrame(data, columns=['column_name'])

# Close the LMDB environment
env.close()

# Print the DataFrame
print(df)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



b'VpcManagerImpl.checkCapabilityPerServiceProvider(final Set<Provider> providers, final Capability capability,\n                                                     final Service service)_VpcManagerImplTest.testCheckCapabilityPerServiceProvider()' {'field_accesses': 2, 'has_fixtures': True, 'diff_class_unknown_invoc': 2, 'branch_count': 0, 'special_count': 0, 'halstead_length': 49, 'cyclomatic_complexity': 1, 'efferent_coupling': 4, 'afferent_coupling': 0, 'maintainability_index': 10.621771194802733, 'gsig': 'testCheckCapabilityPerServiceProvider()'}
b'VpcManagerImpl.checkCapabilityPerServiceProvider(final Set<Provider> providers, final Capability capability,\n                                                     final Service service)_VpcManagerImplTest.testCheckCapabilityPerServiceProviderFail()' {'field_accesses': 1, 'has_fixtures': True, 'diff_class_unknown_invoc': 2, 'branch_count': 0, 'special_count': 0, 'halstead_length': 40, 'cyclomatic_complexity': 1, 'efferent_coupling': 4, 'a

NameError: name 'env' is not defined

In [139]:
query_results.close()