<a href="https://colab.research.google.com/github/cathieG/N_Gram/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Josh_folder_name = ""
Cathy_folder_name = "Gen AI for Soft Dev"

folder_path = f"/content/drive/My Drive/{Cathy_folder_name}" # Change if needed

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Removing exact clones

### Type 1 Clones ###
def remove_duplicates(data):
    """Remove duplicate methods based on method content.
      Almost Type-1 with the exception of comments
    """
    return data.drop_duplicates(subset="Method Code", keep="first")

In [15]:
def filter_ascii_methods(data):
    """Filter methods to include only those with ASCII characters."""
    data = data[data["Method Code"].apply(lambda x: all(ord(char) < 128 for char in x))]
    return data

In [16]:
# Three Approaches:
# 	1.	Data Distribution-Based Filtering: We eliminate outliers by analyzing the original data distribution, as demonstrated below.
# 	2.	Literature-Driven Filtering: We follow best practices outlined in research, such as removing methods exceeding 512 tokens in length.
# 	3.	Hybrid Approach: We combine elements from both the distribution-based and literature-driven methods.

def remove_outliers(data, lower_percentile=5, upper_percentile=95):
    """Remove outliers based on method length."""
    method_lengths = data["Method Code"].apply(len)
    lower_bound = method_lengths.quantile(lower_percentile / 100)
    upper_bound = method_lengths.quantile(upper_percentile / 100)
    return data[(method_lengths >= lower_bound) & (method_lengths <= upper_bound)]

In [17]:
import re
import os
import pandas as pd

# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

pd_list = []

for file in csv_files:

    file_path = os.path.join(folder_path, file)

    # Check if the file is empty
    if os.stat(file_path).st_size == 0:
        print(f"Skipping empty file: {file}")
        continue

    try:
        df = pd.read_csv(file_path, usecols=[4])  # Read the CSV file into a DataFrame

        #if the DataFrame is empty (only header row, no data row)
        if df.empty:
            print(f"Skipping file with only header row.")
            continue
        pd_list.append(df)  # Append the DataFrame to the list
    except pd.errors.EmptyDataError:
        print(f"No columns to parse in file.")
    except Exception as e:
        print(f"Error reading file:{e}")

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(pd_list, ignore_index=True)

print(len(combined_df))

combined_df.head()

Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only header row.
Skipping file with only head

Unnamed: 0,Method Code
0,public static void main(String[] args) {\n...
1,SessionRegistry sessionRegistry() {\n ...
2,public ApplicationRunner applicationRunner...
3,public ModelAndView login(){\n Mode...
4,public ModelAndView index(){\n Mode...


In [18]:
from pygments.lexers.jvm import JavaLexer
from pygments.lexers import get_lexer_by_name
from pygments.token import Token

def remove_boilerplate_methods(data):
    """Remove boilerplate methods like setters and getters."""
    boilerplate_patterns = [
        r"\bset[A-Z][a-zA-Z0-9_]*\(.*\)\s*{",  # Setter methods
        r"\bget[A-Z][a-zA-Z0-9_]*\(.*\)\s*{",  # Getter methods
    ]
    boilerplate_regex = re.compile("|".join(boilerplate_patterns))
    data = data[~data["Method Code"].apply(lambda x: bool(boilerplate_regex.search(x)))]
    return data


def remove_comments_from_dataframe(df: pd.DataFrame, method_column: str, language: str) -> pd.DataFrame:
    """
    Removes comments from Java methods in a DataFrame and adds a new column with cleaned methods.

    Args:
        df (pd.DataFrame): DataFrame containing the methods.
        method_column (str): Column name containing the raw Java methods.
        language (str): Programming language for the lexer (e.g., 'java').

    Returns:
        pd.DataFrame: Updated DataFrame with a new column 'Java Method No Comments'.
    """
    # Define a function to remove comments from a single method
    def remove_comments(code):
        lexer = get_lexer_by_name(language)
        tokens = lexer.get_tokens(code)
        # Filter out comments using a lambda function
        clean_code = ''.join(token[1] for token in tokens if not (lambda t: t[0] in Token.Comment)(token))


        return clean_code

    # Apply the function to the specified column and add a new column with the results
    df["Method Java No Comments"] = df[method_column].apply(remove_comments)
    return df

In [19]:
#Data preprocessing

print("Initial dataset size:", len(combined_df))
combined_df = remove_duplicates(combined_df)
print("After removing duplicates:", len(combined_df))

combined_df = filter_ascii_methods(combined_df)
print("After filtering ASCII methods:", len(combined_df))

combined_df = remove_outliers(combined_df)
print("After removing outliers:", len(combined_df))

combined_df = remove_boilerplate_methods(combined_df)
print("After removing boilerplate methods:", len(combined_df))

combined_df = remove_comments_from_dataframe(combined_df, "Method Code", "Java")
print("After cleaning comments:", len(combined_df))

combined_df.head()

Initial dataset size: 2433961
After removing duplicates: 834555
After filtering ASCII methods: 782659
After removing outliers: 704915
After removing boilerplate methods: 504358
After cleaning comments: 504358


Unnamed: 0,Method Code,Method Java No Comments
0,public static void main(String[] args) {\n...,public static void main(String[] args) {\n...
1,SessionRegistry sessionRegistry() {\n ...,SessionRegistry sessionRegistry() {\n ...
2,public ApplicationRunner applicationRunner...,public ApplicationRunner applicationRunner...
6,public ModelAndView logging() {\n r...,public ModelAndView logging() {\n r...
8,public Object around(ProceedingJoinPoint p...,public Object around(ProceedingJoinPoint p...


In [20]:
method_df = combined_df[["Method Java No Comments"]]
method_df.head()

Unnamed: 0,Method Java No Comments
0,public static void main(String[] args) {\n...
1,SessionRegistry sessionRegistry() {\n ...
2,public ApplicationRunner applicationRunner...
6,public ModelAndView logging() {\n r...
8,public Object around(ProceedingJoinPoint p...


In [21]:
lexer = JavaLexer()

def tokenize_java_method(code):
    lexer = JavaLexer()
    tokens = [t[1] for t in lexer.get_tokens(code) if t[0] not in Token.Text]  # Remove unnecessary whitespace tokens
    return tokens

# Apply tokenization to the DataFrame
method_df["Tokens"] = method_df["Method Java No Comments"].apply(tokenize_java_method)

# Print the DataFrame to see the results
method_df.head()

Unnamed: 0,Method Java No Comments,Tokens
0,public static void main(String[] args) {\n...,"[public, static, void, main, (, String, [, ], ..."
1,SessionRegistry sessionRegistry() {\n ...,"[SessionRegistry, sessionRegistry, (, ), {, re..."
2,public ApplicationRunner applicationRunner...,"[public, ApplicationRunner, applicationRunner,..."
6,public ModelAndView logging() {\n r...,"[public, ModelAndView, logging, (, ), {, retur..."
8,public Object around(ProceedingJoinPoint p...,"[public, Object, around, (, ProceedingJoinPoin..."


In [22]:
#Saving results to csv

method_df.to_csv('/content/drive/My Drive/Gen AI for Soft Dev/Tokenized/mydata.csv', index = False)