In [57]:
from transformers import pipelines
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import datasets
from transformers import RobertaTokenizer,RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets
import re


In [35]:

def load_and_split_data(language, test_size=0.25, validation_size=0.1, seed=42):
    """Load one CodeSearchNet language and re-split it into train/validation/test.

    The dataset's ``train`` and ``validation`` splits are concatenated into a
    single pool, which is then re-partitioned: ``test_size`` of the pool is
    held out as the test set, and ``validation_size`` of what remains becomes
    the validation set.  The dataset's own ``test`` split is not part of the
    result.

    Args:
        language: Configuration name passed to ``load_dataset`` (this notebook
            uses ``"python"`` and ``"javascript"``).
        test_size: Fraction of the pooled data held out for testing.
        validation_size: Fraction of the remaining training data held out for
            validation.
        seed: Seed passed to both ``train_test_split`` calls so the partition
            is reproducible.

    Returns:
        A ``(train_dataset, validation_dataset, test_dataset)`` tuple.
    """
    # Only the two splits that feed the pool are requested; the original
    # 'test' split was loaded but never used.
    train_split, validation_split = load_dataset(
        'code_search_net',
        language,
        split=['train', 'validation'],
        trust_remote_code=True,
    )
    pool = concatenate_datasets([train_split, validation_split])

    # First cut: hold out the test set.
    holdout = pool.train_test_split(test_size=test_size, seed=seed)
    test_dataset = holdout['test']

    # Second cut: carve the validation set out of the remaining training data.
    train_validation = holdout['train'].train_test_split(test_size=validation_size, seed=seed)
    train_dataset = train_validation['train']
    validation_dataset = train_validation['test']

    return train_dataset, validation_dataset, test_dataset


In [73]:
# Tokenizers already loaded in this kernel, keyed by checkpoint name, so that
# tokenizing several splits does not re-instantiate the same tokenizer.
_TOKENIZER_CACHE = {}


#tokenize the datasets return tensorflow tensors
def preprocess_tokenizeation(dataset, tokenizer=None, max_length=512, model_name="microsoft/codebert-base"):
    """Tokenize the ``func_code_string`` column of ``dataset``.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches
            contain a ``'func_code_string'`` entry.
        tokenizer: Optional ready-made tokenizer.  When omitted, the tokenizer
            for ``model_name`` is loaded once and reused on later calls.
        max_length: Length every sequence is truncated/padded to.
        model_name: Checkpoint name used when ``tokenizer`` is not supplied.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    if tokenizer is None:
        if model_name not in _TOKENIZER_CACHE:
            _TOKENIZER_CACHE[model_name] = RobertaTokenizer.from_pretrained(model_name)
        tokenizer = _TOKENIZER_CACHE[model_name]

    def preprocess_function(inputs):
        # NOTE(review): map() stores the returned values back into the
        # dataset, so return_tensors="tf" may be unnecessary here -- confirm
        # before changing it.
        return tokenizer(
            inputs['func_code_string'],
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors="tf",
        )

    return dataset.map(preprocess_function, batched=True)

#function converts tokenized data to a tf dataset for performance
def to_tf(tokenized_data, batch_size=16, shuffle=True, verbose=True, preview_rows=5):
    """Convert a tokenized dataset into a batched TensorFlow dataset.

    Args:
        tokenized_data: Dataset exposing ``column_names``, iteration over
            example dicts, and ``to_tf_dataset``.
        batch_size: Batch size forwarded to ``to_tf_dataset``.
        shuffle: Forwarded to ``to_tf_dataset``.  Defaults to ``True``.
        verbose: When true, print the column lists and a short preview of the
            first examples.
        preview_rows: Maximum number of examples printed when ``verbose``.

    Returns:
        The value returned by ``tokenized_data.to_tf_dataset``.
    """
    # Model inputs; the label column is included only when the dataset has one.
    columns = ["input_ids", "attention_mask"]
    if "label" in tokenized_data.column_names:
        columns.append("label")

    if verbose:
        print(f"Columns in dataset: {tokenized_data.column_names}")
        print(f"Columns to use: {columns}")

        # zip() stops as soon as the range is exhausted, so only the previewed
        # rows are read instead of walking the whole dataset.
        for i, example in zip(range(preview_rows), tokenized_data):
            print(f"Example {i}: {example}")

    def collate_fn(features):
        """Gather each selected column of a batch into a list, tolerating a missing 'label'."""
        collated = {}
        for column in columns:
            if column in features[0]:
                collated[column] = [feature[column] for feature in features]
            elif column == "label":
                collated[column] = [None] * len(features)
            else:
                raise ValueError(f"Missing column {column} in example {features[0]}")
        return collated

    # Convert the dataset to TensorFlow format
    return tokenized_data.to_tf_dataset(
        columns=columns,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=collate_fn
    )

In [37]:
# Load and re-split the Python and JavaScript CodeSearchNet subsets.
# Tokenization and conversion to TensorFlow datasets happen in later cells.
python_train, python_validation,python_test = load_and_split_data("python")
# load_and_split_data returns a (train, validation, test) tuple: keep the tuple
# under its original name and also expose the individual splits.
js_dataset = load_and_split_data('javascript')
js_train, js_validation, js_test = js_dataset



In [46]:
# Preview only the first few rows: converting the whole training split to a
# NumPy array walks every example and floods the cell output.
print(np.asarray(python_train.select(range(min(3, len(python_train))))))
#remove all comments from the code 


[{'repository_name': 'gwastro/pycbc', 'func_path_in_repository': 'pycbc/types/frequencyseries.py', 'func_name': 'FrequencySeries.match', 'whole_func_string': 'def match(self, other, psd=None,\n              low_frequency_cutoff=None, high_frequency_cutoff=None):\n        """ Return the match between the two TimeSeries or FrequencySeries.\n\n        Return the match between two waveforms. This is equivelant to the overlap\n        maximized over time and phase. By default, the other vector will be\n        resized to match self. Beware, this may remove high frequency content or the\n        end of the vector.\n\n        Parameters\n        ----------\n        other : TimeSeries or FrequencySeries\n            The input vector containing a waveform.\n        psd : Frequency Series\n            A power spectral density to weight the overlap.\n        low_frequency_cutoff : {None, float}, optional\n            The frequency to begin the match.\n        high_frequency_cutoff : {None, float}

In [56]:


# Building blocks for the comment scanner.  String literals are matched before
# comments so that comment markers inside them ("#fff", "http://...") survive.
_TRIPLE_QUOTED = r'"""[\s\S]*?"""' + "|" + r"'''[\s\S]*?'''"
_DOUBLE_QUOTED = r'"(?:\\[\s\S]|[^"\\\n])*"'
_SINGLE_QUOTED = r"'(?:\\[\s\S]|[^'\\\n])*'"
_BACKTICK_QUOTED = r"`(?:\\[\s\S]|[^`\\])*`"
_HASH_COMMENT = r"#[^\n]*"
_SLASH_COMMENT = r"//[^\n]*"
_BLOCK_COMMENT = r"/\*[\s\S]*?\*/"


def _comment_scanner(string_patterns, comment_patterns):
    """Compile a regex whose ``keep`` group is a string literal and ``drop`` group a comment."""
    return re.compile(
        "(?P<keep>" + "|".join(string_patterns) + ")"
        + "|(?P<drop>" + "|".join(comment_patterns) + ")"
    )


_COMMENT_SCANNERS = {
    # Language unknown: recognise every comment style the original handled.
    None: _comment_scanner(
        [_TRIPLE_QUOTED, _DOUBLE_QUOTED, _SINGLE_QUOTED, _BACKTICK_QUOTED],
        [_HASH_COMMENT, _SLASH_COMMENT, _BLOCK_COMMENT],
    ),
    # Python: only '#' starts a comment, so '//' (floor division) is untouched.
    "python": _comment_scanner(
        [_TRIPLE_QUOTED, _DOUBLE_QUOTED, _SINGLE_QUOTED],
        [_HASH_COMMENT],
    ),
    # JavaScript: '//' and '/* */' are comments, '#' is not.
    "javascript": _comment_scanner(
        [_DOUBLE_QUOTED, _SINGLE_QUOTED, _BACKTICK_QUOTED],
        [_SLASH_COMMENT, _BLOCK_COMMENT],
    ),
}

_LANGUAGE_ALIASES = {"py": "python", "js": "javascript"}


def remove_comments(code, language=None):
    """Strip comments from source code while leaving string literals intact.

    Args:
        code: Source text to clean.
        language: ``"python"`` (removes ``#`` comments), ``"javascript"``
            (removes ``//`` and ``/* */`` comments), or ``None`` to remove all
            three styles as the original implementation did.  With ``None`` a
            Python floor division ``//`` is still treated as a comment start,
            so pass the language whenever it is known.

    Returns:
        ``code`` with comment text removed.  The newline that ends a
        single-line comment is kept.  Python docstrings are string literals
        and are not removed.

    Raises:
        ValueError: If ``language`` is not one of the supported values.
    """
    key = language
    if isinstance(language, str):
        key = language.strip().lower()
        key = _LANGUAGE_ALIASES.get(key, key)

    try:
        scanner = _COMMENT_SCANNERS[key]
    except (KeyError, TypeError):
        raise ValueError(f"Unsupported language for comment removal: {language!r}")

    # A match is either a string literal (kept verbatim) or a comment (dropped).
    return scanner.sub(lambda match: match.group("keep") or "", code)


0    {'repository_name': 'gwastro/pycbc', 'func_pat...
Name: 0, dtype: object


In [64]:
# Tokenize the three Python splits produced by load_and_split_data.
python_train_tok = preprocess_tokenizeation(python_train)
python_valid_tok = preprocess_tokenizeation(python_validation)
python_test_tok = preprocess_tokenizeation(python_test)
# JavaScript tokenization is disabled.  NOTE(review): js_dataset holds the whole
# return value of load_and_split_data (a tuple of splits), so it would need
# unpacking before the call below could work.
#js_dataset_tokenized = preprocess_tokenizeation(js_dataset)


#js_Dset_tok_tf = to_tf(js_dataset_tokenized)


In [74]:
# Convert each tokenized Python split into a batched TensorFlow dataset.
# NOTE(review): these calls build every dataset with shuffle=True, so the
# validation and test datasets are shuffled too -- confirm that is intended.
python_train_tf= to_tf(python_train_tok)
python_valid_tf= to_tf(python_valid_tok)
python_test_tf= to_tf(python_test_tok)


Columns in dataset: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'input_ids', 'attention_mask']
Columns to use: ['input_ids', 'attention_mask']
Example 0: {'repository_name': 'gwastro/pycbc', 'func_path_in_repository': 'pycbc/types/frequencyseries.py', 'func_name': 'FrequencySeries.match', 'whole_func_string': 'def match(self, other, psd=None,\n              low_frequency_cutoff=None, high_frequency_cutoff=None):\n        """ Return the match between the two TimeSeries or FrequencySeries.\n\n        Return the match between two waveforms. This is equivelant to the overlap\n        maximized over time and phase. By default, the other vector will be\n        resized to match self. Beware, this may remove high frequency content or the\n        end of the vector.\n\n        Parameters\n        ----------\n        ot

In [None]:
#the actual fine-tuning of the Roberta model is here
# Loads the 'microsoft/codebert-base' checkpoint into a sequence-classification model.
# NOTE(review): the splits above were converted with to_tf_dataset, while this
# class is imported without a TF prefix -- presumably the PyTorch variant;
# confirm which framework the training loop will use.
# NOTE(review): the "Columns to use" output lists only input_ids and
# attention_mask (no label column) -- confirm where classification labels come from.
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base')
