In [None]:
import json
import math
import os
import pickle

import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForCausalLM


def get_perplexity(text, stride):
    """
    Computes the sliding-window perplexity of a text under the global MODEL.

    The text is tokenized with the global TOKENIZER and scored in windows of at
    most ``MODEL.config.n_positions`` tokens that start every ``stride`` tokens.
    Within a window, tokens already covered by an earlier window are masked with
    -100 so they serve as context only. The per-window mean losses are combined
    weighted by the number of tokens each window actually scored, so a short
    final window does not count as much as a full one.

    :param text: Text to score
    :param stride: Number of tokens between the starts of consecutive windows
    :return: 0-dim tensor with the perplexity (NaN if no token could be scored,
        e.g. for a single-token text)
    :raises ValueError: If the text tokenizes to zero tokens
    """
    encodings = TOKENIZER(text, return_tensors="pt").to(DEVICE)
    max_length = MODEL.config.n_positions
    seq_len = encodings.input_ids.size(1)
    if seq_len == 0:
        raise ValueError("Cannot compute perplexity: the text produced no tokens.")

    nll_sum = 0.0
    n_scored_total = 0
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        # Hugging Face causal-LM heads shift the labels by one position, so the
        # first label of a window never contributes to the loss.
        n_scored = int((target_ids[:, 1:] != -100).sum())
        if n_scored > 0:
            with torch.no_grad():
                outputs = MODEL(input_ids, labels=target_ids)
            # outputs.loss is a mean over the scored tokens; undo the mean so
            # windows of different sizes can be combined correctly.
            nll_sum = nll_sum + outputs.loss * n_scored
            n_scored_total += n_scored
        # else: the window has nothing to score (e.g. a trailing single token);
        # running the model would only yield a NaN loss.

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored_total == 0:
        return torch.tensor(float("nan"), device=DEVICE)
    return torch.exp(nll_sum / n_scored_total)

def save_github_code_eval_subset(language, license, size=1000, shuffle=True, seed=None):
    """
    Saves a subset of the github code dataset to be used for evaluation.

    The subset is streamed from "codeparrot/github-code", reduced to the 'code'
    field of each item and pickled to
    ./data/evaluation_data_{language}_{license}_{size}.pkl.

    :param language: Programming language of the code
    :param license: License of the code
    :param size: Size of the subset
    :param shuffle: Whether to shuffle the dataset before taking the subset
    :param seed: Optional seed forwarded to the shuffle so the subset can be
        reproduced; None keeps the library's default seeding
    :return: None
    """
    eval_data_name = f"./data/evaluation_data_{language}_{license}_{size}.pkl"
    # Create the output directory before streaming so that a missing ./data
    # directory cannot fail the write after all samples were downloaded.
    os.makedirs(os.path.dirname(eval_data_name), exist_ok=True)

    ds = load_dataset("codeparrot/github-code", languages=[language], licenses=[license], streaming=True, split="train")
    if shuffle:
        ds = ds.shuffle(seed=seed, buffer_size=size)
    ds = ds.take(size)

    evaluation_dataset = [item['code'] for item in tqdm(ds, total=size)]

    with open(eval_data_name, 'wb') as f:
        pickle.dump(evaluation_dataset, f)


def load_saved_github_code_eval_subset(language, license, size=1000):
    """
    Returns the cached evaluation subset, creating the cache first if it is missing.

    The cache lives at ./data/evaluation_data_{language}_{license}_{size}.pkl.
    When that file does not exist, save_github_code_eval_subset is called to
    produce it before it is read.

    :param language: Programming language of the code
    :param license: License of the code
    :param size: Size of the subset
    :return: The object unpickled from the cache file (a list of code strings
        when the file was written by save_github_code_eval_subset)
    """
    cache_path = f"./data/evaluation_data_{language}_{license}_{size}.pkl"

    cache_missing = not os.path.exists(cache_path)
    if cache_missing:
        print(f"Saved evaluation data not found. Saving now...")
        save_github_code_eval_subset(language, license, size)

    # NOTE: pickle must only be used on files this notebook produced itself.
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)


def save_results_dict(results_dict, language, license, stride):
    """
    Saves a results dictionary to json files under ./results.

    Two files are written: the full dictionary, and a "short" copy without the
    'text' entry to save space. The passed dictionary itself is left unchanged.

    :param results_dict: Dictionary containing the results
    :param language: Programming language of the code
    :param license: License of the code
    :param stride: Stride used for the evaluation (part of the file names)
    :return: None
    """
    results_name = f"./results/results_{language}_{license}_{stride}.json"
    short_results_name = f"./results/results_{language}_{license}_{stride}_short.json"

    # Make sure the output directory exists so finished results are not lost.
    os.makedirs(os.path.dirname(results_name), exist_ok=True)

    with open(results_name, 'w') as f:
        json.dump(results_dict, f)
    print(f"Saved results to {results_name}")

    # Drop the text in a copy to save space, instead of deleting it from the
    # caller's dictionary.
    short_results = {key: value for key, value in results_dict.items() if key != 'text'}
    with open(short_results_name, 'w') as f:
        json.dump(short_results, f)
    print(f"Saved short results to {short_results_name}")


def main(languages, licenses, size, stride, n_samples=None):
    """
    Evaluates perplexity on the cached code subset of every language/license pair.

    For each pair the evaluation data is loaded, up to ``n_samples`` samples are
    scored with get_perplexity, and the collected results are written with
    save_results_dict. Samples that are empty after dropping their first 500
    characters, longer than 5000 characters, or whose perplexity is not finite
    are skipped.

    :param languages: Programming languages to evaluate
    :param licenses: Licenses to evaluate
    :param size: Size of the cached subset to load for each pair
    :param stride: Stride passed to get_perplexity and used in the result file names
    :param n_samples: Maximum number of samples to evaluate per pair; None
        evaluates every loaded sample
    :return: None
    """
    for language in languages:
        for license in licenses:
            print('='*50)
            print(f"Language: {language}, License: {license}")

            # Load evaluation data
            eval_data = load_saved_github_code_eval_subset(language, license, size)
            print(f"Loaded {len(eval_data)} samples for evaluation.")

            # Initialize results dictionary
            results = {'text': [], 'perplexity': []}

            # Resolve the sample count per dataset without overwriting the
            # n_samples argument, and never index past the loaded data.
            n_eval = len(eval_data) if n_samples is None else min(n_samples, len(eval_data))
            ppl = 0
            tbar = tqdm(range(n_eval), total=n_eval, unit='Sample', position=0, leave=True, desc=f'Current perplexity: {ppl:.2f}')

            # Evaluate
            for i in tbar:
                # Skip the first 500 characters
                text = eval_data[i][500:]

                if not text or len(text) > 5000:
                    # Skip samples with nothing left to score and long samples
                    continue

                ppl = get_perplexity(text, stride).item()
                if not math.isfinite(ppl):
                    # A NaN/inf value would poison the average below
                    continue

                # NOTE(review): the stored text drops its first 100 characters,
                # unlike the text that was scored - confirm this is intended.
                results['text'].append(text[100:])
                results['perplexity'].append(ppl)
                tbar.set_description(f'Current perplexity: {ppl:.2f}')
                tbar.refresh()

            # Save results
            if results['perplexity']:
                results['avg_perplexity'] = sum(results['perplexity'])/len(results['perplexity'])
                average_message = f"Average perplexity: {results['avg_perplexity']:.2f}"
            else:
                # Every sample was skipped; there is no average to compute
                results['avg_perplexity'] = None
                average_message = "Average perplexity: n/a (no samples were evaluated)"
            save_results_dict(results, language, license, stride)
            print(average_message)
            print('='*50)


In [None]:
# Shared evaluation state read by get_perplexity: the device, the model and its tokenizer.
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "Salesforce/codegen-350M-mono"
# AutoModelWithLMHead is deprecated in transformers; load the model through the
# causal-LM auto class instead.
MODEL = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
TOKENIZER = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Evaluation settings: which language/license subsets to score, and how.
languages = ['Python', 'Java']
licenses = ['mit', 'apache-2.0']
size = 10000       # size of the cached subset loaded for each language/license pair
n_samples = 3000   # number of cached samples main() iterates over per pair
stride = 1024      # step between window starts in get_perplexity

main(
    languages=languages,
    licenses=licenses,
    size=size,
    stride=stride,
    n_samples=n_samples,
)