## General Experiment Structure
* Using different models
* Run on same sample for all models
* Evaluate outputs after

#### Pseudocode
for every model{
    generate 10 responses for all samples(for Acc@10)
    store samples properly
    
}

for each model's generated samples{
    calculate metrics (CSR, CodeBLEU, EM, Acc@10, Edit Similarity, Line Coverage, Branch Coverage)
}

## Importing Libraries

In [1]:
import argparse
import multiprocessing
import os
import pandas as pd
import time
import torch
import transformers
#from accelerate import PartialState
from datasets import Dataset, load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging,
    set_seed,
    pipeline,
)
from trl import SFTTrainer


  from .autonotebook import tqdm as notebook_tqdm


## Testing DeepSeek Loading

In [2]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")

# config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
lora_config = LoraConfig(
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    task_type="CAUSAL_LM",
)
# load model and dataset
token = os.environ.get("HF_TOKEN", None)
# print out selected device
#print(PartialState().process_index)
#Check for GPU availability
# print(torch.cuda.is_available())
# if torch.cuda.is_available():
#     print("GPU")
#     with torch.device("cuda:2"):
#         model = AutoModelForCausalLM.from_pretrained(
#             "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
#             quantization_config=bnb_config,
#             attention_dropout=0.1,
#             device_map=None, #Prevents auto device mapping - not really
#             trust_remote_code=True,
#         )
#         model.to("cuda:2")
# else:
#     #Handle no GPU availiability
#     print("No GPU")
#     with torch.device("cpu"):
#         model = AutoModelForCausalLM.from_pretrained(
#             "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
#             quantization_config=bnb_config,
#             attention_dropout=0.1,
#             device_map=None, #Prevents auto device mapping - not really
#             trust_remote_code=True,
#         )
#         model.to("cpu")

model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
    quantization_config=bnb_config,
    attention_dropout=0.1,
    device_map=None, #Prevents auto device mapping - not really
    trust_remote_code=True,
)
#if torch.cuda.device_count() > 1:
#    print("Using DataParallel for multiple GPUs")
#    model = torch.nn.DataParallel(model)

#model.to(device)
# Freeze all except embeddings and first layer
#for name, param in model.named_parameters():
#    if "model.embed_tokens" not in name and "model.layers.0" not in name:
#        param.requires_grad = False
#    else:
#        param.requries_grad = True

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:55<00:00, 13.82s/it]


## Testing Dataset Loading

In [3]:
# print(args.dataset_name)
#     print(args.subset)
#     print(args.split)
#     print(token)
#     print(args.num_proc if args.num_proc else multiprocessing.cpu_count())
    
#     '''Original file from hugging face hub
#     data = load_dataset(
#         args.dataset_name,
#         data_files=args.subset,
#         split=args.split,
#         token=token,
#         num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
#     )'''

#     data = load_dataset(
#         'parquet', 
#         data_files=args.subset, 
#         split=args.split,
#         num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
#     )

#     print('Dataset type: ', type(data))

In [4]:
import os
import json
import pandas as pd
from glob import glob
import numpy

# Define the base directories
base_dirs = ['train', 'eval', 'test']

for i in range(len(base_dirs)):
    base_dirs[i] = '../data/methods2test_data/' + base_dirs[i]
# List to collect all data (rows)
data = []

total_exmaples = 0

is_broken = False
base_dir = '../data/methods2test_data/train'
# Iterate over all subdirectories (examples) in the base directory
for example_dir in os.listdir(base_dir):
    example_path = os.path.join(base_dir, example_dir)
    
    # Ensure we're dealing with directories (i.e., each example is its own directory)
    if os.path.isdir(example_path):
        # Use glob to get all JSON files in the example directory
        json_files = glob(os.path.join(example_path, '*.json'))
        
        # Iterate over the JSON files
        for json_file in json_files:
            # Read the JSON data
            with open(json_file, 'r') as f:
                json_data = json.load(f)
            
            # Add the JSON data to the data list (can modify if additional info is needed)
            data.append(json_data)
            total_exmaples+=1
            if(total_exmaples>=100):
                print("breaking")
                is_broken = True
                break
    if(is_broken):
        break
# Convert the collected data into a pandas DataFrame
df = pd.DataFrame(data)

# Optional: Inspect the first few rows
#print(df.head())

breaking


In [5]:
focal_methods = df['focal_method']
code_bodies = pd.DataFrame({'code': focal_methods.apply(lambda x: x['body'])})

test_cases = df["test_case"]
test_code_bodies = pd.DataFrame({'tests': test_cases.apply(lambda x: x['body'])})

code_test_df = pd.concat([code_bodies, test_code_bodies], axis=1)

code_test_df['prompted_code'] = "Here is a method implementation in Java:\n\n" + code_test_df['code'] + "\n\nWrite a full test class with test cases to validate the method defined above."

In [6]:
dataset = Dataset.from_pandas(code_test_df)


## Testing Experiment (Input and Generation)

In [14]:
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

def generate_responses_from_df(
    df: pd.DataFrame,
    text_column: str,
    text_generator,         # We pass the pipeline directly
    max_length: int = 100,
    output_csv: str = "generated_responses.csv",
    timing_file: str = "timing.txt"
):
    """
    Iterates through all rows in `df`, uses a text-generation pipeline to
    generate responses for each row's text, and stores the results in a CSV.
    Also logs total time to a text file.
    """
    dataset = Dataset.from_pandas(df)
    # 1) Start timing
    print("Starting Experiment")
    start_time = time.time()

    # 2) Generate responses

    def generate_text(batch, text_generator=None):
        prompts = batch["prompted_code"]
        outputs = text_generator(batch["prompted_code"])
        # If the pipeline returns a list of dict, handle that
        generated_texts = [o[0]["generated_text"] for o in outputs]
        return {"generated_text": generated_texts}

    generated_dataset = dataset.map(generate_text, 
                                    fn_kwargs= {"text_generator": text_generator}, 
                                    desc="Processing dataset", 
                                    batched=True, 
                                    batch_size=8,
                                    #num_proc=4,
    )

    # 3) End timing
    end_time = time.time()
    total_time = end_time - start_time

    # 4) Store and save results
    df = generated_dataset.to_pandas()
    df.to_csv(output_csv, index=False)

    with open(timing_file, "w") as f:
        f.write(f"Total generation time (seconds): {total_time}\n")

    print(f"Saved responses to {output_csv}. Total time: {total_time:.2f} seconds")

if __name__ == "__main__":
    # Use the pipeline you created earlier
    generate_responses_from_df(
        df=code_test_df,
        text_column="prompted_code",
        text_generator=text_generator,  # pipeline we created
        max_length=1000,  
        output_csv="my_generated_responses.csv",
        timing_file="my_timing_log.txt"
    )



Device set to use cuda:0
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Starting Experiment


Processing dataset: 100%|██████████| 100/100 [07:02<00:00,  4.22s/ examples]


Saved responses to my_generated_responses.csv. Total time: 706.99 seconds


In [None]:
import time
import pandas as pd
from transformers import pipeline

def generate_responses_from_df(
    df: pd.DataFrame,
    text_column: str,
    model_name: str,
    device: int = -1,
    max_length: int = 100,
    output_csv: str = "generated_responses.csv",
    timing_file: str = "timing.txt"
):
    """
    1. Iterates through all rows in `df`.
    2. Uses a text-generation pipeline to generate responses for each row's text.
    3. Stores the responses and saves them to `output_csv`.
    4. Logs total time taken to `timing_file`.
    """

    # 1) Initialize text-generation pipeline
    text_generator = pipeline(
        "text-generation",
        model=model_name,
        device=device  # -1 = CPU; 0 or other ints = specific GPU
    )

    # 2) Start timing
    start_time = time.time()

    # 3) Iterate through the DataFrame, generate responses
    responses = []
    for i, row in df.iterrows():
        prompt_text = row[text_column]

        # Generate text (You can tune these kwargs—max_length, etc.)
        output = text_generator(prompt_text, max_length=max_length, num_return_sequences=1)
        
        # output is a list of dicts, e.g. [{'generated_text': "..."}]
        generated_text = output[0]["generated_text"]
        responses.append(generated_text)

    # End timing
    end_time = time.time()
    total_time = end_time - start_time

    # 4) Add responses to DataFrame and save
    df["response"] = responses
    df.to_csv(output_csv, index=False)

    # Save timing info
    with open(timing_file, "w") as f:
        f.write(f"Total generation time (seconds): {total_time}\n")

    print(f"Saved responses to {output_csv}.")
    print(f"Total time: {total_time:.2f} seconds")

# ------------------------
# Example usage
# ------------------------
if __name__ == "__main__":
    # Suppose your CSV has a column "prompt" with text to be processed
    df_input = pd.DataFrame({
        "prompt": [
            "Explain why the sky is blue.",
            "Write a short poem about cats.",
            "Summarize the benefits of exercise."
        ]
    })

    generate_responses_from_df(
        df=df_input,
        text_column="prompt",
        model_name="gpt2",      # or any other HF model, e.g. "gpt2-medium", "gpt-neo-125M", etc.
        device=-1,              # use CPU; set to 0 for first GPU, 1 for second GPU, etc.
        max_length=50,
        output_csv="my_generated_responses.csv",
        timing_file="my_timing_log.txt"
    )


## Testing Model Training (In case of potential use)

In [None]:
set_seed(0)
os.makedirs("../models/experiment_testing", exist_ok=True)

logging.set_verbosity_error()

# setup the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        warmup_steps=100,
        max_steps=10000,
        #max_seq_length=512,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        bf16=False,################Originally True
        logging_strategy="steps",
        logging_steps=10,
        output_dir="../models/experiment_testing",
        optim="paged_adamw_8bit",
        seed=0,
        run_name=f"train-deepseekcoder-v2-methods2test",
        report_to="wandb",
    ),
    peft_config=lora_config,
    #dataset_text_field="prompted_code",
)
# launch
print("Training...")
trainer.train()
print("Saving the last checkpoint of the model")
model.save_pretrained(os.path.join(args.output_dir, args.model_id + "_" + args.subset + "_final_checkpoint/"))
'''if args.push_to_hub:
    trainer.push_to_hub("Upload model", token=os.getenv("HUGGINGFACE_TOKEN")'''
print("Training Done! 💥")