# Project Command Reference
A curated list of all key commands used in the project, organized step-by-step.

## 1. Environment Setup

In [None]:
# ## 1. Environment Setup
# Prepend src/ folder to PYTHONPATH.
# NOTE: PYTHONPATH is read when an interpreter starts, so this affects child
# processes launched from the notebook (e.g. `!python ...`), not the running kernel.
import os

# Build the value with os.pathsep and skip empty / duplicate entries so that
# re-running the cell never leaves a dangling separator or stacks src/ twice.
src_dir = os.path.join(os.getcwd(), 'src')
inherited_paths = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry and entry != src_dir
]
os.environ['PYTHONPATH'] = os.pathsep.join([src_dir] + inherited_paths)

In [None]:
# ## 1. Environment Setup
# NOTE: these are terminal commands, not Python; type them in a shell on the cluster.
# Request a GPU interactive session on Wulver cluster
# (`--pty bash` opens an interactive shell, so the commands below are entered inside it).
# NOTE(review): `-n 1` together with `--ntasks-per-node=2` looks inconsistent — confirm the intended task count.
srun -p gpu -n 1 --ntasks-per-node=2 --qos=low --account=phan --mem-per-cpu=64G --gres=gpu:1 --time=72:00:00 --pty bash
# Load Python module for this session
module load foss/2022b Python/3.10.8
# Navigate to your project directory
cd /project/phan/ks2345 


## 2. vLLM Server Launch

In [None]:
# Terminal command (not Python): start vLLM's OpenAI-compatible API server
# for the model stored at /project/phan/codellama/StarCoder, listening on port 8000.
# NOTE(review): the Jupyter Lab job in section 2.1 also uses port 8000 — confirm
# the two never share a node, or move one of them to a different port.
python3 -m vllm.entrypoints.openai.api_server \
  --model /project/phan/codellama/StarCoder \
  --port 8000

## 2.1 Jupyter Notebook on Wulver
Create and submit a SLURM job script to launch Jupyter Lab on the GPU node.

In [None]:
%%writefile run_jupyter.sh
#!/bin/bash
# ## 2.1 Jupyter Notebook on Wulver
# SLURM batch script for Jupyter Lab on Wulver.
# Running this cell saves the script as run_jupyter.sh (the cell magic must be the
# first line of the cell, and sbatch expects the script to start with a shebang).
# Submit it afterwards with `sbatch run_jupyter.sh`.
#SBATCH -J jupyter_lab              # Job name
#SBATCH -o output.out              # Standard output log
#SBATCH -p gpu                     # GPU queue
#SBATCH --gres=gpu:1               # Request 1 GPU
#SBATCH -c 16                      # Request 16 CPUs
#SBATCH --mem=64G                  # Request 64GB memory
#SBATCH --qos=low                  # Quality of Service
#SBATCH --account=phan             # NJIT account
#SBATCH --time=2-23:59:00          # Max run time (2 days, 23 h, 59 min)
#SBATCH --mail-type=BEGIN,END,FAIL # Email notifications
#SBATCH --mail-user=ks2345@njit.edu  # Replace with your email
module load foss/2022b Python/3.10.8
# Activate your virtual environment (adjust path as needed)
source ~/myenv/bin/activate
# Launch Jupyter Lab
jupyter-lab --ip=0.0.0.0 --port=8000 --no-browser
# After job starts, forward port locally:
# ssh -NL 8000:$(hostname):8000 <ucid>@wulver.njit.edu


In [None]:
# ## 2.1 Jupyter Notebook on Wulver
# Forward local port 8000 to localhost:8000 as seen from the SSH host.
# -N runs no remote command, so ssh stays in the foreground and this cell keeps
# running until the tunnel is interrupted.
# NOTE(review): the job-script comment in this section uses <ucid>@wulver.njit.edu and the
# compute node's hostname, while this uses login02.tartan.njit.edu and localhost — confirm which is current.
!ssh -NL 8000:localhost:8000 ks2345@login02.tartan.njit.edu

In [None]:
# ## 2.1 Jupyter Notebook on Wulver
# Configure OpenAI-style clients through environment variables: the key is the
# literal string "EMPTY" and the base URL targets port 8000 on this machine.
import os

os.environ.update({
    "OPENAI_API_KEY": "EMPTY",
    "OPENAI_BASE_URL": "http://0.0.0.0:8000/v1/",
})


## 3. Seed Gathering (S→C)

In [None]:
# ## 3. Seed Gathering (S→C)
# THIS CELL IS TO ADD SEED NUMBERS TO THE DATASET OBTAINED FROM STEP 1

from datasets import load_from_disk

# Load the dataset that step 1 saved to disk (absolute cluster path; adjust it
# when working outside /project/phan/ks2345).
ds = load_from_disk("/project/phan/ks2345/java_step1_output/Seed3")


def split_method(example):
    """Split a Java method's source text into its signature and its body.

    Args:
        example: mapping with a "content" key holding the method source.

    Returns:
        dict with "instruction" (everything before the body's opening brace,
        stripped) and "response" (the body starting at that brace, stripped).
        When no opening brace is found (e.g. an abstract method), the whole
        stripped content is the instruction and the response is "".
    """
    content = example["content"]

    # Fast path: ")" immediately followed by "{" — the brace sits right after it.
    paren_brace = content.find("){")
    if paren_brace != -1:
        brace_idx = paren_brace + 1
    else:
        # Fallback: first "{" after the first ")" (covers ") {" and ") throws X {").
        # max(..., 0) avoids searching from index -1 when there is no ")" at all.
        close_paren = content.find(")")
        brace_idx = content.find("{", max(close_paren, 0))

    if brace_idx == -1:
        return {"instruction": content.strip(), "response": ""}

    # Both paths split at the brace, so the body always keeps its "{" and the
    # signature never contains it.
    return {
        "instruction": content[:brace_idx].strip(),
        "response": content[brace_idx:].strip(),
    }

# Apply the split; every original column except "content" is dropped during the
# map, and "content" itself is dropped right after.
mapped = ds.map(split_method, remove_columns=[c for c in ds.column_names if c != "content"])
final = mapped.remove_columns(["content"])

# 1. Add a 'seed' field using the example index
def add_seed(example, idx):
    """Return `example` with its "seed" field set to the row index `idx`."""
    example["seed"] = idx
    return example

# with_indices=True makes map() pass the row index as the second argument.
final_with_seed = final.map(add_seed, with_indices=True)

# Preview the seeded dataset (not `final`) so the output confirms the seed was added
print(final_with_seed[0])


In [None]:
# ## 3. Seed Gathering (S→C)
# 2. Export final_with_seed to seeds.jsonl
final_with_seed.to_json("/project/phan/ks2345/seeds.jsonl")

# 3. Verify the first two lines
!head -n 2 /project/phan/ks2345/seeds.jsonl


In [None]:
# ## 3. Seed Gathering (S→C)
# CODE FOR S --> C STEP
# Runs self_ossinstruct.py in "S->C" mode with --use_vllm_server True, reading
# seeds.jsonl and saving under concepts_dataset/ with the tag "concept_gen".
# NOTE(review): seeds.jsonl is a relative path, while the export cell writes
# /project/phan/ks2345/seeds.jsonl — presumably the working directory is that folder; confirm.

!python self_ossinstruct.py \
  --use_vllm_server True \
  --instruct_mode "S->C" \
  --seed_data_files seeds.jsonl \
  --max_new_data 1151 \
  --model "/project/phan/codellama/StarCoder" \
  --temperature 0.7 \
  --num_fewshots 8 \
  --num_batched_requests 1 \
  --async_micro_batch_size 1 \
  --num_sample_per_request 1 \
  --sleep 1 \
  --delay 1 \
  --tag concept_gen \
  --save_dir concepts_dataset


## 4. Concept→Instruction (C→I)

In [None]:
# ## 4. Concept→Instruction (C→I)
# CODE FOR BUILDING A CLEANED, LANGUAGE-TAGGED DATASET FOR THE C --> I STEP.

from datasets import load_dataset, Dataset

# 1. Load the raw S→C output
ds = load_dataset(
    "json",
    data_files="concepts_dataset/data-concept_gen-*.jsonl",
    split="train"
)

clean = []
for ex in ds:
    concepts = ex.get("concepts", [])
    # skip bad entries
    if not concepts or any(("/" in c or "import " in c or "\n" in c) for c in concepts):
        continue
    clean.append({
        "concepts": concepts,
        "seed": ex["seed"],
        "language": "Java",
        "category": "function implementation",
        "difficulty": "medium"
    })

# 2. Build a new Dataset and export it
ds_clean = Dataset.from_list(clean)
ds_clean.to_json("concepts_for_CI.jsonl")

# 3. Verify you now have valid JSON lines:
!head -n 3 concepts_for_CI.jsonl


In [None]:
# ## 4. Concept→Instruction (C→I)
# CODE FOR C --> I STEP
# Runs self_ossinstruct.py in "C->I" mode with --use_vllm_server True, reading
# concepts_for_CI.jsonl and saving under instructions_dataset/ with the tag "instruction_gen".

!python self_ossinstruct.py \
  --use_vllm_server True \
  --instruct_mode "C->I" \
  --seed_data_files concepts_for_CI.jsonl \
  --max_new_data 1128 \
  --model "/project/phan/codellama/StarCoder" \
  --temperature 0.7 \
  --num_fewshots 2 \
  --num_batched_requests 1 \
  --async_micro_batch_size 1 \
  --num_sample_per_request 1 \
  --sleep 1 \
  --delay 1 \
  --tag instruction_gen \
  --save_dir instructions_dataset

## 5. Instruction→Response (I→R)

In [None]:
# ## 5. Instruction→Response (I→R)
# ADDING SEED NUMBERS

from datasets import load_dataset, Dataset

# Load your C→I output
# NOTE(review): the file name appears to embed a run-specific id and timestamp
# ("9718a-0-20250512_002637"); point it at the file your own C→I run produced.
ds_ins = load_dataset(
    "json",
    data_files="instructions_dataset/data-instruction_gen-c_i-9718a-0-20250512_002637.jsonl",
    split="train"
)

# Add a seed field equal to the example index if missing
def add_seed(example, idx):
    """Fill in "seed" with the row index when the example has no usable seed.

    Args:
        example: mapping for one row; modified in place.
        idx: row index supplied by `map(..., with_indices=True)`.

    Returns:
        The same `example`, with "seed" guaranteed to be non-None.
    """
    # A row can carry the "seed" key with value None (e.g. a column that is only
    # partly populated), so test the value rather than relying on a .get() default.
    if example.get("seed") is None:
        example["seed"] = idx
    return example

ds_ins2 = ds_ins.map(
    add_seed,
    with_indices=True,
    remove_columns=[]  # keep all other fields (input, output)
)

# Export to new JSONL
ds_ins2.to_json("instructions_with_seed.jsonl")

# Verify
!head -n 3 instructions_with_seed.jsonl


In [None]:
# ## 5. Instruction→Response (I→R)
# CODE FOR I --> R STEP
# Runs self_ossinstruct.py in "I->R" mode with --use_vllm_server True, reading
# instructions_with_seed.jsonl and saving under responses_dataset/ with the tag "response_gen".

!python self_ossinstruct.py \
  --use_vllm_server True \
  --instruct_mode "I->R" \
  --seed_data_files instructions_with_seed.jsonl \
  --max_new_data 1128 \
  --model "/project/phan/codellama/StarCoder" \
  --temperature 0.7 \
  --num_fewshots 1 \
  --num_batched_requests 1 \
  --async_micro_batch_size 1 \
  --num_sample_per_request 1 \
  --sleep 1 \
  --delay 1 \
  --tag response_gen \
  --save_dir responses_dataset

## 6. Java Installation (macOS)

In [None]:
%%bash
# ## 6. Java Installation (macOS)
# The %%bash cell magic has to be the first line of the cell, so the section
# comment sits below it.
brew update
brew install openjdk@11

# Set environment (applies to this cell's shell only; the exports do not carry
# over to later cells or to the notebook kernel)
export JAVA_HOME="$(brew --prefix openjdk@11)/libexec/openjdk.jdk/Contents/Home"
export PATH="$JAVA_HOME/bin:$PATH"

# Verify
javac -version
java -version

## 7. Java Installation (Cluster)

In [None]:
%%bash
# ## 7. Java Installation (Cluster)
# The %%bash cell magic has to be the first line of the cell, so the section
# comment sits below it. The module is loaded in this cell's shell only.
module load Java/11.0.16
which javac
javac -version
java -version

## 8. Self-Validation Sampling

In [None]:
# ## 8. Self-Validation Sampling
# SELF VALIDATION CODE, RAN ON 30 SAMPLES 

import json, os, re, subprocess, tempfile

def extract_java_blocks(response):
    """Pull the solution and test Java snippets out of a response string.

    The solution is the first ```java fenced block anywhere in `response`; the
    test code is the first ```java fenced block at or after the first "<tests>"
    marker. Either value is "" when its block is absent.

    Returns:
        (solution_code, test_code), both stripped of surrounding whitespace.
    """
    fence = re.compile(r"```java\s+(.*?)```", re.DOTALL)

    def first_block(text):
        found = fence.search(text)
        return found.group(1).strip() if found else ""

    marker = response.find("<tests>")
    solution_code = first_block(response)
    test_code = first_block(response[marker:]) if marker != -1 else ""
    return solution_code, test_code

def get_public_class_name(code: str) -> str:
    """Return the class name that should name the .java file for `code`.

    Lookup order:
      1. the first `public ... class Name` declaration, allowing modifiers such
         as `final` or `abstract` between `public` and `class`;
      2. otherwise the first class declaration that starts a line
         (a package-private class, which is also runnable by name);
      3. otherwise the default "Main".

    Args:
        code: Java source text.

    Returns:
        The detected class name, or "Main" when no declaration is found.
    """
    modifiers = r'(?:(?:final|abstract|strictfp|static)\s+)*'

    public_decl = re.search(r'public\s+' + modifiers + r'class\s+(\w+)', code)
    if public_decl:
        return public_decl.group(1)

    # Anchored to the start of a line so the word "class" inside a comment or
    # a sentence is not mistaken for a declaration.
    plain_decl = re.search(r'^\s*' + modifiers + r'class\s+(\w+)', code, re.MULTILINE)
    return plain_decl.group(1) if plain_decl else "Main"

def write_java_files(solution_code: str, test_code: str, dir_path: str):
    """Save the solution and test sources as `<ClassName>.java` inside `dir_path`.

    File names come from get_public_class_name(). The solution is written
    first, so if both sources resolve to the same class name the test file
    overwrites the solution file.

    Returns:
        (sol_path, test_path, test_class)
    """
    sol_class = get_public_class_name(solution_code)
    test_class = get_public_class_name(test_code)

    written = []
    for class_name, source in ((sol_class, solution_code), (test_class, test_code)):
        target = os.path.join(dir_path, f"{class_name}.java")
        with open(target, 'w') as handle:
            handle.write(source)
        written.append(target)

    sol_path, test_path = written
    return sol_path, test_path, test_class

def validate_java_response(response: str):
    """Compile and run the Java solution and tests embedded in one response.

    Args:
        response: text containing a ```java solution block and a ```java test
            block after a "<tests>" marker.

    Returns:
        {"valid": True, "output": <stdout of the test run>} when compilation
        succeeds and the test class exits with status 0; otherwise
        {"valid": False, "reason": <message>} where the message is the
        compiler/runtime stderr, "Timeout", or a missing-block notice.
    """
    sol_code, test_code = extract_java_blocks(response)
    if not sol_code or not test_code:
        return {"valid": False, "reason": "Missing solution or test block"}

    with tempfile.TemporaryDirectory() as tmp:
        sol_path, test_path, test_class = write_java_files(sol_code, test_code, tmp)

        try:
            subprocess.run(["javac", sol_path, test_path], check=True, capture_output=True)
            # -ea enables Java `assert` statements (disabled by default) and
            # check=True turns a non-zero exit (failed test, uncaught exception)
            # into CalledProcessError instead of being reported as valid.
            result = subprocess.run(
                ["java", "-ea", "-cp", tmp, test_class],
                check=True,
                capture_output=True,
                timeout=5,
            )
            return {"valid": True, "output": result.stdout.decode(errors="replace")}
        except subprocess.CalledProcessError as e:
            # errors="replace" keeps odd bytes in tool output from raising here.
            stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
            return {"valid": False, "reason": stderr_text or f"Exit code {e.returncode}"}
        except subprocess.TimeoutExpired:
            return {"valid": False, "reason": "Timeout"}

def validate_jsonl(filepath, max_entries=50, only_valid=True):
    """Validate the Java responses stored in the first lines of a JSONL file.

    Args:
        filepath: path to a UTF-8 JSON Lines file whose records may carry
            "instruction" and "response" fields.
        max_entries: number of leading lines to consider (blank lines count
            toward this limit but are skipped).
        only_valid: when True, records that fail validation are left out.

    Returns:
        List of dicts with keys "index" (0-based line number), "instruction",
        "response", "output" (filled for valid records) and "error" (filled
        for invalid ones).
    """
    results = []
    # JSON Lines is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_entries:
                break
            if not line.strip():
                # A blank line (e.g. trailing newline) is not a record.
                continue
            data = json.loads(line)
            # `or ""` also covers an explicit null, which .get()'s default does not.
            response = data.get("response") or ""
            result = validate_java_response(response)
            if only_valid and not result["valid"]:
                continue
            results.append({
                "index": i,
                "instruction": data.get("instruction", ""),
                "response": data.get("response", ""),
                "output": result.get("output", "") if result["valid"] else "",
                "error": result.get("reason", "") if not result["valid"] else ""
            })
    return results

if __name__ == "__main__":
    # Check the first 50 lines of the I→R output and report the records that passed.
    input_file = "/Users/kunalsharma/Desktop/ks_download/responses_dataset/data-response_gen-i_r-84fa2-0-20250512_211602.jsonl"
    results = validate_jsonl(input_file, 50, True)

    print(f"\n✅ {len(results)} valid Java responses found:\n")
    for r in results:
        # One call, two lines: instruction preview first, then output and divider.
        print(
            f"[{r['index']}] Instruction:\n{r['instruction'][:30]}...\n",
            f"Output:\n{r['output']}\n{'-'*60}",
            sep="\n",
        )
