# Fine-tuning Gemma 2 2B for Resume Parsing

## Converting JSON to JSONL 

In [1]:
import json

# Source file: loaded in one go with json.load and iterated record by record.
# Each record is expected to look like:
#   {"Filename": "...", "Question": "...", "Answer": {...}}
# Only "Question" and "Answer" are read by the conversion step.

INPUT_FILE = "output.json"
# Destination file: one JSON object per line (JSONL).
OUTPUT_FILE = "resume_parser_instruct.jsonl"

# The generic instruction.
# The same prompt text is stored verbatim in the "instruction" field of every
# converted record, so any edit here (including whitespace) changes the
# generated dataset.
BASE_INSTRUCTION = (
    '''You are a helpful assistant that extracts structured data from the given resume text.

Important Instructions:
1. Output Format: Return only a single JSON object that strictly follows the requested structure.
2. No Extra Text: Do not include any additional text, explanations, code fences, triple backticks, or any formatting beyond the JSON object.
3. No Missing Keys: Include all keys listed below, even if their values are empty or blank.
4. No Trailing Commas: Ensure that there are no trailing commas after the last item in arrays or objects.
5. Data Structure:
   - name: string
   - location: string
   - email: string
   - phone: string
   - linkedin: string
   - skills: an array of strings
   - experience: an array of objects, each with keys "role", "company", "location", "start_date", "end_date", "description"
   - projects: an array of objects, each with keys "title", "start_date", "end_date", "description", "tech_stack" (where "tech_stack" is an array of strings)
   - education: an array of objects, each with keys "degree", "institution", "start_date", "end_date", "gpa"
   - extracurricular_activities: an array of objects, each with keys "activity" and "description"
6. Strictly follow the structure in step 5. Do not create new keys by yourself. Use only the keys I mentioned in step 5. 
7. You are part of a resume parsing pipeline so it's really important you return a json only object and again. Strictly follow the key names in step 5. 
If a field is not found in the resume, return an empty string "" for strings or an empty array [] for lists.'''

)

def convert_to_instruct_format(original_item):
    """
    Turn one raw record into an instruct-style training record.

    original_item is a dict; its "Question" value (default "") becomes the
    model input and its "Answer" value (default {}) is serialized to a
    single-line JSON string that becomes the target output. Any other keys,
    such as "Filename", are ignored.

    Returns a dict with keys 'instruction', 'input' and 'output', where
    'instruction' is always the shared BASE_INSTRUCTION prompt.
    """
    return {
        "instruction": BASE_INSTRUCTION,
        # Resume text is passed through unchanged.
        "input": original_item.get("Question", ""),
        # ensure_ascii=False keeps non-ASCII characters readable in the
        # serialized answer instead of \uXXXX escapes.
        "output": json.dumps(original_item.get("Answer", {}), ensure_ascii=False),
    }

def main():
    """
    Convert INPUT_FILE (a JSON array of records) into OUTPUT_FILE (JSONL).

    All records are converted before the output file is opened, so a failure
    during conversion does not touch the destination file.
    """
    # Load the whole source array.
    with open(INPUT_FILE, "r", encoding="utf-8") as src:
        raw_records = json.load(src)

    # Convert every record to the instruct layout.
    records = [convert_to_instruct_format(entry) for entry in raw_records]

    # Emit one JSON document per line.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as dst:
        dst.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

    print(f"Converted {len(records)} records to {OUTPUT_FILE} in instruct format.")


if __name__ == "__main__":
    main()


Converted 153 records to resume_parser_instruct.jsonl in instruct format.
