In [2]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict
import csv
import sys
import os

In [3]:
def csv_to_jsonl(csv_file_path, jsonl_file_path):
    """
    Converts a CSV file containing emails and labels to a JSONL file for LLM fine-tuning.

    Every row with a recognised label becomes one JSON line holding a
    three-message conversation: the fixed system prompt, the stripped email
    body as the user message, and 'phishing' (label '1') or 'safe' (label '0')
    as the assistant message. Rows with any other label, or rows too short to
    contain both fields, are skipped.

    Args:
        csv_file_path (str): Path to the input CSV file. Rows are read by
            header name, using the 'body' and 'label' columns.
        jsonl_file_path (str): Path to the output JSONL file (opened in write
            mode, so existing content is replaced).

    Raises:
        KeyError: If a data row is read and the header has no 'body' or
            'label' column.
    """
    # Define the system prompt
    system_prompt = "You are an AI assistant specialized in email security. Determine whether an email is 'phishing' or 'safe' based on its content."

    # Map the raw CSV label to the assistant's answer
    label_to_response = {'1': 'phishing', '0': 'safe'}

    # newline='' hands newline handling to the csv module; without it, line
    # breaks embedded in quoted fields (e.g. '\r\n' inside an email body) are
    # rewritten by universal-newline translation before the reader sees them.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file, \
         open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:

        reader = csv.DictReader(csv_file)
        for row in reader:
            email_body = row['body']
            label = row['label']

            # DictReader fills fields missing from a short row with None;
            # skip such rows instead of crashing on None.strip()
            if email_body is None or label is None:
                continue

            assistant_response = label_to_response.get(label.strip())
            if assistant_response is None:
                # Skip if label is invalid
                continue

            # Create the conversation
            conversation = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": email_body.strip()},
                    {"role": "assistant", "content": assistant_response},
                ]
            }

            # Write the conversation as a JSON line
            jsonl_file.write(json.dumps(conversation) + '\n')

    print(f"Conversion complete. Output saved to {jsonl_file_path}")

In [6]:
# Raise the csv field size limit as far as the platform allows so that very
# large fields can be parsed. sys.maxsize does not fit in a C long on some
# platforms (e.g. 64-bit Windows), where csv.field_size_limit raises
# OverflowError, so back off until a value is accepted.
max_field_size = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_field_size)
        break
    except OverflowError:
        max_field_size //= 10

# NOTE(review): these are absolute paths on one developer's machine; adjust them before running elsewhere.
csv_file_path = '/Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train_2.csv'       # Replace with your CSV file path
jsonl_file_path = '/Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train_2.jsonl'   # Replace with your desired JSONL output file path

csv_to_jsonl(csv_file_path, jsonl_file_path)

Conversion complete. Output saved to /Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train_2.jsonl


In [7]:
# Combine both jsonl files into a single file, first file's records first
jsonl_file_path_1 = '/Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train_1.jsonl'
jsonl_file_path_2 = '/Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train_2.jsonl'
combined_jsonl_file_path = '/Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train.jsonl'

# Both inputs are opened before the output so that a missing input fails
# before the combined file is truncated.
with open(jsonl_file_path_1, 'r', encoding='utf-8') as jsonl_file_1, \
      open(jsonl_file_path_2, 'r', encoding='utf-8') as jsonl_file_2, \
      open(combined_jsonl_file_path, 'w', encoding='utf-8') as combined_jsonl_file:
    for source_file in (jsonl_file_1, jsonl_file_2):
        for line in source_file:
            # A file whose last record lacks a trailing newline would otherwise
            # fuse with the next file's first record into one invalid JSON line.
            combined_jsonl_file.write(line if line.endswith('\n') else line + '\n')

print(f"Combining complete. Output saved to {combined_jsonl_file_path}")

Combining complete. Output saved to /Users/youssefchouay/Programming/Image-Processing-and-NLP-for-Brand-Protection/data/local/phishing_train.jsonl
