In [None]:
import json
import re
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Fetch the NLTK resources used below: tokenizer models, the English POS
# tagger, and the English word list. Resources are requested in a fixed order.
for _nltk_resource in (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'words',
):
    nltk.download(_nltk_resource)

def calculate_lexical_density(sentence):
    """
    Return the fraction of tokens in *sentence* that are content words.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. A token counts as a content word when its tag begins with
    ``NN`` (noun), ``VB`` (verb), ``JJ`` (adjective) or ``RB`` (adverb).
    The denominator is the total number of tokens produced by the tokenizer.
    Returns 0 when the sentence yields no tokens.
    """
    content_prefixes = ("NN", "VB", "JJ", "RB")
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)

    hits = 0
    for _token, tag in tagged:
        if tag.startswith(content_prefixes):
            hits += 1

    if not tokens:
        return 0
    return hits / len(tokens)

def sliding_window_partition(events, window_size, step_size):
    """
    Slice *events* into overlapping windows of ``window_size`` items.

    Window start offsets begin at 0 and advance by ``step_size``. Only offsets
    that leave room for a complete window are used, so trailing events that
    cannot fill a window are not returned, and an input shorter than
    ``window_size`` produces an empty list.
    """
    last_start = len(events) - window_size
    return [
        events[offset:offset + window_size]
        for offset in range(0, last_start + 1, step_size)
    ]

def _replay_keystrokes(window, ignored_keys):
    """
    Replay one window of raw key events and collect typing measurements.

    Each event is unpacked as ``(action_type, key, timestamp, ...)`` where
    ``action_type`` is ``"KD"`` (key down) or ``"KU"`` (key up). Timestamp
    differences are divided by 1000 to obtain seconds (presumably the raw
    timestamps are milliseconds -- confirm against the logger).

    Returns a dict with:
        text             -- the reconstructed text (Shift and Backspace applied)
        word_durations   -- seconds from a word's first key-down to the next space
        hold_times       -- {key: [seconds held, ...]}
        interval_times   -- {"prev->key": [seconds between key-downs, ...]}
        total_hold_time  -- sum of all recorded hold times
        backspace_count  -- number of Backspace key-downs
    """
    typed_chars = []
    shift_active = False
    word_durations = []
    hold_times = {}
    interval_times = {}
    total_hold_time = 0
    backspace_count = 0

    # Pending key-downs, tracked per key so that overlapping presses
    # (Shift+letter, rollover typing) are paired with their own key-up.
    pressed_at = {}
    last_key = None
    last_key_time = None
    word_start = None  # timestamp of the first key-down of the current word

    for action in window:
        action_type, key, timestamp, *_ = action
        is_down = action_type == "KD"

        # Key Hold Time (KHT)
        if is_down:
            if key not in ignored_keys:
                # setdefault keeps the first key-down when the key auto-repeats.
                pressed_at.setdefault(key, timestamp)
        elif action_type == "KU" and key in pressed_at:
            # A key-up with no pending key-down (ignored key, or a press that
            # started before this window) is skipped instead of mis-attributed.
            held = (timestamp - pressed_at.pop(key)) / 1000.0
            hold_times.setdefault(key, []).append(held)
            total_hold_time += held

        # Key Interval Time (KIT): key-down to key-down, skipping ignored keys
        if is_down:
            if (
                last_key_time is not None
                and key not in ignored_keys
                and last_key not in ignored_keys
            ):
                interval = (timestamp - last_key_time) / 1000.0
                interval_times.setdefault(f"{last_key}->{key}", []).append(interval)
            last_key_time = timestamp
            last_key = key

        # Shift only toggles capitalisation; it never contributes text.
        if key == "Shift" and action_type in ("KD", "KU"):
            shift_active = is_down
            continue

        # Everything below reacts to key-down events only.
        if not is_down:
            continue

        if key == "Backspace":
            backspace_count += 1
            if typed_chars:  # Remove the last character if there is one
                typed_chars.pop()
            continue

        # Multi-character key names (e.g. "CapsLock") are control keys; they
        # must not start a word or be appended to the text.
        if len(key) != 1 or key in ignored_keys:
            continue

        # Word timing: first alphanumeric key-down starts a word, the next
        # space key-down ends it.
        if key.isalnum():
            if word_start is None:
                word_start = timestamp
        elif key == " " and word_start is not None:
            word_durations.append((timestamp - word_start) / 1000.0)
            word_start = None

        # Text reconstruction with Shift handling
        if key.isprintable():
            if shift_active and key.isalpha():
                typed_chars.append(key.upper())
            else:
                typed_chars.append(key)

    return {
        "text": ''.join(typed_chars),
        "word_durations": word_durations,
        "hold_times": hold_times,
        "interval_times": interval_times,
        "total_hold_time": total_hold_time,
        "backspace_count": backspace_count,
    }


def _window_features(index, window, english_words, ignored_keys):
    """
    Build the feature dict for the window at zero-based position *index*.

    *english_words* is expected to be a set of lower-cased dictionary words.
    """
    replay = _replay_keystrokes(window, ignored_keys)
    sentence_text = replay["text"]

    found_words = re.findall(r'\b\w+\b', sentence_text)
    word_count = len(found_words)
    average_word_length = (
        sum(len(w) for w in found_words) / word_count if word_count else 0
    )

    # Spelling is judged per reconstructed word, not per key event. Tokens
    # containing digits or underscores are not treated as misspellings.
    spelling_mistakes = sum(
        1 for w in found_words
        if w.isalpha() and w.lower() not in english_words
    )

    # Tag the text once and reuse the tags for every POS-based feature.
    tags = [tag for _token, tag in pos_tag(word_tokenize(sentence_text))]
    lexical_density = calculate_lexical_density(sentence_text)

    # NOTE(review): pause detection is not implemented -- nothing populates
    # these, so every pause-derived feature below is always empty / 0.
    pause_durations = []
    pause_count = 0

    return {
        "sentence_number": index + 1,
        "sentence": sentence_text,
        "word_wise_speed": replay["word_durations"],
        "key_hold_times": replay["hold_times"],  # Dictionary of KHT
        "key_interval_times": replay["interval_times"],  # Dictionary of KIT
        "number_of_words": word_count,
        "pause_list_sentence": pause_durations,
        "backspace_count_sentence": replay["backspace_count"],
        "average_word_length": average_word_length,
        "duration_between_pauses_sentence": pause_durations,
        "characters_before_first_pause": sum(1 for c in sentence_text if c != " ") if pause_count > 0 else 0,
        "words_before_first_pause": word_count if pause_count > 0 else 0,
        "nouns_per_sentence": sum(1 for tag in tags if tag.startswith("NN")),
        "verbs_per_sentence": sum(1 for tag in tags if tag.startswith("VB")),
        # Prefix match so comparative/superlative forms (JJR, JJS, RBR, RBS)
        # count too, consistent with the noun/verb counts.
        "modifiers_per_sentence": sum(1 for tag in tags if tag.startswith(("JJ", "RB"))),
        "modals_per_sentence": sum(1 for tag in tags if tag == "MD"),
        "lexical_density_of_the_sentence": lexical_density,
        "number_of_spelling_mistakes": spelling_mistakes,
        "number_of_special_characters_sentence": sum(not c.isalnum() for c in sentence_text),
        "number_of_printable_characters": len(sentence_text),
        # NOTE(review): despite the key name, this is the summed key hold time.
        "keystrokes_per_burst": replay["total_hold_time"],
        "pause_durations": pause_durations,
        "long_pauses": sum(1 for p in pause_durations if p > 1.0),  # Pauses longer than 1 second
    }


def preprocess_llm_free_to_buffalo_format(file_path, window_size=500, step_size=100):
    """
    Preprocess LLM-free data into a Buffalo-like format with sliding windows.

    The JSON file at *file_path* is read as a mapping of user id to a dict
    whose ``"keyboard_data"`` entry is a list of key events (missing entries
    are treated as an empty list). Each user's events are cut into overlapping
    windows of ``window_size`` events advancing by ``step_size``; users with
    fewer than ``window_size`` events therefore get zero windows.

    Args:
        file_path: Path to the raw JSON keystroke log.
        window_size: Number of key events per window.
        step_size: Number of events between consecutive window starts.

    Returns:
        A list with one dict per user containing ``"user"``, ``"mode"``,
        ``"Number_of_windows"`` and ``"window_features"`` (a dict keyed
        ``"window_1"``, ``"window_2"``, ... holding per-window features).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_data = []
    # Lower-cased so that lookups of lower-cased typed words also match
    # dictionary entries that are stored capitalised.
    english_words = {w.lower() for w in words.words()}
    ignored_keys = {"Ctrl", "Alt", "Enter", "Tab"}  # Keys to ignore in sentences

    for user_id, actions in data.items():
        keyboard_data = actions.get("keyboard_data", [])
        windows = sliding_window_partition(keyboard_data, window_size, step_size)

        user_entry = {
            "user": user_id,
            "mode": 1,  # Mode information unavailable, default to 1
            "Number_of_windows": len(windows),
            "window_features": {
                f"window_{i + 1}": _window_features(i, window, english_words, ignored_keys)
                for i, window in enumerate(windows)
            },
        }
        processed_data.append(user_entry)

    return processed_data

# Input: raw keystroke log in JSON form. The path points into a mounted Drive
# folder (presumably Google Colab -- adjust when running elsewhere).
llm_file = "/content/drive/MyDrive/Thesis-Cloud/Atharva/StonyBrook_json_format/Raw_Temp/Raw_Temp_Gay_Marriage_Fixed.json"

# Preprocess data to Buffalo-like format. The window settings passed here
# (1000 events per window, advancing by 300 events) override the function's
# defaults of 500 / 100.
llm_free_buffalo_format = preprocess_llm_free_to_buffalo_format(llm_file, window_size=1000, step_size=300)

# Save processed data as indented JSON; the relative path resolves against the
# current working directory.
output_file = "processed_llm_free_buffalo_format_shift_backspace.json"
with open(output_file, 'w') as file:
    json.dump(llm_free_buffalo_format, file, indent=4)

print(f"Buffalo-like format preprocessing complete. File saved as {output_file}.")
