In [1]:
import os

import numpy as np
import streamlit as st
from tensorflow.keras.layers import LSTM, Dense, Embedding, Masking
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

def extract_syscall_tokens(file_contents):
    """
    Pull the syscall token out of every SYSCALL record in a log.

    Each line is treated as comma-separated fields. A line contributes a
    token only when it has at least three fields and its second field,
    ignoring surrounding whitespace and case, equals "SYSCALL"; the token
    is the third field with surrounding whitespace removed. Blank lines and
    lines with fewer than three fields are ignored.

    Args:
        file_contents: Full text of the log file.

    Returns:
        List of token strings, in the order they appear in the log.
    """
    # A blank line splits into a single empty field, so the length test
    # below rejects it together with every other too-short line.
    records = (raw_line.strip().split(',') for raw_line in file_contents.splitlines())
    return [
        fields[2].strip()
        for fields in records
        if len(fields) >= 3 and fields[1].strip().upper() == "SYSCALL"
    ]

def load_training_data(filename):
    """
    Read a log file from disk and return its full list of syscall tokens.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        filename: Path of the log file to read.

    Returns:
        List of syscall token strings produced by extract_syscall_tokens.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        contents = f.read()
    return extract_syscall_tokens(contents)

TRAINING_FILE = "001_NORMAL_Flight.txt"
if not os.path.exists(TRAINING_FILE):
    st.error(f"Training file '{TRAINING_FILE}' not found!")
    st.stop()

syscall_tokens = load_training_data(TRAINING_FILE)
st.write("Total SYSCALL tokens extracted:", len(syscall_tokens))

# Fit the tokenizer on the whole trace, joined into one space-separated text.
corpus_text = " ".join(syscall_tokens)
tokenizer = Tokenizer(lower=False, split=' ')
tokenizer.fit_on_texts([corpus_text])
# word_index starts at 1; the extra slot keeps index 0 free for padding.
vocab_size = len(tokenizer.word_index) + 1
st.write("Vocabulary size:", vocab_size)

# Instead of generating all subsequences, use a fixed-length sliding window.
window_size = 20  # Number of preceding syscalls used to predict the next one
step_size = 1     # Increase this to reduce the number of samples
# Later cells pass the model's input length around as max_seq_length; it is
# the window length, so define it here once.
max_seq_length = window_size

all_tokens = tokenizer.texts_to_sequences([corpus_text])[0]
if len(all_tokens) <= window_size:
    # With too few tokens the loop below yields no samples and training
    # would fail later with a far less helpful error.
    st.error(
        f"Need more than {window_size} SYSCALL tokens to build training "
        f"windows; found {len(all_tokens)}."
    )
    st.stop()

# Each sample is window_size consecutive tokens; its label is the next token.
input_sequences = []
labels = []
for i in range(window_size, len(all_tokens), step_size):
    input_sequences.append(all_tokens[i-window_size:i])
    labels.append(all_tokens[i])

# Every window already has exactly window_size tokens, so pad_sequences adds
# no padding here; it only turns the list of lists into a 2-D array.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')
labels = to_categorical(labels, num_classes=vocab_size)

st.write("Number of training samples:", len(input_sequences))

2025-04-22 19:09:59.669 
  command:

    streamlit run C:\Users\ASUS\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [20]:
# Section 2: Model Building and Training
# =============================================================================
# File the trained model is saved to; when it already exists the model is
# loaded from it instead of being retrained.
MODEL_PATH = "syscall_lstm_model.h5"
# Size of each embedding vector, passed to build_model as embedding_dim.
EMBEDDING_DIM = 10

def build_model(vocab_size, max_seq_length, embedding_dim):
    """
    Build and compile the next-syscall prediction model.

    The network is Embedding -> LSTM(128) -> Dense softmax over the
    vocabulary, compiled with Adam and categorical cross-entropy.

    Args:
        vocab_size: Number of token indices, including the padding index 0.
            Used as the embedding input size and the number of output classes.
        max_seq_length: Length of each input sequence.
        embedding_dim: Size of each embedding vector.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    # mask_zero=True makes the embedding emit a mask for padding index 0 so
    # the LSTM skips padded timesteps. A separate Masking(mask_value=0.0)
    # layer placed after the embedding does not achieve this: it compares the
    # embedding *vectors* with 0.0, and the learned vector for index 0 is not
    # all zeros, so nothing was ever masked.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                        input_length=max_seq_length, mask_zero=True))
    model.add(LSTM(128))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train only when no saved model is present; otherwise reuse the saved one.
if not os.path.exists(MODEL_PATH):
    st.write("Training LSTM model... (this may take a while)")
    model = build_model(vocab_size, max_seq_length, EMBEDDING_DIM)
    model.fit(
        input_sequences,
        labels,
        batch_size=64,
        epochs=5,
        verbose=1,
    )
    # Persist the result so later runs take the load branch below.
    model.save(MODEL_PATH)
    st.write("Model trained and saved.")
else:
    model = load_model(MODEL_PATH)
    st.write("Loaded pre-trained model.")




In [18]:
# Section 3: Optimization Engine Functions

cache = {}  # Module-level store: syscall -> simulated execution result


def predictive_cache(syscall):
    """
    Return the simulated result of a syscall, executing it at most once.

    On the first request for a syscall the result string is created, stored
    in the global cache and reported as cached; later requests report a
    cache hit and return the stored string.

    Args:
        syscall: Syscall token used as the cache key.

    Returns:
        The result string associated with the syscall.
    """
    if syscall not in cache:
        # Miss: "execute" the call and remember the outcome.
        outcome = f"Executed {syscall}"
        cache[syscall] = outcome
        st.write(f"[Caching] {syscall}")
        return outcome

    st.write(f"[Cache Hit] {syscall}: {cache[syscall]}")
    return cache[syscall]

def adaptive_reorder(sequence, model, tokenizer, max_seq_length, threshold=0.6):
    """
    Predict the syscall that follows the sequence and prefetch it if likely.

    The sequence is encoded with the tokenizer, pre-padded to
    max_seq_length and fed to the model. When the most probable token maps
    to a known syscall and its probability is at least the threshold, that
    syscall is run through predictive_cache. The sequence itself is
    returned unchanged.

    Args:
        sequence: List of syscall token strings.
        model: Model whose predict() yields per-token probabilities.
        tokenizer: Tokenizer providing texts_to_sequences and word_index.
        max_seq_length: Length the encoded sequence is padded to.
        threshold: Minimum probability required to prefetch.

    Returns:
        The input sequence, unmodified.
    """
    encoded = tokenizer.texts_to_sequences([" ".join(sequence)])[0]
    model_input = pad_sequences([encoded], maxlen=max_seq_length, padding='pre')
    probabilities = model.predict(model_input, verbose=0)

    # Reverse lookup from token index back to the syscall string.
    index_to_call = {index: call for call, index in tokenizer.word_index.items()}
    best_index = np.argmax(probabilities, axis=1)[0]
    predicted_prob = np.max(probabilities)
    predicted_call = index_to_call.get(best_index, None)

    confident = predicted_call and predicted_prob >= threshold
    if confident:
        st.write(f"[Prefetch] Predicted '{predicted_call}' with probability {predicted_prob:.2f}")
        predictive_cache(predicted_call)
    return sequence

def batch_process(sequence):
    """
    Collapse runs of consecutive equal syscalls into (syscall, count) pairs.

    Args:
        sequence: Ordered collection of syscalls.

    Returns:
        List of (syscall, count) tuples, one per run, in original order.
        An empty input gives an empty list.
    """
    runs = []
    for call in sequence:
        if runs and call == runs[-1][0]:
            # Same as the previous element: lengthen the current run.
            runs[-1] = (call, runs[-1][1] + 1)
        else:
            runs.append((call, 1))
    return runs

def process_sequence(sequence, model, tokenizer, max_seq_length):
    """
    Run a syscall sequence through the optimization pipeline.

    Steps, each reported via Streamlit:
      1. adaptive_reorder: predict and possibly prefetch the next call.
      2. batch_process: group consecutive identical calls.
      3. predictive_cache: execute each group once and repeat its result
         for every call in the group.

    Args:
        sequence: List of syscall token strings.
        model: Prediction model forwarded to adaptive_reorder.
        tokenizer: Tokenizer forwarded to adaptive_reorder.
        max_seq_length: Padding length forwarded to adaptive_reorder.

    Returns:
        List with one result string per syscall in the sequence.
    """
    st.write("=== Processing Sequence ===")
    st.write("Original sequence:", sequence)

    sequence = adaptive_reorder(sequence, model, tokenizer, max_seq_length)
    st.write("After adaptive reordering:", sequence)

    groups = batch_process(sequence)
    st.write("Batched sequence:", groups)

    results = []
    for call, repeats in groups:
        # One cache lookup per group; its result stands for every repeat.
        outcome = predictive_cache(call)
        results += [outcome] * repeats

    st.write("Execution results:", results)
    return results

In [19]:
# Section 4: Streamlit User Interface
# =============================================================================
st.title("AI-Powered System Call Optimizer")
st.markdown("""
This application uses an LSTM-based model to learn patterns in system call logs.
You can either upload a new log file (with the same format) or enter a custom
system call sequence (syscall codes only, space-separated).
""")

# Option to upload a new system log file
uploaded_file = st.file_uploader("Upload a system log file", type=["txt"])
if uploaded_file is not None:
    try:
        file_contents = uploaded_file.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        # Report the problem instead of crashing, so the manual-input option
        # below is still rendered.
        st.error("Could not decode the uploaded file as UTF-8 text.")
    else:
        # Extract tokens from the uploaded file
        new_tokens = extract_syscall_tokens(file_contents)
        st.write("Uploaded file contains", len(new_tokens), "SYSCALL tokens.")
        if new_tokens:
            # Display the first 20 tokens for reference
            st.write("First 20 tokens:", new_tokens[:20])
            # Process the entire sequence as one session
            st.markdown("### Processing Uploaded System Log")
            process_sequence(new_tokens, model, tokenizer, max_seq_length)
        else:
            # Nothing to optimize; avoid running the model on pure padding.
            st.warning("No SYSCALL entries were found in the uploaded file.")

# Or, allow manual input of a system call sequence
manual_input = st.text_input("Or enter a custom system call sequence (space-separated)", "0x53 0xe7 0xf 0x10")
if st.button("Optimize Manual Sequence"):
    user_seq = manual_input.split()
    if user_seq:
        st.markdown("### Processing Manual Sequence")
        process_sequence(user_seq, model, tokenizer, max_seq_length)
    else:
        st.warning("Enter at least one system call code.")



In [15]:
streamlit run app.py


SyntaxError: invalid syntax (507122745.py, line 1)