In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
with open('data/nl2bash-data.json', 'r') as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame.from_dict(data, orient='index')

In [4]:
df

Unnamed: 0,invocation,cmd
1,"Copy loadable kernel module ""mymodule.ko"" to t...",sudo cp mymodule.ko /lib/modules/$(uname -r)/k...
2,"Display all lines containing ""IP_MROUTE"" in th...",cat /boot/config-`uname -r` | grep IP_MROUTE
3,Display current running kernel's compile-time ...,cat /boot/config-`uname -r`
4,"Find all loadable modules for current kernel, ...",find /lib/modules/`uname -r` -regex .*perf.*
5,"Look for any instance of ""HIGHMEM"" in the curr...",grep “HIGHMEM” /boot/config-`uname -r`
...,...,...
10343,using exec in find command to dispaly the sear...,find . ... -exec cat {} \; -exec echo \;
10344,verbosely create intermediate directoriy tmp a...,mkdir -pv /tmp/boostinst
10345,view the manual page of find,man find
10346,"wait 2 seconds and then print ""hello""","echo ""hello `sleep 2 &`"""


In [5]:
df['command'] = df['cmd'].str.split().str[0]

distribution = df['command'].value_counts()

target_samples = 2000
num_unique_commands = len(distribution)
avg_samples_per_command = target_samples // num_unique_commands

balanced_data = []

for command, count in distribution.items():
    command_data = df[df['command'] == command]

    if count > avg_samples_per_command:
        command_data = command_data.sample(n=avg_samples_per_command, random_state=42)
    elif count < avg_samples_per_command:
        if count == 1:
            command_data = command_data
        else:
            command_data = command_data.sample(n=avg_samples_per_command, replace=True, random_state=42)

    balanced_data.append(command_data)

balanced_df = pd.concat(balanced_data, ignore_index=True)

print(f"Total samples in the result: {len(balanced_df)}")
print(balanced_df)

balanced_distribution = balanced_df['command'].value_counts()
print(balanced_distribution)

Total samples in the result: 831
                                            invocation  \
0    Find all files in the current directory tree w...   
1    Search for first match of the regex 're' in al...   
2    Search the current directory tree for files an...   
3    list any files modified since /bin/sh was last...   
4    Remove filetype suffix (last dot and following...   
..                                                 ...   
826  Prints real path of the folder containing $0 f...   
827  Append the current date in '%Y%m%d_%H%M' forma...   
828  Records the number of occurences of 'needle' i...   
829  Recursively finds all 'STATUS.txt' files conta...   
830  Save a line of 100 random characters either "....   

                                                   cmd             command  
0                find . -size +1M -exec mv {} files \+                find  
1    find . -name '*.coffee' -exec awk '/re/ {print...                find  
2                             find .  -

In [8]:
test_data = []
train_data = []

In [9]:
remaining_df = df.loc[~df.index.isin(balanced_df.index)]

for index, row in remaining_df.iterrows():
    train_data.append({'value': row['invocation'], 'target': 1})
    train_data.append({'value': row['cmd'], 'target': 0})

In [10]:
for index, row in balanced_df.iterrows():
    if np.random.rand() < 0.6:
        test_data.append({'value': row['invocation'], 'target': 1})
        test_data.append({'value': row['cmd'], 'target': 0})
    else:
        if np.random.rand() < 0.1:
            test_data.append({'value': row['invocation'], 'target': 1})
            train_data.append({'value': row['cmd'], 'target': 0})
        else:
            train_data.append({'value': row['invocation'], 'target': 1})
            test_data.append({'value': row['cmd'], 'target': 0})

In [11]:
len(test_data)

1313

In [12]:
len(train_data)

21043

In [13]:
test_data = pd.DataFrame(test_data)
train_data = pd.DataFrame(train_data)

In [14]:
synt = pd.read_csv('data/synthetic_data.csv')

In [15]:
len(synt)

375

In [16]:
import os
import json
import pandas as pd

base_dir = "data/cybersec"


data_list = []

for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r') as f:
                    for line in f:
                        if line.strip():
                            json_data = json.loads(line.strip())
                            json_data['file_path'] = file_path
                            data_list.append(json_data)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

combined_df = pd.DataFrame(data_list)

In [17]:
combined_df= combined_df[combined_df['cmd_type']=='bash-command']

In [18]:
len(combined_df['cmd'].unique())

3820

In [19]:
new_df = pd.DataFrame(combined_df['cmd'].unique())

In [21]:
new_df['value'] = new_df[0]

In [22]:
new_df['target'] = 0

In [25]:
result = pd.concat([test_data, synt, new_df])

In [29]:
result = result[['value', 'target']]

In [32]:
def modify_case(row):
    value = str(row['value']).strip()
    if row['target'] == 1 and value:
        if np.random.rand() < 0.6:
            return value[0].lower() + value[1:]
        else:
            return value[0].upper() + value[1:]
    return value

result = result.dropna(subset=['value'])
result = result[result['value'].str.strip() != ""]

train_df = train_data.dropna(subset=['value'])
train_df = train_df[train_df['value'].str.strip() != ""]

result['value'] = result.apply(modify_case, axis=1)

train_df['value'] = train_df.apply(modify_case, axis=1)

5508
5133
5133


In [37]:
target_1_count = train_df[train_df['target'] == 1].shape[0]
target_0_count = train_df[train_df['target'] == 0].shape[0]

desired_target_1_count = target_0_count // 9  # 1:9 ratio

if target_1_count > desired_target_1_count:
    target_1_rows = train_df[train_df['target'] == 1].sample(n=desired_target_1_count, random_state=42)
else:
    target_0_rows = train_df[train_df['target'] == 0]

target_0_rows = train_df[train_df['target'] == 0]

balanced_train_df = pd.concat([target_0_rows, target_1_rows], ignore_index=True)
balanced_train_df = balanced_train_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_train_df['target'].value_counts())

target
0    10376
1     1152
Name: count, dtype: int64


In [39]:
balanced_train_df.to_csv('data/train.csv', index=False)

In [40]:
result.to_csv('data/test.csv', index=False)