In [11]:
import pandas as pd
# URL: https://csr.lanl.gov/data/cyber1/

In [12]:
# Raw input logs, one text file per event source.
raw_dir = "raw_data"
src_auth_path = f"{raw_dir}/auth.txt"
src_dns_path = f"{raw_dir}/dns.txt"
src_flows_path = f"{raw_dir}/flows.txt"
src_proc_path = f"{raw_dir}/proc.txt"
src_redteam_path = f"{raw_dir}/redteam.txt"

# Cleaned per-source CSVs and the merged train/test exports share one folder.
cleaned_dir = "processed_data/cleaned"
dst_auth_path = f"{cleaned_dir}/auth.csv"
dst_dns_path = f"{cleaned_dir}/dns.csv"
dst_flows_path = f"{cleaned_dir}/flows.csv"
dst_proc_path = f"{cleaned_dir}/proc.csv"
dst_redteam_path = f"{cleaned_dir}/redteam.csv"

train_final_path = f"{cleaned_dir}/training_final.csv"
test_final_path = f"{cleaned_dir}/testing_final.csv"

In [13]:
# Time resolution: 1 second.
resolution = 1
# Number of seconds in one day (24 h * 60 min * 60 s).
day_seconds = 24 * 60 * 60
# True selects the training branch of the filtering cells, False the testing branch.
train = False

In [14]:
# Column names for the header-less red-team file.
redteam_headers = ["time", "user@domain", "src_comp", "dst_comp"]

redteam_df = pd.read_csv(src_redteam_path, names=redteam_headers, header=None)

# Split "user@domain" into two columns, then discard the combined column
# and normalise the "?" placeholder to "none".
redteam_df[['src_user', 'src_domain']] = (
    redteam_df["user@domain"].str.split("@", expand=True)
)
redteam_df = (
    redteam_df
    .drop(columns=['user@domain'])
    .replace("?", "none")
)
redteam_df.to_csv(dst_redteam_path, index=False)

# Earliest red-team timestamp; the other cells use it as the train/test boundary.
red_team_first_event = redteam_df["time"].min()
print(red_team_first_event)

150885


In [15]:
# Number of rows per chunk passed to pd.read_csv(chunksize=...) in the cells below.
chunk_size = 1_000_000

# Testing window cutoff: one day after the first red-team event.
test_time_target = day_seconds + red_team_first_event
print(f"Testing Data Time target: {test_time_target}")

Testing Data Time target: 237285


In [16]:
# Column names for the header-less auth log.
auth_headers = [
    "time",
    "source_user@domain",
    "destination_user@domain",
    "src_comp",
    "dst_comp",
    "auth_type",
    "logon_type",
    "auth_orient",
    "pass_fail",
]

# Stream the auth log in chunks of `chunk_size` rows instead of loading it whole.
auth_df = pd.read_csv(src_auth_path, header=None, chunksize=chunk_size, names=auth_headers)
chunk_count = 0
# True until the first chunk has been written: that write emits the CSV header
# and truncates any file left over from a previous run.
header = True

for chunk in auth_df:
    # make sure time is int
    chunk["time"] = chunk["time"].astype(int)
    times = chunk["time"]

    # NOTE(review): the `break` statements below stop reading as soon as one
    # chunk falls outside the window, which assumes the file is ordered by
    # time -- TODO confirm.
    if train:
        # Training keeps only events strictly before the first red-team event.
        chunk = chunk[times < red_team_first_event]
        if chunk.empty:
            break
    else:
        # 1) Entire chunk precedes the red-team start: skip it.
        if times.max() < red_team_first_event:
            continue

        # 2) Entire chunk lies beyond the test cutoff: stop reading.
        if times.min() > test_time_target:
            break

        # 3) Keep only rows inside [red_team_first_event, test_time_target].
        chunk = chunk[(times >= red_team_first_event) &
                      (times <= test_time_target)]
        if chunk.empty:
            continue

    # The filter above returns a slice of the original chunk; take an explicit
    # copy so the column assignments below act on an independent frame
    # (avoids SettingWithCopyWarning).
    chunk = chunk.copy()
    chunk[['src_user', 'src_domain']] = chunk["source_user@domain"].str.split("@", expand=True)
    chunk[['dst_user', 'dst_domain']] = chunk["destination_user@domain"].str.split("@", expand=True)
    chunk = chunk.drop(columns=["source_user@domain", "destination_user@domain"])
    chunk['event_type'] = 'auth'
    chunk = chunk.replace("?", "none")

    chunk_count += 1
    print(f"Chunks Processed:  {chunk_count}")

    # First write overwrites the destination ('w'); later chunks append ('a').
    # Always appending would stack a re-run's rows (and a second header line)
    # onto the previous run's file.
    chunk.to_csv(dst_auth_path, mode='w' if header else 'a', index=False, header=header)
    header = False


Chunks Processed:  1
Chunks Processed:  2
Chunks Processed:  3
Chunks Processed:  4
Chunks Processed:  5
Chunks Processed:  6
Chunks Processed:  7
Chunks Processed:  8
Chunks Processed:  9
Chunks Processed:  10
Chunks Processed:  11
Chunks Processed:  12
Chunks Processed:  13
Chunks Processed:  14
Chunks Processed:  15
Chunks Processed:  16
Chunks Processed:  17


In [17]:
# Column names for the header-less DNS log.
# NOTE(review): "src_compr" looks like a typo for "src_comp" (the name the other
# sources use); as written, DNS rows end up in a separate column after the final
# concat. Left unchanged because other code may read this name -- confirm.
dns_headers = ["time",
               "src_compr",
               "comp_rsvd"]

# Stream the DNS log in chunks of `chunk_size` rows instead of loading it whole.
dns_df = pd.read_csv(src_dns_path, header=None, chunksize=chunk_size, names=dns_headers)
chunk_count = 0

# True until the first chunk has been written: that write emits the CSV header
# and truncates any file left over from a previous run.
header = True

for chunk in dns_df:
    # make sure time is int
    chunk["time"] = chunk["time"].astype(int)
    times = chunk["time"]

    # NOTE(review): the `break` statements below stop reading as soon as one
    # chunk falls outside the window, which assumes the file is ordered by
    # time -- TODO confirm.
    if train:
        # Training keeps only events strictly before the first red-team event.
        chunk = chunk[times < red_team_first_event]
        if chunk.empty:
            break
    else:
        # 1) Entire chunk precedes the red-team start: skip it.
        if times.max() < red_team_first_event:
            continue

        # 2) Entire chunk lies beyond the test cutoff: stop reading.
        if times.min() > test_time_target:
            break

        # 3) Keep only rows inside [red_team_first_event, test_time_target].
        chunk = chunk[(times >= red_team_first_event) &
                      (times <= test_time_target)]
        if chunk.empty:
            continue

    # Work on an explicit copy of the filtered slice so the assignment below
    # does not trigger SettingWithCopyWarning.
    chunk = chunk.copy()
    chunk['event_type'] = 'dns'
    chunk = chunk.replace("?", "none")
    chunk_count += 1
    print(f"Chunks Processed:  {chunk_count}")

    # Write EVERY qualifying chunk. The previous `if header:` guard wrote only
    # the first one and silently dropped the rest. The first write overwrites
    # the destination ('w'); later chunks append without a header ('a').
    chunk.to_csv(dst_dns_path, mode='w' if header else 'a', index=False, header=header)
    header = False

Chunks Processed:  1


In [18]:
# Column names for the header-less network-flow log.
flows_headers = ["time",
                 "dur",
                 "src_comp",
                 "src_port",
                 "dst_comp",
                 "dst_port",
                 "prtcl",
                 "pckt_cnt",
                 "byte_cnt"]

# Stream the flow log in chunks of `chunk_size` rows instead of loading it whole.
flows_df = pd.read_csv(src_flows_path, header=None, chunksize=chunk_size, names=flows_headers)
chunk_count = 0

# True until the first chunk has been written: that write emits the CSV header
# and truncates any file left over from a previous run.
header = True

for chunk in flows_df:
    # make sure time is int
    chunk["time"] = chunk["time"].astype(int)
    times = chunk["time"]

    # NOTE(review): the `break` statements below stop reading as soon as one
    # chunk falls outside the window, which assumes the file is ordered by
    # time -- TODO confirm.
    if train:
        # Training keeps only events strictly before the first red-team event.
        chunk = chunk[times < red_team_first_event]
        if chunk.empty:
            break
    else:
        # 1) Entire chunk precedes the red-team start: skip it.
        if times.max() < red_team_first_event:
            continue

        # 2) Entire chunk lies beyond the test cutoff: stop reading.
        if times.min() > test_time_target:
            break

        # 3) Keep only rows inside [red_team_first_event, test_time_target].
        chunk = chunk[(times >= red_team_first_event) &
                      (times <= test_time_target)]
        if chunk.empty:
            continue

    # Work on an explicit copy of the filtered slice so the assignment below
    # does not trigger SettingWithCopyWarning.
    chunk = chunk.copy()
    chunk['event_type'] = 'flow'
    chunk = chunk.replace("?", "none")
    chunk_count += 1
    print(f"Chunks Processed:  {chunk_count}")

    # Write EVERY qualifying chunk. The previous `if header:` guard wrote only
    # the first one and silently dropped the rest. The first write overwrites
    # the destination ('w'); later chunks append without a header ('a').
    chunk.to_csv(dst_flows_path, mode='w' if header else 'a', index=False, header=header)
    header = False

Chunks Processed:  1
Chunks Processed:  2
Chunks Processed:  3
Chunks Processed:  4
Chunks Processed:  5
Chunks Processed:  6
Chunks Processed:  7
Chunks Processed:  8
Chunks Processed:  9
Chunks Processed:  10


In [19]:
# Column names for the header-less process log.
proc_headers = ["time",
                "user@domain",
                "src_comp",
                "proc_name",
                "start/end"]

# Stream the process log in chunks of `chunk_size` rows instead of loading it whole.
proc_df = pd.read_csv(src_proc_path, header=None, chunksize=chunk_size, names=proc_headers)
chunk_count = 0

# True until the first chunk has been written: that write emits the CSV header
# and truncates any file left over from a previous run.
header = True

for chunk in proc_df:
    # make sure time is int
    chunk["time"] = chunk["time"].astype(int)
    times = chunk["time"]

    # NOTE(review): the `break` statements below stop reading as soon as one
    # chunk falls outside the window, which assumes the file is ordered by
    # time -- TODO confirm.
    if train:
        # Training keeps only events strictly before the first red-team event.
        chunk = chunk[times < red_team_first_event]
        if chunk.empty:
            break
    else:
        # 1) Entire chunk precedes the red-team start: skip it.
        if times.max() < red_team_first_event:
            continue

        # 2) Entire chunk lies beyond the test cutoff: stop reading.
        if times.min() > test_time_target:
            break

        # 3) Keep only rows inside [red_team_first_event, test_time_target].
        chunk = chunk[(times >= red_team_first_event) &
                      (times <= test_time_target)]
        if chunk.empty:
            continue

    # Work on an explicit copy of the filtered slice so the column assignments
    # below do not trigger SettingWithCopyWarning.
    chunk = chunk.copy()
    chunk[['src_user', 'src_domain']] = chunk["user@domain"].str.split("@", expand=True)
    chunk = chunk.drop(columns=["user@domain"])
    chunk['event_type'] = 'proc'
    chunk = chunk.replace("?", "none")
    chunk_count += 1
    print(f"Chunks Processed:  {chunk_count}")

    # Write EVERY qualifying chunk. The previous `if header:` guard wrote only
    # the first one and silently dropped the rest. The first write overwrites
    # the destination ('w'); later chunks append without a header ('a').
    chunk.to_csv(dst_proc_path, mode='w' if header else 'a', index=False, header=header)
    header = False

Chunks Processed:  1
Chunks Processed:  2
Chunks Processed:  3
Chunks Processed:  4
Chunks Processed:  5
Chunks Processed:  6
Chunks Processed:  7


In [20]:
# Reload the four cleaned per-source CSVs written by the cells above.
auth, dns, flows, procs = (
    pd.read_csv(cleaned_path, header=0)
    for cleaned_path in (dst_auth_path, dst_dns_path, dst_flows_path, dst_proc_path)
)

# Stack all event types into one frame; columns a source lacks become NaN.
final_df = pd.concat([auth, dns, flows, procs], ignore_index=True)

# Missing flow counters are filled with 0; every other gap gets the "none" placeholder.
flow_counter_cols = ['dur', 'pckt_cnt', 'byte_cnt']
final_df[flow_counter_cols] = final_df[flow_counter_cols].fillna(0)
final_df = final_df.fillna("none").sort_values('time')

# The `train` flag chooses which merged export this run produces.
final_path = train_final_path if train else test_final_path
final_df.to_csv(final_path, index=False)