In [3]:
import os
import subprocess

In [None]:
# 将pcap包以五元组转化为会话

def split_cap2flow(pcap_file, flow_dir,
                   splitcap_exe="D:/Users/ZitaGo/Downloads/SplitCap.exe",
                   split_mode="session"):
    """Split a capture file into separate flow files by running SplitCap.

    Args:
        pcap_file: Path of the capture file handed to SplitCap's ``-r`` option.
        flow_dir: Output directory handed to SplitCap's ``-o`` option; it is
            created if it does not exist yet.
        splitcap_exe: Path of the SplitCap executable. Defaults to the
            location this notebook was originally written against.
        split_mode: Value handed to SplitCap's ``-s`` option
            (``"session"`` by default).

    Returns:
        0 when SplitCap exits with return code 0; -1 when the input file is
        missing, the process cannot be started, or it exits with a non-zero
        return code.
    """
    # Check that the input file exists before doing anything else.
    if not os.path.exists(pcap_file):
        print(f"Error: {pcap_file} not found.")
        return -1

    # Make sure the output directory exists.
    os.makedirs(flow_dir, exist_ok=True)

    # Pass the arguments as a list and do not go through a shell: paths that
    # contain spaces or shell metacharacters then reach SplitCap unchanged.
    cmd = [str(splitcap_exe), "-r", str(pcap_file),
           "-s", str(split_mode), "-o", str(flow_dir)]
    print(f'Running command: {subprocess.list2cmdline(cmd)}')
    try:
        # Run the command and capture its output as text.
        result = subprocess.run(cmd, text=True, capture_output=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # E.g. the executable is missing or the output could not be decoded.
        print(f"Error: {e}")
        return -1

    print("STDOUT:", result.stdout)
    print("STDERR:", result.stderr)

    if result.returncode != 0:
        print("Error: Command failed to execute.")
        return -1
    return 0

In [5]:
# 统计数据流中每个包的情况(payload长度为0表示无效包，全为无效包的流为无效流)

from scapy.all import PcapReader

def _safe_div(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def flow_statistic(flow_dir):
    """Print per-flow packet and payload statistics for the pcap files in a directory.

    Every file found under ``flow_dir`` is read as one flow. A packet is
    "valid" when its TCP payload is non-empty; a flow is "valid" when it
    contains at least one valid packet and "invalid" otherwise. A flow that
    contains a non-TCP packet is skipped: it is counted in the total but in
    neither the valid nor the invalid group.

    Averages whose denominator is zero (for example an empty directory) are
    reported as 0 instead of raising ``ZeroDivisionError``.

    Args:
        flow_dir: Directory holding one pcap file per flow.

    Returns:
        None. The statistics are printed.
    """
    flow_num = 0        # every file seen, including skipped ones
    valid_num = 0       # flows with at least one non-empty TCP payload
    invalid_num = 0     # flows whose TCP payloads are all empty

    # Running totals; they are turned into averages after the walk.
    pkt_total_valid = 0         # packets in valid flows
    pkt_total_invalid = 0       # packets in invalid flows
    validpkt_total = 0          # packets with payload in valid flows
    validpkt_rate_total = 0     # sum of per-flow (valid packets / packets)
    payload_len_total = 0       # payload bytes in valid flows

    # Walk every file below the directory.
    for root, _dirs, files in os.walk(flow_dir):
        for file in files:
            flow_num += 1
            if flow_num % 1000 == 0:
                print(f"Processed {flow_num} flows.")

            pkt_num_in_flow = 0
            validpkt_num_in_flow = 0
            validpkt_len_in_flow = 0

            # Set when the flow holds a packet that is not a usable TCP packet.
            skip_flow = False

            # Join with the directory os.walk is currently in, so files in
            # subdirectories resolve correctly on any platform.
            with PcapReader(os.path.join(root, file)) as reader:
                for pkt in reader:
                    if not pkt or not pkt.haslayer('TCP'):
                        skip_flow = True
                        break

                    # Look the TCP layer up by name rather than assuming it is
                    # exactly the third layer of the packet.
                    # NOTE(review): if scapy attaches link-layer padding as the
                    # TCP layer's payload, padded empty segments would count as
                    # valid packets here - confirm against real captures.
                    payload_len = len(pkt['TCP'].payload.original)

                    pkt_num_in_flow += 1
                    if payload_len:
                        validpkt_num_in_flow += 1
                        validpkt_len_in_flow += payload_len

            # The flow contains a packet that is not TCP.
            if skip_flow:
                continue

            if validpkt_num_in_flow == 0:
                invalid_num += 1
                pkt_total_invalid += pkt_num_in_flow
            else:
                valid_num += 1
                pkt_total_valid += pkt_num_in_flow
                validpkt_total += validpkt_num_in_flow
                validpkt_rate_total += validpkt_num_in_flow / pkt_num_in_flow
                payload_len_total += validpkt_len_in_flow

    flow_len_per_validflow = _safe_div(payload_len_total, valid_num)
    validpkt_len_per_validflow = _safe_div(payload_len_total, validpkt_total)
    pkt_num_per_validflow = _safe_div(pkt_total_valid, valid_num)
    pkt_num_per_invalidflow = _safe_div(pkt_total_invalid, invalid_num)
    validpkt_num_per_validflow = _safe_div(validpkt_total, valid_num)
    validpkt_rate_per_validflow = _safe_div(validpkt_rate_total, valid_num)

    print(f"Total flow number: {flow_num}")
    print(f"Valid flow number: {valid_num}")
    print(f"Invalid flow number: {invalid_num}")
    print(f"Average packet number per valid flow: {pkt_num_per_validflow:.2f}")
    print(f"Average packet number per invalid flow: {pkt_num_per_invalidflow:.2f}")
    print(f"Valid packet rate per valid flow: {validpkt_rate_per_validflow:.2%}")
    print(f"Average packet number per valid flow with valid payload: {validpkt_num_per_validflow:.2f}")
    print(f"Average payload length per valid packet: {validpkt_len_per_validflow:.2f} bytes")
    print(f"Average payload length per valid flow: {flow_len_per_validflow:.2f} bytes")

                    



In [None]:
# Sub-paths below the coin_data directory; each starts with a backslash so it
# can be appended to the directory path directly.
coin_traffic_path = r'\doge_tshark\doge_tx_20241128.cap'
coin_flow_path = r'\doge_tcpflow_new'

# pcap_file: the capture file to parse.
# flow_dir:  the directory the capture is split into flows in.
# Both hang off the same coin_data directory, so it is written only once.
pcap_file, flow_dir = (
    r'D:\Users\ZitaGo\PycharmProjects\Transaction_analysis\traffic_identification\ET-BERT-main\datasets\coin_data' + sub_path
    for sub_path in (coin_traffic_path, coin_flow_path)
)


In [18]:
# Split the capture into flow files; the call evaluates to 0 on success and -1 on failure.
# NOTE(review): the saved output of this cell shows `-s flow` and a `doge_burst`
# output directory, which do not match the current function and paths - it looks
# stale; re-run on a fresh kernel to confirm.
split_cap2flow(pcap_file, flow_dir)

Running command: D:/Users/ZitaGo/Downloads/SplitCap.exe -r D:\Users\ZitaGo\PycharmProjects\Transaction_analysis\traffic_identification\ET-BERT-main\datasets\coin_data\doge_tshark\doge_tx_20241128.cap -s flow -o D:\Users\ZitaGo\PycharmProjects\Transaction_analysis\traffic_identification\ET-BERT-main\datasets\coin_data\doge_burst
STDOUT: Splitting pcap file into seperate pcap files...
1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99%100%
Please wait while closing all file handles...

STDERR: 


0

In [110]:
# Print the flow statistics for the files under flow_dir.
flow_statistic(flow_dir)

Processed 1000 flows.
Processed 2000 flows.
Processed 3000 flows.
Processed 4000 flows.
Processed 5000 flows.
Processed 6000 flows.
Processed 7000 flows.
Processed 8000 flows.
Processed 9000 flows.
Processed 10000 flows.
Processed 11000 flows.
Processed 12000 flows.
Processed 13000 flows.
Processed 14000 flows.
Processed 15000 flows.
Processed 16000 flows.
Total flow number: 16762
Valid flow number: 10491
Invalid flow number: 6172
Average packet number per valid flow: 48.91
Average packet number per invalid flow: 1.97
Valid packet rate per valid flow: 25.05%
Average packet number per valid flow with valid payload: 26.84
Average payload length per valid packet: 2223.96 bytes
Average payload length per valid flow: 59684.07 bytes


In [7]:
def main(pcap_file, flow_dir):
    """Split a capture into flows and print the statistics of those flows.

    The coin name shown in the messages is the part of the last component of
    ``flow_dir`` before its first underscore (``doge`` for
    ``...\\doge_tcpflow_new``).

    Args:
        pcap_file: Capture file passed on to ``split_cap2flow``.
        flow_dir: Directory the flows are written to and then read from.

    Returns:
        None. If the split step reports failure, the statistics step is
        skipped.
    """
    # Accept both '/' and '\\' as separators: the paths used in this notebook
    # are backslash-separated, which a plain split('/') does not break up.
    last_component = flow_dir.replace('\\', '/').rstrip('/').split('/')[-1]
    coin_name = last_component.split('_')[0]

    # Split the capture into flows.
    print(f"Parsing packets of {coin_name} coin ...")
    if split_cap2flow(pcap_file, flow_dir) != 0:
        # Do not report statistics for a directory the split did not fill.
        print("Error: splitting failed; skipping flow statistics.")
        return

    print(f"Flow info. of {coin_name} coin :\n")
    flow_statistic(flow_dir)

In [15]:
# Run both steps: split the capture into flows, then print the flow statistics.
main(pcap_file, flow_dir)

Parsing packets of D:\Users\ZitaGo\PycharmProjects\Transaction coin ...
Running command: D:/Users/ZitaGo/Downloads/SplitCap.exe -r D:\Users\ZitaGo\PycharmProjects\Transaction_analysis\traffic_identification\ET-BERT-main\datasets\coin_data\doge_tshark\doge_tx_20241128.cap -s session -o D:\Users\ZitaGo\PycharmProjects\Transaction_analysis\traffic_identification\ET-BERT-main\datasets\coin_data\doge_tcpflow_new
STDOUT: Splitting pcap file into seperate pcap files...
1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99%100%
Please wait while closing all file handles...

STDERR: 
Flow info. of D:\Users\ZitaGo\PycharmProjects\Transaction coin :

Total flow number: 43
Valid flow number: 40
Invalid flow number: 3
Average packet number per valid flow: 11273.20
Aver