In [106]:
# Main libraries
import copy
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
# Width, in seconds, of each time bin that read() uses when it aggregates
# per-flow throughput, retransmission and cumulative-byte statistics.
throughput_interval = 0.1

In [153]:
# Extract packet send/receive records from a kernel log.

# One log record looks like ``tcp_header_size,-544,507,48``:
# data name, flow id, time, value.
_RECORD_PATTERN = re.compile(r'([a-z_0-9]+),(-?[0-9]+),([0-9]+),([0-9]+)')
_KERNEL_HZ = 250              # log timestamps are kernel ticks at 250 Hz
_ANALYSIS_WINDOW_MS = 60000   # bin edges are generated over range(0, 60000) ms
# 'mptcp_send' rows with value >= this threshold are summed separately as
# redundant transmissions after subtracting the threshold.
# NOTE(review): presumably the kernel-side logging adds 10000 as a marker -
# confirm against the logging patch.
_REDUNDANT_OFFSET = 10000


def _binned_value_sum(rows, bins):
    """Return the per-bin sum of ``rows['value']`` as a list (0 for empty bins)."""
    # labels=False yields integer bin numbers (NaN outside the bin range).
    # include_lowest=True keeps records at exactly t == bins[0]; without it the
    # earliest record of the log would silently fall outside every bin.
    codes = pd.cut(rows['time'], bins=bins, labels=False, include_lowest=True)
    inside = codes.notna()
    per_bin = rows.loc[inside, 'value'].groupby(codes[inside].astype(int)).sum()
    # Re-index so that every bin is present, even those without any record.
    return per_bin.reindex(range(len(bins) - 1), fill_value=0).tolist()


def read(file_name, throughput_analysis=True, *, interval=None):
    """Parse a kernel log into a long-format table of per-flow measurements.

    Parameters
    ----------
    file_name : str
        Path of the log file. Every ``name,flow_id,time,value`` record found
        anywhere in the text is used.
    throughput_analysis : bool
        Accepted for backward compatibility; it currently has no effect.
    interval : float, optional
        Bin width in seconds for the derived statistics. Defaults to the
        module-level ``throughput_interval``.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (seconds since the earliest retained record),
        ``flow_id`` (``'mptcp'``, ``'sub1'`` or ``'sub2'``), ``key`` and
        ``value``. Contains the raw records except ``'mptcp_send'`` ones,
        followed by derived rows per bin (``time`` is the bin's left edge):
        ``throughput`` (MiB/s, assuming ``send`` values are bytes),
        ``retransmission`` and ``total_bytes`` for each subflow, and
        ``total_bytes`` / ``total_bytes_included_redundant`` for ``'mptcp'``.

    Raises
    ------
    ValueError
        If ``interval`` does not yield at least one bin, or if the log holds
        no record for any non-zero flow id.
    """
    if interval is None:
        interval = throughput_interval

    # range() only supports integers, so build the bin edges in milliseconds.
    # round() avoids losing a millisecond to float truncation.
    step_ms = int(round(interval * 1000))
    if step_ms <= 0:
        raise ValueError("interval must be at least 0.001 seconds")
    bins = [ms / 1000 for ms in range(0, _ANALYSIS_WINDOW_MS, step_ms)]
    if len(bins) < 2:
        raise ValueError("interval is too large to form a single bin")

    # The pattern is pure ASCII, so undecodable bytes elsewhere are harmless.
    with open(file_name, encoding='utf-8', errors='replace') as log_file:
        message = log_file.read()

    records = [
        (int(m.group(3)), int(m.group(2)), m.group(1), int(m.group(4)))
        for m in _RECORD_PATTERN.finditer(message)
    ]
    # Table of [time, flow_id, key, value]; it still contains flows that do
    # not belong to the MPTCP connection.
    table = pd.DataFrame(records, columns=['time', 'flow_id', 'key', 'value'])

    subflow_rows = table.loc[table['flow_id'] != 0]
    if subflow_rows.empty:
        raise ValueError(
            "no records for a non-zero flow id found in %r" % (file_name,)
        )

    # Keep the meta flow (id 0) and the two non-zero flows with the most
    # records; drop everything else.
    busiest = subflow_rows['flow_id'].value_counts().index[:2]
    keep = (table['flow_id'] == 0) | table['flow_id'].isin(busiest)
    table = table.loc[keep].copy()

    # The first subflow is the non-meta flow that logged the earliest record.
    subflow_rows = table.loc[table['flow_id'] != 0]
    first_flow = int(subflow_rows.loc[subflow_rows['time'].idxmin(), 'flow_id'])

    # Replace numeric ids by name tags. The column is rebuilt in one step so
    # no string is written into an integer column.
    names = {0: 'mptcp', first_flow: 'sub1'}
    table['flow_id'] = [names.get(fid, 'sub2') for fid in table['flow_id']]

    # Normalise kernel ticks to seconds since the first retained record.
    table['time'] = (table['time'] - table['time'].min()) / _KERNEL_HZ

    derived = []

    # Per-subflow throughput, retransmissions and cumulative bytes.
    for subflow in ('sub1', 'sub2'):
        of_flow = table['flow_id'] == subflow
        sent_per_bin = _binned_value_sum(
            table.loc[of_flow & (table['key'] == 'send')], bins)
        retrans_per_bin = _binned_value_sum(
            table.loc[of_flow & (table['key'] == 'retransmission')], bins)

        sent_total = 0
        for start, sent, retrans in zip(bins, sent_per_bin, retrans_per_bin):
            sent_total += sent
            derived.append(
                [start, subflow, 'throughput', sent / 1024 / 1024 / interval])
            derived.append([start, subflow, 'retransmission', retrans])
            derived.append([start, subflow, 'total_bytes', sent_total])

    # Cumulative bytes at the MPTCP level, with and without redundant sends.
    mptcp_rows = table.loc[table['key'] == 'mptcp_send']
    is_redundant = mptcp_rows['value'] >= _REDUNDANT_OFFSET
    redundant_rows = mptcp_rows.loc[is_redundant].copy()
    redundant_rows['value'] -= _REDUNDANT_OFFSET
    real_per_bin = _binned_value_sum(mptcp_rows.loc[~is_redundant], bins)
    redundant_per_bin = _binned_value_sum(redundant_rows, bins)

    sent_total = 0
    sent_total_with_redundant = 0
    for start, real, redundant in zip(bins, real_per_bin, redundant_per_bin):
        sent_total += real
        sent_total_with_redundant += real + redundant
        derived.append([start, 'mptcp', 'total_bytes', sent_total])
        derived.append([start, 'mptcp', 'total_bytes_included_redundant',
                        sent_total_with_redundant])

    derived_table = pd.DataFrame(
        derived, columns=['time', 'flow_id', 'key', 'value'])
    table = pd.concat([table, derived_table])
    # The raw 'mptcp_send' rows are only needed to build the totals above.
    return table.loc[table['key'] != 'mptcp_send']

# Pick the rows of one flow that carry one particular key.
def select(table, subflow, key):
    """Return the rows of ``table`` whose ``flow_id`` equals ``subflow`` and
    whose ``key`` column equals ``key``; row order and index are preserved."""
    matches_flow = table["flow_id"] == subflow
    matches_key = table["key"] == key
    return table.loc[matches_flow & matches_key]

In [1]:
def save(table, file_name):
    """Write one CSV file per (flow_id, key) pair of ``table``.

    Files are written to ``saves/<file_name>/<flow_id>_<key>.csv`` relative to
    the current working directory. The directory, including a missing
    ``saves`` parent, is created when needed, and existing files are
    overwritten.

    Parameters
    ----------
    table : pandas.DataFrame
        Must have ``flow_id`` and ``key`` columns.
    file_name : str
        Name of the sub-directory of ``saves/`` that receives the files.
    """
    base_path = "saves/" + file_name
    # makedirs also creates "saves/" itself and tolerates an existing
    # directory, so no separate existence check is needed.
    os.makedirs(base_path, exist_ok=True)

    # sort=False keeps the groups in order of first appearance.
    for (subflow, key), rows in table.groupby(["flow_id", "key"], sort=False):
        rows.to_csv(f"{base_path}/{subflow}_{key}.csv")

In [108]:
# NOTE(review): stale notebook cell (execution count 108). In file order it runs
# before `default` is assigned by the log-loading cell further down, and the
# same call is repeated in the final cell - confirm whether it can be removed.
save(default, "그림3_default")

In [154]:
# Parse each experiment log found under the logs/ directory.
prefix = "logs/"
default = read(prefix + "default.txt")
redundant = read(prefix + "redundant.txt")
redundant00085 = read(prefix + "redundant00085.txt")
redundant00165 = read(prefix + "redundant00165.txt")

In [155]:
# Export every parsed log as per-flow, per-key CSV files under saves/<name>/.
# The directory names are Korean: "그림" means "figure", "간격" means "interval".
save(default, "그림3_default")
save(redundant, "그림4_redundant")
save(redundant00165, "그림5_redundant_5MBps_0.016간격")
save(redundant00085, "그림5_redundant_5MBps_0.008간격")