Подключаем необходимые библиотеки

In [18]:
import pandas as pd
import numpy as np

Отключаем предупреждения, так как некоторые предупреждения не удалось устранить, но всё работает исправно

In [19]:
import warnings
# Выключить предупреждения
warnings.filterwarnings('ignore')
# Включить предупреждения
# warnings.filterwarnings('default')

Функция для выделения признаков из датафрейма

Принимает в себя сам датафрейм и параметр, который означает количество частей, на которые будет разбит весь датасет(по умолчанию 20) 

In [20]:
def extract_features_from_df(df, num_parts=20):
    if df.empty:
        return {}

    # Направления трафика от сервера или от клиента(server/client соответственно)
    client_ip = df['source_ip'].value_counts().idxmax()
    df['direction'] = df['source_ip'].apply(lambda ip: 'client' if ip == client_ip else 'server')

    # Убираем повреждённые строки(то есть строки, где в некоторых столбцах стоит N/A)
    columns_to_check = ['frame','protocol','source_ip','source_port','dest_ip','dest_port','frame_length','date','time','milliseconds']
    df.replace("N/A", np.nan, inplace=True)
    df = df.dropna(subset=columns_to_check)
    df['data'] = df['data'].apply(lambda x: len(x) if isinstance(x, str) else 0)
    df['data'] = df['data'].replace(np.nan, 0)
    df['data'] = df['data'].astype(int)

    # Формирование portion_id(порции данных)
    df = df.sort_values('milliseconds')
    df['direction_change'] = df['direction'] != df['direction'].shift(1)
    df['portion_id'] = df['direction_change'].cumsum()

    # Разбиение на части(для того, чтобы увеличить количество классов подаваемых в модель)
    portion_size = len(df) // num_parts
    metrics_list = []

    for i in range(num_parts):
        start_idx = i * portion_size
        end_idx = (i + 1) * portion_size if i != num_parts - 1 else len(df)
        df_part = df.iloc[start_idx:end_idx]

        # Пакеты
        metrics = {}
        metrics['mean_packet_size_client'] = df_part[df_part['direction'] == 'client']['frame_length'].mean()
        metrics['std_packet_size_client'] = df_part[df_part['direction'] == 'client']['frame_length'].std()
        metrics['mean_packet_size_server'] = df_part[df_part['direction'] == 'server']['frame_length'].mean()
        metrics['std_packet_size_server'] = df_part[df_part['direction'] == 'server']['frame_length'].std()

        # Порции данных
        portions = df_part.groupby(['portion_id', 'direction'])['data'].sum().reset_index()
        client_portions = portions[portions['direction'] == 'client']
        server_portions = portions[portions['direction'] == 'server']

        metrics['mean_data_per_portion_client'] = client_portions['data'].mean()
        metrics['std_data_per_portion_client'] = client_portions['data'].std()
        metrics['mean_data_per_portion_server'] = server_portions['data'].mean()
        metrics['std_data_per_portion_server'] = server_portions['data'].std()

        # Кол-во пакетов на порцию
        packets_per_portion = df_part.groupby(['portion_id', 'direction']).size().reset_index(name='packet_count')
        metrics['mean_packets_per_portion_client'] = packets_per_portion[packets_per_portion['direction'] == 'client']['packet_count'].mean()
        metrics['mean_packets_per_portion_server'] = packets_per_portion[packets_per_portion['direction'] == 'server']['packet_count'].mean()

        # КПД
        total_data_client = df_part[df_part['direction'] == 'client']['data'].sum()
        total_bytes_client = df_part[df_part['direction'] == 'client']['frame_length'].sum()
        total_data_server = df_part[df_part['direction'] == 'server']['data'].sum()
        total_bytes_server = df_part[df_part['direction'] == 'server']['frame_length'].sum()

        metrics['efficiency_client'] = total_data_client / total_bytes_client if total_bytes_client else 0
        metrics['efficiency_server'] = total_data_server / total_bytes_server if total_bytes_server else 0

        # Соотношения
        metrics['byte_ratio'] = total_bytes_client / total_bytes_server if total_bytes_server else 0
        metrics['payload_ratio'] = total_data_client / total_data_server if total_data_server else 0
        metrics['packet_ratio'] = df_part[df_part['direction'] == 'client'].shape[0] / df_part[df_part['direction'] == 'server'].shape[0] if df_part[df_part['direction'] == 'server'].shape[0] else 0

        # Первые пакеты
        client_packets = df_part[df_part['direction'] == 'client'].sort_values('milliseconds')
        server_packets = df_part[df_part['direction'] == 'server'].sort_values('milliseconds')

        client_packet_sizes = client_packets['frame_length'].tolist()
        server_packet_sizes = server_packets['frame_length'].tolist()

        metrics['first_packet_size_client'] = client_packet_sizes[0] if len(client_packet_sizes) > 0 else 0
        metrics['second_packet_size_client'] = client_packet_sizes[1] if len(client_packet_sizes) > 1 else 0
        metrics['first_packet_size_server'] = server_packet_sizes[0] if len(server_packet_sizes) > 0 else 0
        metrics['second_packet_size_server'] = server_packet_sizes[1] if len(server_packet_sizes) > 1 else 0

        # Первые порции
        client_data_portions = client_portions.sort_values('portion_id')['data'].tolist()
        server_data_portions = server_portions.sort_values('portion_id')['data'].tolist()

        metrics['first_data_portion_client'] = client_data_portions[0] if len(client_data_portions) > 0 else 0
        metrics['second_data_portion_client'] = client_data_portions[1] if len(client_data_portions) > 1 else 0
        metrics['first_data_portion_server'] = server_data_portions[0] if len(server_data_portions) > 0 else 0
        metrics['second_data_portion_server'] = server_data_portions[1] if len(server_data_portions) > 1 else 0

        # Протокол
        metrics['transport_protocol'] = int(df_part['protocol'].mode()[0] == 6)

        metrics_list.append(metrics)

    return metrics_list

Функция для выделения признаков из датафрейма(для предсказания)

Внутрь подаётся уже сама часть датафрейма(то есть после разбиения на части) 

In [21]:
def extract_features_from_df_(df):
    if df.empty:
        return {}

    # Определение направления трафика
    client_ip = df['source_ip'].value_counts().idxmax()
    df['direction'] = df['source_ip'].apply(lambda ip: 'client' if ip == client_ip else 'server')

    # Обработка data
    columns_to_check = ['frame','protocol','source_ip','source_port','dest_ip','dest_port','frame_length','date','time','milliseconds']
    df.replace("N/A", np.nan, inplace=True)
    df = df.dropna(subset=columns_to_check)
    df['data'] = df['data'].apply(lambda x: len(x) if isinstance(x, str) else 0)
    df['data'] = df['data'].replace(np.nan, 0)
    df['data'] = df['data'].astype(int)

    # Формирование portion_id
    df = df.sort_values('milliseconds')
    df['direction_change'] = df['direction'] != df['direction'].shift(1)
    df['portion_id'] = df['direction_change'].cumsum()

    # Метрики
    metrics = {}

    # Пакеты
    metrics['mean_packet_size_client'] = df[df['direction'] == 'client']['frame_length'].mean()
    metrics['std_packet_size_client'] = df[df['direction'] == 'client']['frame_length'].std()
    metrics['mean_packet_size_server'] = df[df['direction'] == 'server']['frame_length'].mean()
    metrics['std_packet_size_server'] = df[df['direction'] == 'server']['frame_length'].std()

    # Порции данных
    portions = df.groupby(['portion_id', 'direction'])['data'].sum().reset_index()
    client_portions = portions[portions['direction'] == 'client']
    server_portions = portions[portions['direction'] == 'server']

    metrics['mean_data_per_portion_client'] = client_portions['data'].mean()
    metrics['std_data_per_portion_client'] = client_portions['data'].std()
    metrics['mean_data_per_portion_server'] = server_portions['data'].mean()
    metrics['std_data_per_portion_server'] = server_portions['data'].std()

    # Кол-во пакетов на порцию
    packets_per_portion = df.groupby(['portion_id', 'direction']).size().reset_index(name='packet_count')
    metrics['mean_packets_per_portion_client'] = packets_per_portion[packets_per_portion['direction'] == 'client']['packet_count'].mean()
    metrics['mean_packets_per_portion_server'] = packets_per_portion[packets_per_portion['direction'] == 'server']['packet_count'].mean()

    # КПД
    total_data_client = df[df['direction'] == 'client']['data'].sum()
    total_bytes_client = df[df['direction'] == 'client']['frame_length'].sum()
    total_data_server = df[df['direction'] == 'server']['data'].sum()
    total_bytes_server = df[df['direction'] == 'server']['frame_length'].sum()

    metrics['efficiency_client'] = total_data_client / total_bytes_client if total_bytes_client else 0
    metrics['efficiency_server'] = total_data_server / total_bytes_server if total_bytes_server else 0

    # Соотношения
    metrics['byte_ratio'] = total_bytes_client / total_bytes_server if total_bytes_server else 0
    metrics['payload_ratio'] = total_data_client / total_data_server if total_data_server else 0
    metrics['packet_ratio'] = df[df['direction'] == 'client'].shape[0] / df[df['direction'] == 'server'].shape[0] if df[df['direction'] == 'server'].shape[0] else 0

    # Первые пакеты
    client_packets = df[df['direction'] == 'client'].sort_values('milliseconds')
    server_packets = df[df['direction'] == 'server'].sort_values('milliseconds')

    client_packet_sizes = client_packets['frame_length'].tolist()
    server_packet_sizes = server_packets['frame_length'].tolist()

    metrics['first_packet_size_client'] = client_packet_sizes[0] if len(client_packet_sizes) > 0 else 0
    metrics['second_packet_size_client'] = client_packet_sizes[1] if len(client_packet_sizes) > 1 else 0
    metrics['first_packet_size_server'] = server_packet_sizes[0] if len(server_packet_sizes) > 0 else 0
    metrics['second_packet_size_server'] = server_packet_sizes[1] if len(server_packet_sizes) > 1 else 0

    # Первые порции
    client_data_portions = client_portions.sort_values('portion_id')['data'].tolist()
    server_data_portions = server_portions.sort_values('portion_id')['data'].tolist()

    metrics['first_data_portion_client'] = client_data_portions[0] if len(client_data_portions) > 0 else 0
    metrics['second_data_portion_client'] = client_data_portions[1] if len(client_data_portions) > 1 else 0
    metrics['first_data_portion_server'] = server_data_portions[0] if len(server_data_portions) > 0 else 0
    metrics['second_data_portion_server'] = server_data_portions[1] if len(server_data_portions) > 1 else 0

    # Протокол
    metrics['transport_protocol'] = int(df['protocol'].mode()[0] == 6)

    return metrics

Пути к файлам с фреймами

In [23]:
file_paths = {
    "game": [
        './src/Valheim_TCP.PANDAS',
        './src/Dota2_UDP.PANDAS',
        './src/CounterStrike2_UDP.PANDAS',
         # './src/Verdun_UDP.PANDAS',
         # './src/Warhammer_TCP.PANDAS',
        './src/Warhammer_UDP.PANDAS',
        "./src/riot_lol_TCP.PANDAS",
    ],

    "web_serfing_and_video": [
        "./src/web_surfing_TCP.PANDAS",
        "./src/web_serfing_tcp_udp_TCP.PANDAS",
        "./src/web_serfing_tcp_udp_UDP.PANDAS",
        "./src/web_serfing_1_TCP.PANDAS",
        # "./src/web_serfing_2_TCP.PANDAS",
        "./src/web_serfing_1_UDP.PANDAS",
        # "./src/web_serfing_2_UDP.PANDAS",
        
        "./src/web_video_UDP.PANDAS",
        "./src/web_video_TCP.PANDAS",
        "./src/web_video_1_TCP.PANDAS",
        "./src/web_video_1_UDP.PANDAS",
        # "./src/web_video_2_UDP.PANDAS",
        # "./src/web_video_2_TCP.PANDAS",
    ],
    "web_streaming":[
        "./src/streaming_TCP.PANDAS",
        "./src/streaming_udp_UDP.PANDAS",
    ],
    "downloading_files":[
        "./src/downloading_files_TCP.PANDAS",
        "./src/downloading_files_1_TCP.PANDAS",
        # "./src/downloading_files_2_TCP.PANDAS",
        './src/torrent_tcp_TCP.PANDAS',
        './src/torrent_utp_UDP.PANDAS',
    ],
}

Обработка каждого файла, выделение признаков и загрузка в один датафрейм

In [24]:
all_results = []

for label, paths in file_paths.items():
    for path in paths:
        print(f"Обрабатываем файл: {path}")
        df = pd.read_pickle(path)
        
        features_list = extract_features_from_df(df)
        
        for features in features_list:
            features['label'] = label
            all_results.append(features)

final_df = pd.DataFrame(all_results)

Обрабатываем файл: ./src/Valheim_TCP.PANDAS
Обрабатываем файл: ./src/Dota2_UDP.PANDAS
Обрабатываем файл: ./src/CounterStrike2_UDP.PANDAS
Обрабатываем файл: ./src/Warhammer_UDP.PANDAS
Обрабатываем файл: ./src/riot_lol_TCP.PANDAS
Обрабатываем файл: ./src/web_surfing_TCP.PANDAS
Обрабатываем файл: ./src/web_serfing_tcp_udp_TCP.PANDAS
Обрабатываем файл: ./src/web_serfing_tcp_udp_UDP.PANDAS
Обрабатываем файл: ./src/web_serfing_1_TCP.PANDAS
Обрабатываем файл: ./src/web_serfing_1_UDP.PANDAS
Обрабатываем файл: ./src/web_video_UDP.PANDAS
Обрабатываем файл: ./src/web_video_TCP.PANDAS
Обрабатываем файл: ./src/web_video_1_TCP.PANDAS
Обрабатываем файл: ./src/web_video_1_UDP.PANDAS
Обрабатываем файл: ./src/streaming_TCP.PANDAS
Обрабатываем файл: ./src/streaming_udp_UDP.PANDAS
Обрабатываем файл: ./src/downloading_files_TCP.PANDAS
Обрабатываем файл: ./src/downloading_files_1_TCP.PANDAS
Обрабатываем файл: ./src/torrent_tcp_TCP.PANDAS
Обрабатываем файл: ./src/torrent_utp_UDP.PANDAS


Выввод финального датафрейма для наглядности

In [25]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
final_df

Unnamed: 0,mean_packet_size_client,std_packet_size_client,mean_packet_size_server,std_packet_size_server,mean_data_per_portion_client,std_data_per_portion_client,mean_data_per_portion_server,std_data_per_portion_server,mean_packets_per_portion_client,mean_packets_per_portion_server,efficiency_client,efficiency_server,byte_ratio,payload_ratio,packet_ratio,first_packet_size_client,second_packet_size_client,first_packet_size_server,second_packet_size_server,first_data_portion_client,second_data_portion_client,first_data_portion_server,second_data_portion_server,transport_protocol,label
0,54.000000,0.000000,1125.200000,591.727565,0.000000,0.000000,0.000000,0.000000,1.333333,3.333333,0.000000,0.000000,0.019197,0.000000,0.400000,54,54,60,1506,0,0,0,0,1,game
1,328.000000,354.375507,856.111111,598.883637,0.000000,0.000000,0.000000,0.000000,1.666667,4.500000,0.000000,0.000000,0.212849,0.000000,0.555556,212,212,890,819,0,0,0,0,1,game
2,779.571429,481.806012,,,0.000000,,,,14.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,262,951,0,0,0,0,0,0,1,game
3,179.666667,194.681963,727.375000,725.832121,0.000000,0.000000,0.000000,0.000000,1.500000,2.666667,0.000000,0.000000,0.185255,0.000000,0.750000,54,54,66,66,0,0,0,0,1,game
4,346.375000,373.476692,228.500000,306.343435,0.000000,0.000000,0.000000,0.000000,2.666667,3.000000,0.000000,0.000000,2.021152,0.000000,1.333333,54,212,60,60,0,0,0,0,1,game
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,68.165118,47.880846,983.464763,426.767130,71.312081,126.976261,6541.178017,5412.180240,1.594590,3.474159,0.656072,1.914465,0.031813,0.010902,0.458986,62,62,1434,590,40,0,10112,5656,0,downloading_files
396,67.200947,47.903514,1005.409779,427.065468,71.153128,129.830728,6462.129461,5246.194725,1.624532,3.353909,0.651764,1.916379,0.032375,0.011011,0.484370,62,62,626,626,0,40,2336,1096,0,downloading_files
397,68.348339,60.805162,1008.486686,426.946237,72.660878,160.201782,6615.961008,5379.498670,1.608193,3.422841,0.661050,1.916620,0.031841,0.010982,0.469817,62,62,626,626,120,80,10296,2784,0,downloading_files
398,69.255463,69.994130,1017.130690,427.049697,77.656100,184.927851,6580.052007,5303.787293,1.607574,3.374018,0.697510,1.917367,0.032442,0.011802,0.476457,62,65,1096,590,40,46,3204,20752,0,downloading_files


Подключение библиотеки sklearn для работы со случайным лесом, разбиванием выборки на тестовую и тренировочную и отчёта по точности модели

train_test_split - разбивание исходного датафрейма на обучающую и тестовую выборки(в данном случае 60 на 40)

RandomForestClassifier - построение модели классификатора на базе Случайного леса

print(classification_report(y_test, y_pred)) - вывод результатов точности модели на тестовой выборке

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = final_df.drop(columns=['label'])
y = final_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify=y, 
                                                    test_size=0.4, 
                                                    # random_state=42,
                                                   )

model = RandomForestClassifier(
                                class_weight='balanced',
                               # random_state=42,
                               )
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                       precision    recall  f1-score   support

    downloading_files       1.00      1.00      1.00        32
                 game       1.00      0.80      0.89        40
web_serfing_and_video       0.89      1.00      0.94        72
        web_streaming       0.93      0.88      0.90        16

             accuracy                           0.94       160
            macro avg       0.96      0.92      0.93       160
         weighted avg       0.94      0.94      0.94       160



Функция для предсказания класса новых фреймов

Внутрь подаётся путь к файлу, модель, которая использовалась для классификации и количество частей разбиения.

Для того, чтобы избежать неточностей, нужно разбивать фреймы при обучении и предсказании на одно и то же количество частей!

In [27]:
def predict_class(file_path, model, num_parts=20):
    try:
        full_df = pd.read_pickle(file_path)
    except Exception as e:
        return f"Ошибка при загрузке файла: {e}"

    if full_df.empty:
        return "Файл пуст"

    parts = np.array_split(full_df, num_parts)

    class_names = model.classes_
    all_probas = []

    for i, part_df in enumerate(parts):
        try:
            features = extract_features_from_df_(part_df)
            if not features:
                continue
            features_df = pd.DataFrame([features])
            probas = model.predict_proba(features_df)[0]
            all_probas.append(probas)
        except Exception as e:
            print(f"Ошибка в части {i}: {e}")
            continue

    if not all_probas:
        return "Не удалось извлечь признаки ни из одной части"

    mean_probas = np.mean(all_probas, axis=0)
    predicted_idx = np.argmax(mean_probas)
    predicted_class = class_names[predicted_idx]

    proba_str = "\n".join([
        f"{class_name}: {mean_probas[i]:.3f}" for i, class_name in enumerate(class_names)
    ])

    return f"""Предсказанный класс: {predicted_class}

Вероятности:
{proba_str}
"""

In [28]:
print(predict_class('/Users/mac/Desktop/NSTU/project/Rostelecom_game_dump/Verdun_UDP.PANDAS',model))

Предсказанный класс: game

Вероятности:
downloading_files: 0.067
game: 0.494
web_serfing_and_video: 0.316
web_streaming: 0.124



In [29]:
print(predict_class('/Users/mac/Desktop/NSTU/project/Rostelecom_game_dump/Warhammer_UDP.PANDAS',model))

Предсказанный класс: game

Вероятности:
downloading_files: 0.021
game: 0.828
web_serfing_and_video: 0.137
web_streaming: 0.014



In [30]:
print(predict_class("/Users/mac/Desktop/NSTU/project/Documents_/web_serfing_2_UDP.PANDAS",model))

Предсказанный класс: web_serfing_and_video

Вероятности:
downloading_files: 0.020
game: 0.111
web_serfing_and_video: 0.859
web_streaming: 0.009



In [31]:
print(predict_class("/Users/mac/Desktop/NSTU/project/Documents_/web_video_2_UDP.PANDAS",model))

Предсказанный класс: web_serfing_and_video

Вероятности:
downloading_files: 0.104
game: 0.080
web_serfing_and_video: 0.710
web_streaming: 0.108



In [32]:
print(predict_class("/Users/mac/Downloads/Documents/streaming_TCP.PANDAS",model))

Предсказанный класс: web_streaming

Вероятности:
downloading_files: 0.035
game: 0.157
web_serfing_and_video: 0.042
web_streaming: 0.767



In [33]:
print(predict_class("/Users/mac/Desktop/NSTU/project/Documents_/downloading_files_2_TCP.PANDAS",model))

Предсказанный класс: downloading_files

Вероятности:
downloading_files: 0.777
game: 0.076
web_serfing_and_video: 0.033
web_streaming: 0.115



Вывод признаков и их важности для модели.

In [34]:
importances = model.feature_importances_
features = X.columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.DataFrame({'feature': features, 'importance': importances}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1,std_packet_size_client,0.112085
0,mean_packet_size_client,0.110758
12,byte_ratio,0.108962
4,mean_data_per_portion_client,0.065676
2,mean_packet_size_server,0.060678
16,second_packet_size_client,0.05033
5,std_data_per_portion_client,0.045515
10,efficiency_client,0.044374
14,packet_ratio,0.043168
15,first_packet_size_client,0.04199
