In [None]:
import os
import pandas as pd
import numpy as np
import pytest
from pathlib import Path

from data_pipeline import (
    load_and_concat_csvs,
    add_datetime_index,
    engineer_time_features,
    clean_data,
    apply_transformers,
    bin_features,
    map_labels,
    add_time_feature,
    drop_and_select,
    split_and_balance,
)
from sklearn.preprocessing import FunctionTransformer

  df_out.loc[:, present] = transformer.fit_transform(data)
  df_out.loc[:, present] = transformer.fit_transform(data)
  df_out.loc[:, present] = transformer.fit_transform(data)
  df_out.loc[:, present] = transformer.fit_transform(data)
  0.20337897]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_out.loc[:, present] = transformer.fit_transform(data)
  0.59244364]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_out.loc[:, present] = transformer.fit_transform(data)
  0.20337897]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_out.loc[:, present] = transformer.fit_transform(data)
  0.59244364]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_out.loc[:, present] = transformer.fit_transform(data)
  1.25      ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_out.loc[:, pr

SMOTE will use k_neighbors=4 (min class count = 5)


In [None]:
@pytest.fixture
def tmp_csv_dir(tmp_path):
    """Create a temporary directory holding two CSV files for load_and_concat_csvs.

    The directory ``<tmp_path>/data`` receives ``Monday.csv`` and
    ``Tuesday.csv``; each file has two rows with columns ``A`` and ``Label``
    and is written without the DataFrame index.

    Returns:
        The path of the created directory.
    """
    data_dir = tmp_path / "data"
    data_dir.mkdir()

    # File name -> content; written in insertion order (Monday first).
    frames_by_file = {
        "Monday.csv": pd.DataFrame({
            'A': [1, 2],
            'Label': ['BENIGN', 'DDoS'],
        }),
        "Tuesday.csv": pd.DataFrame({
            'A': [3, 4],
            'Label': ['Bot', 'PortScan'],
        }),
    }
    for file_name, frame in frames_by_file.items():
        frame.to_csv(data_dir / file_name, index=False)

    return data_dir

In [None]:
def test_load_and_concat_csvs(tmp_csv_dir):
    """Loading the fixture directory yields 4 rows tagged with a 'Day' column.

    The fixture holds Monday.csv and Tuesday.csv (2 rows each); the expected
    'Day' values are exactly 'Monday' and 'Tuesday'.
    """
    combined = load_and_concat_csvs(tmp_csv_dir)

    assert 'Day' in combined.columns
    # Two files with two rows each.
    assert len(combined) == 4
    observed_days = set(combined['Day'])
    assert observed_days == {'Monday', 'Tuesday'}

In [None]:
def test_load_and_concat_csvs_rows_per_day(tmp_csv_dir):
    """Each source CSV contributes exactly its own rows under its 'Day' tag.

    This cell used to be a byte-for-byte duplicate of the preceding
    ``test_load_and_concat_csvs`` definition, which silently shadowed it and
    added no coverage. It now has its own name and checks the per-day row
    counts: the fixture writes 2 rows to Monday.csv and 2 rows to Tuesday.csv.

    NOTE(review): assumes 'Day' is derived from the file each row was read
    from - confirm against load_and_concat_csvs.
    """
    combined = load_and_concat_csvs(tmp_csv_dir)

    assert 'Day' in combined.columns
    rows_per_day = combined['Day'].value_counts().to_dict()
    assert rows_per_day == {'Monday': 2, 'Tuesday': 2}

In [None]:
def test_add_datetime_index(tmp_csv_dir):
    """add_datetime_index drops 'Day' and produces a datetime index.

    The unique index values must equal the timestamps given in the
    day -> datetime-string mapping passed to the function.
    """
    day_to_timestamp = {
        'Monday': '2023-01-01 00:00:00',
        'Tuesday': '2023-01-02 00:00:00',
    }
    loaded = load_and_concat_csvs(tmp_csv_dir)
    indexed = add_datetime_index(loaded, day_to_timestamp)

    assert 'Day' not in indexed.columns
    assert pd.api.types.is_datetime64_any_dtype(indexed.index)

    wanted_stamps = set(pd.to_datetime(list(day_to_timestamp.values())))
    found_stamps = set(indexed.index.unique())
    assert found_stamps == wanted_stamps

In [None]:
def test_engineer_time_features():
    """engineer_time_features adds 'dow' and 'hour' columns from the index.

    Checked on two timestamps of an otherwise empty, datetime-indexed frame.
    """
    stamps = pd.to_datetime(['2021-01-01 05:00', '2021-01-02 15:30'])
    featured = engineer_time_features(pd.DataFrame(index=stamps))

    assert 'dow' in featured.columns and 'hour' in featured.columns

    # (expected dow, expected hour) per timestamp, in index order:
    #   2021-01-01 is a Friday (4) at 05:00
    #   2021-01-02 is a Saturday (5) at 15:30
    expected_by_stamp = [(4, 5), (5, 15)]
    for stamp, (want_dow, want_hour) in zip(stamps, expected_by_stamp):
        assert featured.loc[stamp, 'dow'] == want_dow
        assert featured.loc[stamp, 'hour'] == want_hour

In [None]:
def test_clean_data():
    """clean_data leaves 'num' finite and non-negative, with no NaNs anywhere.

    The input contains a negative value and a positive infinity in 'num'.
    The intent is that negatives and infinities are replaced and that no
    missing values remain in the result.
    """
    raw = pd.DataFrame({
        'num': [1, -5, np.inf, 2],
        'Init_Win_bytes_forward': [0, 0, 0, 0],
        'Label': ['a', 'b', 'c', 'd']
    })
    cleaned = clean_data(raw)
    num = cleaned['num']

    # np.inf >= 0 is True, so the sign check alone cannot detect an infinity
    # that survived cleaning; check finiteness explicitly.
    assert np.isfinite(num).all()
    # Negative values must have been replaced.
    assert (num >= 0).all()
    # No missing values may remain in any column.
    assert not cleaned.isna().any().any()

In [None]:
def test_apply_transformers():
    """apply_transformers applies the named transformer to the listed columns.

    A log1p FunctionTransformer is registered under the key 'log1p' and
    mapped to column 'x'; the returned 'x' must equal log1p of the input.
    """
    source = pd.DataFrame({'x': [0, 1, 2, 3]})
    columns_by_transformer = {'log1p': ['x']}
    registry = {'log1p': FunctionTransformer(np.log1p, validate=False)}

    transformed = apply_transformers(source, columns_by_transformer, registry)

    # The reference is computed from `source` after the call, so this also
    # fails if apply_transformers modified its input frame in place.
    reference = np.log1p(source['x'])
    assert np.allclose(transformed['x'], reference)

In [None]:
def test_bin_features():
    """bin_features adds a '<col>_bin' column covering every bin code.

    Five distinct values binned into five bins must produce the codes 0..4.
    """
    values = pd.DataFrame({'v': [10, 20, 30, 40, 50]})
    binned = bin_features(values, ['v'], n_bins=5)

    assert 'v_bin' in binned.columns
    observed_codes = set(binned['v_bin'])
    assert observed_codes == {0, 1, 2, 3, 4}

In [None]:
def test_map_labels():
    """map_labels adds 'Category' and 'label_code' columns from a category map.

    The map goes from category name to the list of raw labels it covers.
    """
    category_map = {
        'BENIGN': ['BENIGN'],
        'DoS': ['DDoS'],
        'Bot_Infiltration': ['Bot'],
    }
    labelled = map_labels(
        pd.DataFrame({'Label': ['BENIGN', 'DDoS', 'Bot']}),
        category_map,
    )

    assert 'Category' in labelled.columns and 'label_code' in labelled.columns
    assert labelled.loc[0, 'Category'] == 'BENIGN'
    # 'DoS' sits at position 1 in the category dictionary.
    assert labelled.loc[1, 'label_code'] == 1

In [None]:
def test_add_time_feature():
    """add_time_feature writes a 'time' column equal to dow * 24 + hour."""
    days = [0, 2]
    hours = [5, 18]
    with_time = add_time_feature(pd.DataFrame({'dow': days, 'hour': hours}))

    # Rows are checked in order; the frame uses the default 0..n-1 index.
    for row, (dow, hour) in enumerate(zip(days, hours)):
        assert with_time.loc[row, 'time'] == dow * 24 + hour

In [None]:
def test_drop_and_select():
    """drop_and_select removes dropped columns and keeps the required ones.

    'B' is dropped and 'A' is selected; 'label_code' and 'composite' must
    still be present in the result.
    """
    single_row = pd.DataFrame({
        'A': [1], 'B': [2], 'label_code': [0], 'composite': [10]
    })
    # Drop B, keep A; the dict and the trailing list argument are passed empty.
    selected = drop_and_select(single_row, ['B'], {}, ['A'], [])

    for kept in ('A', 'label_code', 'composite'):
        assert kept in selected.columns
    assert 'B' not in selected.columns

In [None]:
def test_split_and_balance():
    """split_and_balance returns the six expected datasets.

    Uses a minimal example: two classes with two samples each.

    NOTE(review): the final row-count equality only holds if balancing adds
    no rows to the training split for this input - confirm against
    split_and_balance.
    """
    samples = pd.DataFrame({
        'f': [1, 2, 3, 4],
        'label_code': [0, 0, 1, 1],
        'composite': [0, 1, 168, 169],
    })
    splits = split_and_balance(
        samples,
        test_size=0.5,
        val_split=0.5,
        random_state=0,
        smote_state=0,
    )

    # Exactly six keys are expected.
    expected_keys = {
        'X_train_bal', 'y_train_bal',
        'X_val', 'y_val',
        'X_test', 'y_test',
    }
    assert set(splits.keys()) == expected_keys

    # The row counts of all feature splits must add up to the input size.
    feature_parts = ('X_train_bal', 'X_val', 'X_test')
    row_total = sum(splits[part].shape[0] for part in feature_parts)
    assert row_total == samples.shape[0]

In [None]:
def test_export_datasets(tmp_path):
    """export_datasets writes one '<key>.csv' file per dataset.

    The target directory does not exist beforehand; the export is expected
    to create it.
    """
    # export_datasets is missing from the notebook's top-level data_pipeline
    # import list, so calling it raised NameError. Import it here so the test
    # can run.
    # NOTE(review): assumes export_datasets lives in data_pipeline - confirm.
    from data_pipeline import export_datasets

    first = pd.DataFrame({'x': [1, 2]})
    second = pd.DataFrame({'y': [3, 4]})
    out_dir = tmp_path / "out"

    # The export is expected to create the directory automatically.
    export_datasets({'a': first, 'b': second}, str(out_dir))

    assert (out_dir / "a.csv").exists()
    assert (out_dir / "b.csv").exists()