In [None]:
import os

import pandas as pd

# Build the per-user node table (nodes.csv): behavioural aggregates from the
# logon, device, http and email logs, joined with the psychometric scores.

# Load the raw logs.
logon_df = pd.read_csv("data/cert4.2/logon.csv")
device_df = pd.read_csv("data/cert4.2/device.csv")
email_df = pd.read_csv("data/cert4.2/email.csv")
http_df = pd.read_csv("data/cert4.2/http.csv")
psych_df = pd.read_csv("data/cert4.2/psychometric.csv")

# Parse timestamps and derive the hour of day. Unparseable dates become NaT
# (errors='coerce'), which yields a NaN hour for that row.
for df in [logon_df, device_df, email_df, http_df]:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['hour'] = df['date'].dt.hour


def count_after_hours(hours):
    """Return how many hour values are before 07:00 or at/after 19:00.

    NaN hours compare as False on both sides and are therefore not counted.
    """
    return ((hours < 7) | (hours > 18)).sum()


# Per-user behavioural aggregates.
# NOTE(review): login_count / usb_use_count count every row of the log,
# whatever its 'activity' value is. If the logs also contain logoff /
# disconnect events, those are included too - confirm this is intended.
logon_agg = logon_df.groupby('user').agg(
    login_count=('activity', 'count'),
    after_hours_logins=('hour', count_after_hours),
    avg_logon_hour=('hour', 'mean'),
)

device_agg = device_df.groupby('user').agg(
    usb_use_count=('activity', 'count'),
    usb_after_hours=('hour', count_after_hours),
)

http_agg = http_df.groupby('user').agg(
    http_visit_count=('url', 'count'),
    unique_sites=('url', 'nunique'),
)

email_agg = email_df.groupby('user').agg(
    email_sent_count=('id', 'count'),
    avg_email_size=('size', 'mean'),
    avg_attachment_count=('attachments', 'mean'),
)

# Outer joins keep every user that appears in at least one of the logs.
users_df = (
    logon_agg
    .join(device_agg, how='outer')
    .join(http_agg, how='outer')
    .join(email_agg, how='outer')
    .reset_index()
)

# Attach the psychometric columns; users without a psychometric record are kept.
psych_df = psych_df.rename(columns={'user_id': 'user'})
users_df = users_df.merge(psych_df, on='user', how='left')

# Missing values (no activity in a log, or no psychometric record) become 0.
users_df = users_df.fillna(0)

# The node id is the user name; 'user' is also kept as a regular column.
users_df['id'] = users_df['user']
users_df = users_df.set_index('id')

# Make sure the output directory exists so the write cannot fail after all
# the expensive loading and aggregation above.
os.makedirs("data/cert4.2/out", exist_ok=True)
users_df.to_csv("data/cert4.2/out/nodes.csv")


In [15]:
# Build the resource table (resources.csv): one row per distinct
# (resource, resource_type) pair, each with a generated 'R<n>' identifier.

# Email resources: every address found in the ';'-separated 'to' field.
resource_email = (
    email_df[['to']]
    .dropna()
    .assign(resource=lambda frame: frame['to'].str.split(';'))
    .explode('resource')
    .drop(columns=['to'])
    .assign(resource_type='email')
)

# HTTP resources: every visited URL.
resource_http = (
    http_df[['url']]
    .dropna()
    .rename(columns={'url': 'resource'})
    .assign(resource_type='url')
)

# PC resources come from the logon log; "usb" resources are the 'pc' values
# of the device log (the machine on which the device activity was recorded).
resource_pc = (
    logon_df[['pc']]
    .dropna()
    .rename(columns={'pc': 'resource'})
    .assign(resource_type='pc')
)

resource_usb = (
    device_df[['pc']]
    .dropna()
    .rename(columns={'pc': 'resource'})
    .assign(resource_type='usb')
)

# Stack all resource kinds, keep the first occurrence of each
# (resource, resource_type) pair and number the survivors sequentially.
all_resources = [resource_email, resource_http, resource_pc, resource_usb]
resources_df = pd.concat(all_resources, ignore_index=True).drop_duplicates()
resources_df['resource_id'] = [f'R{position}' for position in range(len(resources_df))]

# Persist without the (non-contiguous) DataFrame index.
resources_df.to_csv("data/cert4.2/out/resources.csv", index=False)

In [16]:
# Build the edge list (edges.csv): one row per user -> resource interaction,
# with the resource referenced by its resource_id from resources.csv.

# Email edges (user -> recipient): the ';'-separated 'to' field is split so
# that every recipient becomes its own edge.
email_edges = email_df[['user', 'to']].dropna().copy()
email_edges['target'] = email_edges['to'].str.split(';')
email_edges = email_edges.explode('target').drop(columns=['to'])
email_edges['edge_type'] = 'email'

# HTTP edges (user -> url).
http_edges = http_df[['user', 'url']].dropna().copy()
http_edges = http_edges.rename(columns={'url': 'target'})
http_edges['edge_type'] = 'http'

# PC and USB edges (user -> pc); both use the 'pc' column of their log.
logon_edges = logon_df[['user', 'pc']].dropna().copy().rename(columns={'pc': 'target'})
logon_edges['edge_type'] = 'pc'

device_edges = device_df[['user', 'pc']].dropna().copy().rename(columns={'pc': 'target'})
device_edges['edge_type'] = 'usb'

# Stack all edge kinds.
edges_df = pd.concat([email_edges, http_edges, logon_edges, device_edges], ignore_index=True)

# Attach resource_id.
# The same resource string can exist under several resource types in
# resources.csv (a PC name is stored once as 'pc' and once as 'usb').
# Joining on the string alone would therefore duplicate every such edge and
# point one copy at the wrong-typed resource, so the join also matches the
# resource type that corresponds to the edge type.
edge_to_resource_type = {'email': 'email', 'http': 'url', 'pc': 'pc', 'usb': 'usb'}
edges_df['resource_type'] = edges_df['edge_type'].map(edge_to_resource_type)

resources_df = pd.read_csv("data/cert4.2/out/resources.csv")
resource_lookup = resources_df.rename(columns={'resource': 'target'})
edges_df = edges_df.merge(resource_lookup, on=['target', 'resource_type'], how='inner')

# Final layout: source_user, target_resource, edge_type.
edges_df_final = edges_df[['user', 'resource_id', 'edge_type']]
edges_df_final = edges_df_final.rename(columns={'user': 'source_user', 'resource_id': 'target_resource'})

# Persist.
edges_df_final.to_csv("data/cert4.2/out/edges.csv", index=False)
