In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
import bson


In [2]:
# Load multiple BSON objects from a file
def load_multiple_bson_objects(file_path):
    """Read a BSON file and return its decoded documents as a list.

    The file at ``file_path`` is opened in binary mode and read fully into
    memory; the bytes are passed to ``bson.decode_all`` and every decoded
    document is collected, in order, into a new list.
    """
    with open(file_path, 'rb') as bson_file:
        raw_bytes = bson_file.read()
    return list(bson.decode_all(raw_bytes))

In [3]:
# Location of the network-usage BSON file that the next cell loads.
# NOTE(review): absolute, machine-specific Windows path — consider deriving
# it from a configurable data directory.
network_usage_logs = r"D:\mongofiles\data\mydatabase\network_usage_logs.bson"

In [4]:
# Decode every document stored in the BSON file into a list.
network_usage_data = load_multiple_bson_objects(file_path=network_usage_logs)

In [5]:
# Turn the list of decoded documents into a DataFrame (one row per document).
network_usage_data = pd.DataFrame(data=network_usage_data)

In [6]:
# Preview the first five rows of the freshly built DataFrame.
network_usage_data.head(n=5)

Unnamed: 0,_id,user_id,username,timestamp,bytes_sent,bytes_recv,packets_sent,packets_recv,errin,errout,dropin,dropout,total_connections,status_counts,type_counts
0,665053963d97c3911fb3f036,66327fdb321870545078d205,user2,2024-05-24 12:45:10.477,40612106,360611960,127164,878909,0,0,0,0,2242,"{'LISTEN': 30, 'TIME_WAIT': 2008, 'NONE': 53, ...","{'SocketKind.SOCK_STREAM': 2189, 'SocketKind.S..."
1,665053eae3be87291c85cd7a,66403324c6d3b6e77f917976,user1,2024-05-24 12:46:34.692,41305634,363551102,129835,897377,0,0,0,0,2009,"{'TIME_WAIT': 1780, 'NONE': 49, 'ESTABLISHED':...","{'SocketKind.SOCK_STREAM': 1960, 'SocketKind.S..."
2,66505426e3be87291c85cd7c,66403324c6d3b6e77f917976,user1,2024-05-24 12:47:34.891,41516330,365038815,130940,909242,0,0,0,0,1777,"{'TIME_WAIT': 1544, 'NONE': 52, 'ESTABLISHED':...","{'SocketKind.SOCK_STREAM': 1725, 'SocketKind.S..."
3,6650557ef5bebd88f01ed21b,66322d94f89c7cd14b365ea8,mahad,2024-05-24 12:53:18.344,44254058,399045518,142220,988754,0,0,0,0,2192,"{'TIME_WAIT': 1980, 'ESTABLISHED': 131, 'NONE'...","{'SocketKind.SOCK_STREAM': 2142, 'SocketKind.S..."
4,665057e7b4bee6cd9cb5320f,66322d94f89c7cd14b365ea8,mahad,2024-05-24 13:03:35.432,96987861,541238403,168122,1190498,0,0,0,0,2176,"{'TIME_WAIT': 1959, 'ESTABLISHED': 135, 'NONE'...","{'SocketKind.SOCK_STREAM': 2126, 'SocketKind.S..."


In [7]:
# List the column labels of the raw DataFrame.
network_usage_data.columns

Index(['_id', 'user_id', 'username', 'timestamp', 'bytes_sent', 'bytes_recv',
       'packets_sent', 'packets_recv', 'errin', 'errout', 'dropin', 'dropout',
       'total_connections', 'status_counts', 'type_counts'],
      dtype='object')

In [8]:
# Inspect the dtype of every column.
network_usage_data.dtypes

_id                          object
user_id                      object
username                     object
timestamp            datetime64[ns]
bytes_sent                    int64
bytes_recv                    int64
packets_sent                  int64
packets_recv                  int64
errin                         int64
errout                        int64
dropin                        int64
dropout                       int64
total_connections             int64
status_counts                object
type_counts                  object
dtype: object

In [9]:
# Look at the 'type_counts' value stored in the row labelled 0.
network_usage_data.loc[0, 'type_counts']

{'SocketKind.SOCK_STREAM': 2189, 'SocketKind.SOCK_DGRAM': 53}

In [10]:
# Look at the 'status_counts' value stored in the row labelled 0.
network_usage_data.loc[0, 'status_counts']

{'LISTEN': 30,
 'TIME_WAIT': 2008,
 'NONE': 53,
 'ESTABLISHED': 149,
 'CLOSE_WAIT': 2}

In [11]:
# Split a dictionary-valued column into one column per key.
def split_dict_column(df, column_name):
    """Expand a column of dictionaries into one column per dictionary key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. It is not modified.
    column_name : str
        Name of the column whose cells are dictionaries.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column_name`` and with one additional column
        per key found in the dictionaries, in order of first appearance.
        Rows whose dictionary lacks a key get NaN in that key's column.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    ValueError
        If an expanded key collides with an existing column name (raised by
        ``DataFrame.join``).
    """
    # Cells that are not dictionaries (None, NaN, ...) are treated as empty
    # mappings so they yield an all-NaN row rather than a stray column.
    records = [cell if isinstance(cell, dict) else {} for cell in df[column_name]]
    # Build the expanded frame once from the list of records instead of
    # constructing one Series per row; reuse df's index so the join aligns.
    expanded = pd.DataFrame(records, index=df.index)
    return df.drop(columns=column_name).join(expanded)


In [12]:
# Expand the 'type_counts' dictionaries into one column per key.
network_usage_data = split_dict_column(df=network_usage_data, column_name='type_counts')

In [13]:
# Expand the 'status_counts' dictionaries into one column per key.
network_usage_data = split_dict_column(df=network_usage_data, column_name='status_counts')

In [14]:
# Preview the first five rows after the dictionary columns were expanded.
network_usage_data.head(n=5)

Unnamed: 0,_id,user_id,username,timestamp,bytes_sent,bytes_recv,packets_sent,packets_recv,errin,errout,...,total_connections,SocketKind.SOCK_STREAM,SocketKind.SOCK_DGRAM,LISTEN,TIME_WAIT,NONE,ESTABLISHED,CLOSE_WAIT,SYN_SENT,LAST_ACK
0,665053963d97c3911fb3f036,66327fdb321870545078d205,user2,2024-05-24 12:45:10.477,40612106,360611960,127164,878909,0,0,...,2242,2189,53,30.0,2008.0,53.0,149.0,2.0,,
1,665053eae3be87291c85cd7a,66403324c6d3b6e77f917976,user1,2024-05-24 12:46:34.692,41305634,363551102,129835,897377,0,0,...,2009,1960,49,30.0,1780.0,49.0,148.0,1.0,1.0,
2,66505426e3be87291c85cd7c,66403324c6d3b6e77f917976,user1,2024-05-24 12:47:34.891,41516330,365038815,130940,909242,0,0,...,1777,1725,52,30.0,1544.0,52.0,150.0,,,1.0
3,6650557ef5bebd88f01ed21b,66322d94f89c7cd14b365ea8,mahad,2024-05-24 12:53:18.344,44254058,399045518,142220,988754,0,0,...,2192,2142,50,30.0,1980.0,50.0,131.0,1.0,,
4,665057e7b4bee6cd9cb5320f,66322d94f89c7cd14b365ea8,mahad,2024-05-24 13:03:35.432,96987861,541238403,168122,1190498,0,0,...,2176,2126,50,30.0,1959.0,50.0,135.0,1.0,1.0,


In [15]:
# Check the column labels after expanding 'type_counts' and 'status_counts'.
network_usage_data.columns


Index(['_id', 'user_id', 'username', 'timestamp', 'bytes_sent', 'bytes_recv',
       'packets_sent', 'packets_recv', 'errin', 'errout', 'dropin', 'dropout',
       'total_connections', 'SocketKind.SOCK_STREAM', 'SocketKind.SOCK_DGRAM',
       'LISTEN', 'TIME_WAIT', 'NONE', 'ESTABLISHED', 'CLOSE_WAIT', 'SYN_SENT',
       'LAST_ACK'],
      dtype='object')

In [16]:
# Remove the '_id' and 'user_id' identifier columns.
# The result is rebound instead of mutating in place, and errors='ignore'
# keeps the cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
network_usage_data = network_usage_data.drop(columns=['_id', 'user_id'], errors='ignore')

In [17]:
# Confirm that '_id' and 'user_id' are no longer among the columns.
network_usage_data.columns

Index(['username', 'timestamp', 'bytes_sent', 'bytes_recv', 'packets_sent',
       'packets_recv', 'errin', 'errout', 'dropin', 'dropout',
       'total_connections', 'SocketKind.SOCK_STREAM', 'SocketKind.SOCK_DGRAM',
       'LISTEN', 'TIME_WAIT', 'NONE', 'ESTABLISHED', 'CLOSE_WAIT', 'SYN_SENT',
       'LAST_ACK'],
      dtype='object')

In [18]:
# Use the 'timestamp' column as the row index.
# The guard keeps the cell re-runnable: once 'timestamp' has been moved into
# the index it is no longer a column, and set_index would raise KeyError.
if 'timestamp' in network_usage_data.columns:
    network_usage_data = network_usage_data.set_index('timestamp')

In [19]:
# Column labels after 'timestamp' was moved into the index.
network_usage_data.columns

Index(['username', 'bytes_sent', 'bytes_recv', 'packets_sent', 'packets_recv',
       'errin', 'errout', 'dropin', 'dropout', 'total_connections',
       'SocketKind.SOCK_STREAM', 'SocketKind.SOCK_DGRAM', 'LISTEN',
       'TIME_WAIT', 'NONE', 'ESTABLISHED', 'CLOSE_WAIT', 'SYN_SENT',
       'LAST_ACK'],
      dtype='object')

In [20]:
# Preview the first five rows of the timestamp-indexed DataFrame.
network_usage_data.head(n=5)

Unnamed: 0_level_0,username,bytes_sent,bytes_recv,packets_sent,packets_recv,errin,errout,dropin,dropout,total_connections,SocketKind.SOCK_STREAM,SocketKind.SOCK_DGRAM,LISTEN,TIME_WAIT,NONE,ESTABLISHED,CLOSE_WAIT,SYN_SENT,LAST_ACK
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2024-05-24 12:45:10.477,user2,40612106,360611960,127164,878909,0,0,0,0,2242,2189,53,30.0,2008.0,53.0,149.0,2.0,,
2024-05-24 12:46:34.692,user1,41305634,363551102,129835,897377,0,0,0,0,2009,1960,49,30.0,1780.0,49.0,148.0,1.0,1.0,
2024-05-24 12:47:34.891,user1,41516330,365038815,130940,909242,0,0,0,0,1777,1725,52,30.0,1544.0,52.0,150.0,,,1.0
2024-05-24 12:53:18.344,mahad,44254058,399045518,142220,988754,0,0,0,0,2192,2142,50,30.0,1980.0,50.0,131.0,1.0,,
2024-05-24 13:03:35.432,mahad,96987861,541238403,168122,1190498,0,0,0,0,2176,2126,50,30.0,1959.0,50.0,135.0,1.0,1.0,


In [21]:
# Next:
# Apply the processing steps