In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from datetime import datetime
import re
import bson

 # simple Exploration

In [2]:
# Define function to load multiple BSON objects from a file
def load_multiple_bson_objects(file_path):
    data = []
    with open(file_path, 'rb') as file:
        for document in bson.decode_all(file.read()):
            data.append(document)
    return data


In [3]:
general_data_path="D:\mongofiles\data\mydatabase\general_data.bson"

In [4]:
general_data = load_multiple_bson_objects(general_data_path)

In [5]:
# Convert general data to a DataFrame
general_data = pd.DataFrame(general_data)

In [6]:
general_data.shape

(3, 16)

In [7]:
general_data.columns

Index(['_id', 'user_id', 'username', 'ip', 'mac', 'location', 'city',
       'country', 'coordinates', 'isp', 'number_of_screens', 'browser_info',
       'browser_version', 'platform', 'user_agent', 'timestamp'],
      dtype='object')

In [8]:
general_data.head()

Unnamed: 0,_id,user_id,username,ip,mac,location,city,country,coordinates,isp,number_of_screens,browser_info,browser_version,platform,user_agent,timestamp
0,665057e8b4bee6cd9cb53210,66322d94f89c7cd14b365ea8,mahad,94.204.58.82,F4:39:09:77:D5:5B,,,,"[N/A, N/A]",,1,,,,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:1...,2024-05-24 13:03:36.930
1,665058db079089a76c98e652,66322d94f89c7cd14b365ea8,mahad,94.204.58.82,F4:39:09:77:D5:5B,Dubai,Dubai,United Arab Emirates,"[25.0731, 55.298]",Emirates Integrated Telecommunications Company...,1,,,,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:1...,2024-05-24 13:07:39.245
2,66509f82dfa5d34b327309f2,66322d94f89c7cd14b365ea8,mahad,94.204.58.82,F4:39:09:77:D5:5B,Dubai,Dubai,United Arab Emirates,"[25.0731, 55.298]",Emirates Integrated Telecommunications Company...,1,,,,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:1...,2024-05-24 18:09:06.004


In [9]:
# Convert '_id' to string if it is not already
general_data.drop(columns=['_id'],inplace=True)
# Replace 'N/A' with NaN
general_data.replace('N/A', np.nan, inplace=True)
general_data.fillna({
    'browser_info': 'Unknown',
    'browser_version': 'Unknown',
    'platform': 'Unknown'
}, inplace=True)

In [10]:
# Extract browser and OS from 'user_agent'
def extract_browser_info(user_agent):
    if pd.isna(user_agent):
        return 'Unknown', 'Unknown'
    browser = re.search(r'(Firefox|Chrome|Safari|Opera|Edge|MSIE)', user_agent)
    os = re.search(r'\((.*?)\)', user_agent)
    return (browser.group(0) if browser else 'Unknown', os.group(1) if os else 'Unknown')

general_data['browser'],general_data['os'] = zip(*general_data['user_agent'].apply(extract_browser_info))
general_data.drop(columns=['user_agent'],inplace=True)


In [11]:
# Standardize 'number_of_screens'
scaler = StandardScaler()
general_data['number_of_screens'] = scaler.fit_transform(general_data[['number_of_screens']])

In [12]:
# Label encoding for 'user_id'
label_encoder = LabelEncoder()
general_data['user_id'] = label_encoder.fit_transform(general_data['user_id'])


In [13]:
general_data.head()

Unnamed: 0,user_id,username,ip,mac,location,city,country,coordinates,isp,number_of_screens,browser_info,browser_version,platform,timestamp,browser,os
0,0,mahad,94.204.58.82,F4:39:09:77:D5:5B,,,,"[N/A, N/A]",,0.0,Unknown,Unknown,Unknown,2024-05-24 13:03:36.930,Firefox,Windows NT 10.0; Win64; x64; rv:126.0
1,0,mahad,94.204.58.82,F4:39:09:77:D5:5B,Dubai,Dubai,United Arab Emirates,"[25.0731, 55.298]",Emirates Integrated Telecommunications Company...,0.0,Unknown,Unknown,Unknown,2024-05-24 13:07:39.245,Firefox,Windows NT 10.0; Win64; x64; rv:126.0
2,0,mahad,94.204.58.82,F4:39:09:77:D5:5B,Dubai,Dubai,United Arab Emirates,"[25.0731, 55.298]",Emirates Integrated Telecommunications Company...,0.0,Unknown,Unknown,Unknown,2024-05-24 18:09:06.004,Firefox,Windows NT 10.0; Win64; x64; rv:126.0


# more steps TODO
# drop NaN
# encode the catigorical features

# END