In [8]:
import pandas as pd

In [9]:
df = pd.read_csv("device.csv")

In [10]:
import pickle

with open('assignments.pkl', 'rb') as f:
    self_dict = pickle.load(f)

In [11]:
with open('shared.pkl', 'rb') as f:
    share_dict = pickle.load(f)

In [12]:
import warnings
warnings.filterwarnings("ignore")
self = []
for i in df.index:
    if df['pc'][i] == self_dict[df['user'][i]]:
        self.append('self')
    elif df['pc'][i] in share_dict:
        self.append('shared')
    else:
        self.append('other')
df['pc'] = self

In [18]:
set(self)

{'other', 'self'}

In [16]:
df['pc'].unique

<bound method Series.unique of 0          PC-5849
1          PC-6961
2          PC-1570
3          PC-6427
4          PC-6961
            ...   
1551823    PC-7733
1551824    PC-8657
1551825    PC-8657
1551826    PC-6551
1551827    PC-6551
Name: pc, Length: 1551828, dtype: object>

In [4]:
import pandas as pd
from collections import defaultdict

def process_device_file(file_path):

    df_device = pd.read_csv(file_path)
    df_device['date'] = pd.to_datetime(df_device['date'])

    user_pc_mapping = defaultdict(lambda: defaultdict(int))
    
    for index, row in df_device.iterrows():
        user = row['user']
        pc = row['pc']
        activity = row['activity']
        
 
        if activity == 'Connect':
            user_pc_mapping[user][pc] += 1
    
    primary_pcs = {}
    for user, pc_counts in user_pc_mapping.items():
        if pc_counts:  
            primary_pcs[user] = max(pc_counts.items(), key=lambda x: x[1])[0]
    

    df_device['token'] = 'NIL'
    

    for index, row in df_device.iterrows():
        user = row['user']
        pc = row['pc']
        activity = row['activity']
        
 
        pc_type = 'Self' if user in primary_pcs and primary_pcs[user] == pc else 'Other'
        

        df_device.loc[index, 'token'] = f"{pc_type}_{activity}"
        
        

        if pd.notna(row['file_tree']) and row['file_tree'].strip():
            tree_depth = len(row['file_tree'].split(';'))
            df_device.loc[index, 'token'] += f"_Depth{min(tree_depth, 3)}"  # Cap depth at 3 for simplicity
        else:
            df_device.loc[index, 'token'] += '_NoTree'
    
    return df_device

In [5]:
print("\nProcessing device.csv...")
df_device_processed = process_device_file('device.csv')


print("\nSample of tokenized device file:")
print(df_device_processed[['id', 'user', 'pc', 'activity', 'token']].head())


df_device_processed.to_csv('device_tokenized.csv', index=False)
print("Successfully generated tokens for device.csv")

Processing decoy_file.csv...

Sample of tokenized decoy file:
            decoy_filename       pc       token
0  C:\LJE2413\795JW126.jpg  PC-0302    High_jpg
1          C:\QMU9BC38.pdf  PC-6566     Low_pdf
2  C:\GIS1668\YPS1RSIK.jpg  PC-2606    High_jpg
3          C:\KD02AETE.pdf  PC-5393    High_pdf
4          C:\AUZTDD4J.jpg  PC-8753  Medium_jpg
Successfully generated tokens for decoy_file.csv

Processing device.csv...

Sample of tokenized device file:
                         id     user       pc    activity  \
0  {Z2Q8-K3AV28BE-9353JIRT}  SDH2394  PC-5849     Connect   
1  {C7F1-G7LE60RU-2483DAXS}  JKS2444  PC-6961     Connect   
2  {T9A4-D4RV69OF-1704NINW}  CBA1023  PC-1570     Connect   
3  {S8L0-O6QQ15NL-0636OYNV}  GNT0221  PC-6427     Connect   
4  {U0F1-R1FX27FM-6954TTVU}  JKS2444  PC-6961  Disconnect   

                                  token  
0     Primary_Connect_AfterHours_Depth3  
1     Primary_Connect_AfterHours_Depth2  
2     Primary_Connect_AfterHours_Depth3  
3     

In [19]:
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("device.csv")

with open('assignments.pkl', 'rb') as f:
    self_dict = pickle.load(f)
    
with open('shared.pkl', 'rb') as f:
    share_dict = pickle.load(f)

def get_pc_type(user, pc):
    if pc == self_dict[user]:
        return 'self'
    elif pc in share_dict:
        return 'shared'
    else:
        return 'other'

def count_subdirectories(file_tree):
    if pd.isna(file_tree) or file_tree.strip() == '':
        return 0
    return len(file_tree.split(';'))

df['token'] = 'NIL'

for index, row in df.iterrows():

    pc_type = get_pc_type(row['user'], row['pc'])

    activity = row['activity'].lower()

    subdir_count = count_subdirectories(row['file_tree'])

    df.loc[index, 'token'] = f"{pc_type}_{activity}_{subdir_count}"

print("\nSample of tokenized device file:")
print(df[['id', 'user', 'pc', 'activity', 'token']].head())

df.to_csv('device_tokenized.csv', index=False)
print("Successfully generated tokens for device.csv")


Sample of tokenized device file:
                         id     user       pc    activity              token
0  {Z2Q8-K3AV28BE-9353JIRT}  SDH2394  PC-5849     Connect     self_connect_3
1  {C7F1-G7LE60RU-2483DAXS}  JKS2444  PC-6961     Connect     self_connect_2
2  {T9A4-D4RV69OF-1704NINW}  CBA1023  PC-1570     Connect     self_connect_6
3  {S8L0-O6QQ15NL-0636OYNV}  GNT0221  PC-6427     Connect     self_connect_2
4  {U0F1-R1FX27FM-6954TTVU}  JKS2444  PC-6961  Disconnect  self_disconnect_0
Successfully generated tokens for device.csv


In [20]:
df['token'].value_counts()

token
self_disconnect_0     755725
self_connect_4        170389
self_connect_6        163426
self_connect_5        151320
self_connect_2        143051
self_connect_3        132419
other_disconnect_0     17749
other_connect_3         5968
other_connect_4         4080
other_connect_2         3777
other_connect_6         1967
other_connect_5         1957
Name: count, dtype: int64