In [1]:
import pandas as pd
# Load the raw file-copy event log
file_df = pd.read_csv(filepath_or_buffer='file.csv')

In [2]:
# Preview the first five rows of the raw log
file_df.head(n=5)

Unnamed: 0,id,date,user,pc,filename,content
0,{L9G8-J9QE34VM-2834VDPB},01/02/2010 07:23:14,MOH0273,PC-6699,EYPC9Y08.doc,D0-CF-11-E0-A1-B1-1A-E1 during difficulty over...
1,{H0W6-L4FG38XG-9897XTEN},01/02/2010 07:26:19,MOH0273,PC-6699,N3LTSU3O.pdf,25-50-44-46-2D carpenters 25 landed strait dis...
2,{M3Z0-O2KK89OX-5716MBIM},01/02/2010 08:12:03,HPH0075,PC-2417,D3D3WC9W.doc,D0-CF-11-E0-A1-B1-1A-E1 union 24 declined impo...
3,{E1I4-S4QS61TG-3652YHKR},01/02/2010 08:17:00,HPH0075,PC-2417,QCSW62YS.doc,D0-CF-11-E0-A1-B1-1A-E1 becoming period begin ...
4,{D4R7-E7JL45UX-0067XALT},01/02/2010 08:24:57,HSB0196,PC-8001,AU75JV6U.jpg,FF-D8


In [3]:
# Missing-value count per column of the raw log
file_df.isna().sum()

id          0
date        0
user        0
pc          0
filename    0
content     0
dtype: int64

In [4]:
# Parse the timestamps with an explicit month/day/year format. The raw values
# look like '01/02/2010 07:23:14', which is ambiguous between day-first and
# month-first; pinning the format avoids relying on pandas' inference.
# Values that do not match become NaT (errors='coerce'), so check the null
# count of 'date' afterwards.
file_df['date'] = pd.to_datetime(
    file_df['date'], format='%m/%d/%Y %H:%M:%S', errors='coerce'
)

In [5]:


# Normal working day: 8 AM up to (but not including) 6 PM, as 24-hour clock values
work_hour_start, work_hour_end = 8, 18

# Function to classify file type based on filename extension
def classify_file_type(filename):
    """Classify a file name as 'document', 'program' or 'other' by its extension.

    The extension match is case-insensitive. Image extensions (.jpg, .jpeg,
    .png) are grouped with the document types.

    Args:
        filename: File name to classify. Non-string values (e.g. None or NaN
            from a missing cell) are accepted and classified as 'other'.

    Returns:
        'document', 'program', or 'other'.
    """
    # Missing cells reach this function as None/float NaN; they have no
    # extension, so classify them as 'other' instead of raising AttributeError.
    if not isinstance(filename, str):
        return 'other'

    # str.endswith accepts a tuple of suffixes, so no per-extension loop is needed.
    document_extensions = ('.doc', '.pdf', '.txt', '.jpg', '.png', '.jpeg', '.docx', '.xls', '.xlsx')
    program_extensions = ('.exe', '.bat', '.sh', '.cmd', '.bin')

    lowered = filename.lower()
    if lowered.endswith(document_extensions):
        return 'document'
    if lowered.endswith(program_extensions):
        return 'program'
    return 'other'

# Tag each row with its file category, derived from the filename extension
file_df['file_type'] = file_df['filename'].map(classify_file_type)

# Hour of day (0-23) taken from the parsed event timestamp
file_df['hour'] = file_df['date'].dt.hour

# Define "own PC" per user. Every PC id visible in this log starts with 'PC-',
# so a prefix test marks every row as own-PC and leaves the *_other_pc
# features constantly 0. Instead, treat the PC on which a user appears most
# often as that user's own machine (ties resolve to the first value in sorted
# order, as returned by Series.mode).
# NOTE(review): this is a heuristic based on the file-copy log alone; replace
# it with the real user-to-PC assignment if one is available.
own_pc_by_user = file_df.groupby('user')['pc'].agg(
    lambda pcs: pcs.mode().iloc[0] if pcs.notna().any() else None
)
file_df['is_own_pc'] = (file_df['pc'] == file_df['user'].map(own_pc_by_user)).astype(int)

# Build one 0/1 indicator per (file category, PC ownership) combination
is_document = file_df['file_type'].eq('document')
is_program = file_df['file_type'].eq('program')
on_own_pc = file_df['is_own_pc'].eq(1)
on_other_pc = file_df['is_own_pc'].eq(0)

file_df['documents_copy_own_pc'] = (is_document & on_own_pc).astype(int)
file_df['documents_copy_other_pc'] = (is_document & on_other_pc).astype(int)
file_df['program_files_copy_own_pc'] = (is_program & on_own_pc).astype(int)
file_df['program_files_copy_other_pc'] = (is_program & on_other_pc).astype(int)

# Flag copy events that fall outside normal work hours, i.e. strictly before
# work_hour_start or at/after work_hour_end.
# The off-hours mask is computed once and shared by all four features; each
# new column is named '<base feature>_off_hour'.
is_off_hour = (file_df['hour'] < work_hour_start) | (file_df['hour'] >= work_hour_end)
for base_col in ('documents_copy_own_pc', 'documents_copy_other_pc',
                 'program_files_copy_own_pc', 'program_files_copy_other_pc'):
    file_df[f'{base_col}_off_hour'] = ((file_df[base_col] == 1) & is_off_hour).astype(int)

# Show the engineered indicator columns (pandas truncates the printed frame)
print(file_df.loc[:, [
    'documents_copy_own_pc',
    'documents_copy_other_pc',
    'program_files_copy_own_pc',
    'program_files_copy_other_pc',
    'documents_copy_own_pc_off_hour',
    'documents_copy_other_pc_off_hour',
    'program_files_copy_own_pc_off_hour',
    'program_files_copy_other_pc_off_hour',
]])


        documents_copy_own_pc  documents_copy_other_pc  \
0                           1                        0   
1                           1                        0   
2                           1                        0   
3                           1                        0   
4                           1                        0   
...                       ...                      ...   
445576                      1                        0   
445577                      1                        0   
445578                      1                        0   
445579                      1                        0   
445580                      1                        0   

        program_files_copy_own_pc  program_files_copy_other_pc  \
0                               0                            0   
1                               0                            0   
2                               0                            0   
3                               0      

In [6]:
# Drop the raw fields and intermediate helper columns, keeping date, user, pc
# and the engineered indicator columns.
# Reassign rather than using inplace=True: the result is the same frame
# contents, but the statement stays chainable and does not mutate an object
# that an earlier cell may still reference.
file_df = file_df.drop(
    columns=['id', 'filename', 'content', 'file_type', 'hour', 'is_own_pc']
)

In [7]:
# Missing-value count per column after feature engineering
file_df.isna().sum()

date                                    0
user                                    0
pc                                      0
documents_copy_own_pc                   0
documents_copy_other_pc                 0
program_files_copy_own_pc               0
program_files_copy_other_pc             0
documents_copy_own_pc_off_hour          0
documents_copy_other_pc_off_hour        0
program_files_copy_own_pc_off_hour      0
program_files_copy_other_pc_off_hour    0
dtype: int64

In [8]:
# Summarise column dtypes and non-null counts of the reduced frame
file_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445581 entries, 0 to 445580
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   date                                  445581 non-null  datetime64[ns]
 1   user                                  445581 non-null  object        
 2   pc                                    445581 non-null  object        
 3   documents_copy_own_pc                 445581 non-null  int64         
 4   documents_copy_other_pc               445581 non-null  int64         
 5   program_files_copy_own_pc             445581 non-null  int64         
 6   program_files_copy_other_pc           445581 non-null  int64         
 7   documents_copy_own_pc_off_hour        445581 non-null  int64         
 8   documents_copy_other_pc_off_hour      445581 non-null  int64         
 9   program_files_copy_own_pc_off_hour    445581 non-null  int6

In [9]:

# Order events chronologically within each user and renumber the index from 0
file_df = file_df.sort_values(['user', 'date'], ignore_index=True)

In [10]:
# (rows, columns) of the sorted frame, before duplicates are removed
file_df.shape

(445581, 11)

In [11]:


# Report and remove duplicate events.
# NOTE(review): the per-event `id` and `filename` columns were dropped before
# this point, so two different files copied by the same user in the same
# second are indistinguishable from a true duplicate and are collapsed into
# one row here -- confirm this loss of events is intended.
key_columns = ['user', 'date']

# Count total duplicate rows (every repeat after the first occurrence)
total_duplicates = file_df.duplicated().sum()
print(f"Total duplicate rows in the dataset: {total_duplicates}")

# Count duplicates based on 'user' and 'date'
column_duplicates = file_df.duplicated(subset=key_columns).sum()
print(f"Total duplicate rows based on 'user' and 'date': {column_duplicates}")

# Display duplicate rows (full duplicates); keep=False shows every member of
# a duplicate group, not only the repeats.
if total_duplicates > 0:
    print("\nSome fully duplicated rows:")
    print(file_df[file_df.duplicated(keep=False)].head())

# Display duplicate rows based on 'user' and 'date'
if column_duplicates > 0:
    print("\nSome duplicate rows based on 'user' and 'date':")
    print(file_df[file_df.duplicated(subset=key_columns, keep=False)].head())

# Keep the first row for each (user, date) pair. Once every (user, date) pair
# is unique no fully duplicated row can remain, so a second whole-row
# drop_duplicates pass would be a no-op and is not needed.
file_df = file_df.drop_duplicates(subset=key_columns, keep='first').reset_index(drop=True)

# Final dataset shape after cleaning
print("\nDuplicates removed. Updated dataset shape:", file_df.shape)


Total duplicate rows in the dataset: 5821
Total duplicate rows based on 'user' and 'date': 6467

Some fully duplicated rows:
                    date     user       pc  documents_copy_own_pc  \
1094 2010-10-15 09:50:11  AHD0848  PC-7751                      1   
1095 2010-10-15 09:50:11  AHD0848  PC-7751                      1   
1292 2010-01-28 13:52:13  AHM0410  PC-3686                      1   
1293 2010-01-28 13:52:13  AHM0410  PC-3686                      1   
1542 2010-03-30 16:20:03  AHM0410  PC-3686                      1   

      documents_copy_other_pc  program_files_copy_own_pc  \
1094                        0                          0   
1095                        0                          0   
1292                        0                          0   
1293                        0                          0   
1542                        0                          0   

      program_files_copy_other_pc  documents_copy_own_pc_off_hour  \
1094                          

In [14]:
# Persist the cleaned feature table without the positional index column
file_df.to_csv(path_or_buf='file_cleaned.csv', index=False)