In [1]:
import pandas as pd

# Load datasets
logon_df = pd.read_csv('logon.csv')
ldpa_data = pd.read_csv('ldpa_combined.csv')

In [2]:
logon_df.head()

Unnamed: 0,id,date,user_id,pc,activity
0,{X1D9-S0ES98JV-5357PWMI},01/02/2010 06:49:00,NGF0157,PC-6056,Logon
1,{G2B3-L6EJ61GT-2222RKSO},01/02/2010 06:50:00,LRR0148,PC-4275,Logon
2,{U6Q3-U0WE70UA-3770UREL},01/02/2010 06:53:04,LRR0148,PC-4124,Logon
3,{I0N5-R7NA26TG-6263KNGM},01/02/2010 07:00:00,IRM0931,PC-7188,Logon
4,{D1S0-N6FH62BT-5398KANK},01/02/2010 07:00:00,MOH0273,PC-6699,Logon


In [3]:
ldpa_data.head()

Unnamed: 0,employee_name,user_id,email,role,business_unit,functional_unit,department,team,supervisor
0,Calvin Edan Love,CEL0561,Calvin.Edan.Love@dtaa.com,ComputerProgrammer,1,2 - ResearchAndEngineering,2 - SoftwareManagement,3 - Software,Stephanie Briar Harrington
1,Christine Reagan Deleon,CRD0624,Christine.Reagan.Deleon@dtaa.com,Salesman,1,5 - SalesAndMarketing,2 - Sales,3 - RegionalSales,Winter Veda Burks
2,Jade Felicia Caldwell,JFC0557,Jade.Felicia.Caldwell@dtaa.com,SoftwareEngineer,1,2 - ResearchAndEngineering,2 - SoftwareManagement,3 - Software,Stephanie Briar Harrington
3,Aquila Stewart Dejesus,ASD0577,Aquila.Stewart.Dejesus@dtaa.com,ProductionLineWorker,1,3 - Manufacturing,3 - Assembly,3 - AssemblyDept,Whilemina Pandora England
4,Micah Abdul Rojas,MAR0955,Micah.Abdul.Rojas@dtaa.com,ProductionLineWorker,1,3 - Manufacturing,3 - Assembly,6 - AssemblyDept,Sandra Beverly Diaz


In [4]:
# Initialize lists for paired and unpaired events
paired = []
unpaired = []

# Track logon events
logon_stack = {}

# Iterate through the data to pair logon and logoff events
for idx, row in logon_df.iterrows():
    key = (row["user_id"], row["pc"])  # Unique identifier for user and PC
    
    if row["activity"] == "Logon":
        # Store the logon event if not already logged in
        if key in logon_stack:
            unpaired.append({
                "user_id": logon_stack[key]["user_id"],
                "pc": logon_stack[key]["pc"],
                "logon_time": logon_stack[key]["date"],
                "logoff_time": None  # No corresponding logoff
            })
        logon_stack[key] = row  # Store the current logon event
    
    elif row["activity"] == "Logoff":
        if key in logon_stack:
            logon_event = logon_stack.pop(key)
            paired.append({
                "user_id": row["user_id"],
                "pc": row["pc"],
                "logon_time": logon_event["date"],
                "logoff_time": row["date"]
            })
        else:
            unpaired.append({
                "user_id": row["user_id"],
                "pc": row["pc"],
                "logon_time": None,  # No corresponding logon
                "logoff_time": row["date"]
            })

# Convert lists to DataFrames if needed
paired_df = pd.DataFrame(paired)
unpaired_df = pd.DataFrame(unpaired)

# Display results
print(f"Total Paired Sessions: {len(paired_df)}")
print(f"Total Unpaired Events: {len(unpaired_df)}")


Total Paired Sessions: 384251
Total Unpaired Events: 86357


In [5]:

logon_df = paired_df



In [6]:

# Load the employee data (assuming it's included in the logon files)
employee_data = ldpa_data[['employee_name', 'user_id', 'role', ]].drop_duplicates()

In [7]:
# Merge on 'user_id'
logon_df = logon_df.merge(employee_data, on="user_id", how="left")


In [8]:
employee_data.head()

Unnamed: 0,employee_name,user_id,role
0,Calvin Edan Love,CEL0561,ComputerProgrammer
1,Christine Reagan Deleon,CRD0624,Salesman
2,Jade Felicia Caldwell,JFC0557,SoftwareEngineer
3,Aquila Stewart Dejesus,ASD0577,ProductionLineWorker
4,Micah Abdul Rojas,MAR0955,ProductionLineWorker


In [9]:
logon_df.isnull().sum()

user_id          0
pc               0
logon_time       0
logoff_time      0
employee_name    0
role             0
dtype: int64

In [10]:
logon_df.columns

Index(['user_id', 'pc', 'logon_time', 'logoff_time', 'employee_name', 'role'], dtype='object')

In [11]:
logon_df.head()

Unnamed: 0,user_id,pc,logon_time,logoff_time,employee_name,role
0,LRR0148,PC-4124,01/02/2010 06:53:04,01/02/2010 07:24:53,Libby Rosalyn Richard,Manager
1,NOB0181,PC-4124,01/02/2010 07:40:42,01/02/2010 07:46:55,Nevada Odette Bass,Technician
2,IKP0472,PC-4124,01/02/2010 07:49:05,01/02/2010 07:58:29,Ivana Kaitlin Parsons,ProductionLineWorker
3,IRM0931,PC-4124,01/02/2010 08:00:33,01/02/2010 08:16:18,Ignatius Reese Morton,ComputerProgrammer
4,WPR0368,PC-2173,01/02/2010 14:06:32,01/02/2010 14:08:41,William Price Robles,ITAdmin


In [12]:
logon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384251 entries, 0 to 384250
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        384251 non-null  object
 1   pc             384251 non-null  object
 2   logon_time     384251 non-null  object
 3   logoff_time    384251 non-null  object
 4   employee_name  384251 non-null  object
 5   role           384251 non-null  object
dtypes: object(6)
memory usage: 17.6+ MB


In [13]:
# Assuming you have a dataframe called 'df'
logon_df['logon_time'] = pd.to_datetime(logon_df['logon_time'], errors='coerce')
logon_df['logoff_time'] = pd.to_datetime(logon_df['logoff_time'], errors='coerce')


In [14]:
logon_df.head()

Unnamed: 0,user_id,pc,logon_time,logoff_time,employee_name,role
0,LRR0148,PC-4124,2010-01-02 06:53:04,2010-01-02 07:24:53,Libby Rosalyn Richard,Manager
1,NOB0181,PC-4124,2010-01-02 07:40:42,2010-01-02 07:46:55,Nevada Odette Bass,Technician
2,IKP0472,PC-4124,2010-01-02 07:49:05,2010-01-02 07:58:29,Ivana Kaitlin Parsons,ProductionLineWorker
3,IRM0931,PC-4124,2010-01-02 08:00:33,2010-01-02 08:16:18,Ignatius Reese Morton,ComputerProgrammer
4,WPR0368,PC-2173,2010-01-02 14:06:32,2010-01-02 14:08:41,William Price Robles,ITAdmin


In [15]:
# Check for any NaT (Not a Time) values that may have occurred due to incorrect parsing
print(logon_df[logon_df["logon_time"].isna()])


Empty DataFrame
Columns: [user_id, pc, logon_time, logoff_time, employee_name, role]
Index: []


In [16]:
logon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384251 entries, 0 to 384250
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   user_id        384251 non-null  object        
 1   pc             384251 non-null  object        
 2   logon_time     384251 non-null  datetime64[ns]
 3   logoff_time    384251 non-null  datetime64[ns]
 4   employee_name  384251 non-null  object        
 5   role           384251 non-null  object        
dtypes: datetime64[ns](2), object(4)
memory usage: 17.6+ MB


In [17]:
# Extract session start and end times
logon_df["start_time"] = logon_df["logon_time"]
logon_df["end_time"] = logon_df["logoff_time"]

# Compute session duration in minutes (handling NaT cases)
logon_df["session_duration"] = ((logon_df["logoff_time"] - logon_df["logon_time"])
                                .dt.total_seconds()
                                .fillna(0) / 60)

# Extract logon hour and day of the week
logon_df["logon_hour"] = logon_df["logon_time"].dt.hour
logon_df["day_of_week"] = logon_df["logon_time"].dt.dayofweek  # Monday = 0, Sunday = 6

# Define normal working hours (e.g., 08:00 - 18:00)
NORMAL_HOURS_START = 8
NORMAL_HOURS_END = 18

# Identify the most frequently used PC for each user
primary_pc = logon_df.groupby(["user_id", "pc"]).size().reset_index(name="count")

# Handle ties in primary PC selection by sorting before picking the first one
primary_pc = primary_pc.sort_values(["user_id", "count"], ascending=[True, False]).drop_duplicates("user_id", keep="first")

# Create a dictionary mapping users to their primary PC
user_pc_mapping = dict(zip(primary_pc["user_id"], primary_pc["pc"]))

# Assign primary PC to each user (handling missing cases)
logon_df["own_pc"] = logon_df["user_id"].map(user_pc_mapping).fillna("Unknown")

# Determine logon type
logon_df["logon_on_own_pc"] = (logon_df["pc"] == logon_df["own_pc"]).astype(int)
logon_df["logon_on_other_pc"] = (logon_df["pc"] != logon_df["own_pc"]).astype(int)

# Classify logon activity based on normal/off-hours
logon_df["logon_on_own_pc_normal"] = (((logon_df["logon_hour"] >= NORMAL_HOURS_START) & 
                                       (logon_df["logon_hour"] < NORMAL_HOURS_END)) & 
                                       (logon_df["logon_on_own_pc"] == 1)).astype(int)

logon_df["logon_on_own_pc_off_hour"] = (((logon_df["logon_hour"] < NORMAL_HOURS_START) | 
                                         (logon_df["logon_hour"] >= NORMAL_HOURS_END)) & 
                                         (logon_df["logon_on_own_pc"] == 1)).astype(int)

logon_df["logon_on_other_pc_normal"] = (((logon_df["logon_hour"] >= NORMAL_HOURS_START) & 
                                         (logon_df["logon_hour"] < NORMAL_HOURS_END)) & 
                                         (logon_df["logon_on_other_pc"] == 1)).astype(int)

logon_df["logon_on_other_pc_off_hour"] = (((logon_df["logon_hour"] < NORMAL_HOURS_START) | 
                                           (logon_df["logon_hour"] >= NORMAL_HOURS_END)) & 
                                           (logon_df["logon_on_other_pc"] == 1)).astype(int)


In [18]:
logon_df.head()

Unnamed: 0,user_id,pc,logon_time,logoff_time,employee_name,role,start_time,end_time,session_duration,logon_hour,day_of_week,own_pc,logon_on_own_pc,logon_on_other_pc,logon_on_own_pc_normal,logon_on_own_pc_off_hour,logon_on_other_pc_normal,logon_on_other_pc_off_hour
0,LRR0148,PC-4124,2010-01-02 06:53:04,2010-01-02 07:24:53,Libby Rosalyn Richard,Manager,2010-01-02 06:53:04,2010-01-02 07:24:53,31.816667,6,5,PC-4275,0,1,0,0,0,1
1,NOB0181,PC-4124,2010-01-02 07:40:42,2010-01-02 07:46:55,Nevada Odette Bass,Technician,2010-01-02 07:40:42,2010-01-02 07:46:55,6.216667,7,5,PC-3446,0,1,0,0,0,1
2,IKP0472,PC-4124,2010-01-02 07:49:05,2010-01-02 07:58:29,Ivana Kaitlin Parsons,ProductionLineWorker,2010-01-02 07:49:05,2010-01-02 07:58:29,9.4,7,5,PC-3842,0,1,0,0,0,1
3,IRM0931,PC-4124,2010-01-02 08:00:33,2010-01-02 08:16:18,Ignatius Reese Morton,ComputerProgrammer,2010-01-02 08:00:33,2010-01-02 08:16:18,15.75,8,5,PC-7188,0,1,0,0,1,0
4,WPR0368,PC-2173,2010-01-02 14:06:32,2010-01-02 14:08:41,William Price Robles,ITAdmin,2010-01-02 14:06:32,2010-01-02 14:08:41,2.15,14,5,PC-9842,0,1,0,0,1,0


In [19]:
logon_df[['user_id', 'logon_time', 'logoff_time', 'session_duration']]

Unnamed: 0,user_id,logon_time,logoff_time,session_duration
0,LRR0148,2010-01-02 06:53:04,2010-01-02 07:24:53,31.816667
1,NOB0181,2010-01-02 07:40:42,2010-01-02 07:46:55,6.216667
2,IKP0472,2010-01-02 07:49:05,2010-01-02 07:58:29,9.400000
3,IRM0931,2010-01-02 08:00:33,2010-01-02 08:16:18,15.750000
4,WPR0368,2010-01-02 14:06:32,2010-01-02 14:08:41,2.150000
...,...,...,...,...
384246,QNP0216,2011-05-16 23:37:23,2011-05-16 23:49:58,12.583333
384247,JDF0593,2011-05-17 03:18:35,2011-05-17 04:40:33,81.966667
384248,MLM0950,2011-05-17 01:22:19,2011-05-17 05:21:58,239.650000
384249,IRM0931,2011-05-17 05:25:33,2011-05-17 06:23:48,58.250000


In [20]:

# Convert time columns to datetime
logon_df['logon_time'] = pd.to_datetime(logon_df['logon_time'])
logon_df['logoff_time'] = pd.to_datetime(logon_df['logoff_time'])

# Sort by user_id and logon_time
logon_df = logon_df.sort_values(by=['user_id', 'logon_time']).reset_index(drop=True)

# Define threshold for merging (e.g., 10 minutes)
merge_threshold = pd.Timedelta(minutes=10)

# Initialize merged sessions
merged_sessions = []
current_session = logon_df.iloc[0]

for i in range(1, len(logon_df)):
    row = logon_df.iloc[i]

    # Check if the session belongs to the same user and is within the merge threshold
    if row['user_id'] == current_session['user_id'] and row['logon_time'] - current_session['logoff_time'] <= merge_threshold:
        # Extend the logoff_time and update session duration
        current_session['logoff_time'] = row['logoff_time']
        current_session['session_duration'] = (current_session['logoff_time'] - current_session['logon_time']).total_seconds() / 60
    else:
        # Append the finalized session and move to the next session
        merged_sessions.append(current_session)
        current_session = row

# Append the last session
merged_sessions.append(current_session)

# Convert to DataFrame
merged_logon_df = pd.DataFrame(merged_sessions)

print(merged_logon_df[['user_id', 'logon_time', 'logoff_time', 'session_duration']])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_session['logoff_time'] = row['logoff_time']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_session['session_duration'] = (current_session['logoff_time'] - current_session['logon_time']).total_seconds() / 60


        user_id          logon_time         logoff_time  session_duration
0       AAE0190 2010-01-04 08:09:00 2010-01-04 18:20:00             611.0
1       AAE0190 2010-01-05 08:19:00 2010-01-05 18:11:00             592.0
2       AAE0190 2010-01-06 08:09:00 2010-01-06 18:10:00             601.0
3       AAE0190 2010-01-07 08:23:00 2010-01-07 18:14:00             591.0
4       AAE0190 2010-01-08 08:17:00 2010-01-08 18:26:00             609.0
...         ...                 ...                 ...               ...
384246  ZSL0305 2011-05-10 08:53:00 2011-05-10 18:09:00             556.0
384247  ZSL0305 2011-05-11 08:51:00 2011-05-11 17:56:00             545.0
384248  ZSL0305 2011-05-12 08:58:00 2011-05-12 17:56:00             538.0
384249  ZSL0305 2011-05-13 09:03:00 2011-05-13 17:58:00             535.0
384250  ZSL0305 2011-05-16 09:05:00 2011-05-16 17:55:00             530.0

[366340 rows x 4 columns]


In [21]:
merged_logon_df.columns

Index(['user_id', 'pc', 'logon_time', 'logoff_time', 'employee_name', 'role',
       'start_time', 'end_time', 'session_duration', 'logon_hour',
       'day_of_week', 'own_pc', 'logon_on_own_pc', 'logon_on_other_pc',
       'logon_on_own_pc_normal', 'logon_on_own_pc_off_hour',
       'logon_on_other_pc_normal', 'logon_on_other_pc_off_hour'],
      dtype='object')

In [22]:
merged_logon_df.drop(columns=['own_pc','logon_time','logoff_time','session_duration'],inplace=True)

In [23]:
merged_logon_df.shape

(366340, 14)

In [24]:
import pandas as pd

# Count total duplicate rows
total_duplicates = merged_logon_df.duplicated().sum()
print(f"Total duplicate rows in the dataset: {total_duplicates}")

# Count duplicates based on 'start_time', 'end_time', and 'user'
column_duplicates = merged_logon_df.duplicated(subset=['start_time', 'end_time', 'user_id']).sum()
print(f"Total duplicate rows based on 'start_time', 'end_time', and 'user_id': {column_duplicates}")

# Display duplicate rows (full duplicates)
if total_duplicates > 0:
    print("\nSome fully duplicated rows:")
    print(merged_logon_df[merged_logon_df.duplicated(keep=False)].head())

# Display duplicate rows based on 'start_time', 'end_time', and 'user'
if column_duplicates > 0:
    print("\nSome duplicate rows based on 'start_time', 'end_time', and 'user_id':")
    print(merged_logon_df[merged_logon_df.duplicated(subset=['start_time', 'end_time', 'user_id'], keep=False)].head())

# Step 1: Remove duplicates based on 'start_time', 'end_time', and 'user' first
merged_logon_df = merged_logon_df.drop_duplicates(subset=['start_time', 'end_time', 'user_id'], keep='first')

# Step 2: Remove any remaining fully duplicated rows
merged_logon_df = merged_logon_df.drop_duplicates(keep='first')

# Reset index after dropping duplicates
merged_logon_df = merged_logon_df.reset_index(drop=True)

# Final dataset shape after cleaning
print("\nDuplicates removed. Updated dataset shape:", merged_logon_df.shape)


Total duplicate rows in the dataset: 0
Total duplicate rows based on 'start_time', 'end_time', and 'user_id': 0

Duplicates removed. Updated dataset shape: (366340, 14)


In [26]:
merged_logon_df.to_csv('logon_cleaned.csv',index=False)