In [1]:
import pandas as pd
device_df = pd.read_csv('device.csv')

In [2]:
device_df.head()

Unnamed: 0,id,date,user,pc,activity
0,{J1S3-L9UU75BQ-7790ATPL},01/02/2010 07:21:06,MOH0273,PC-6699,Connect
1,{N7B5-Y7BB27SI-2946PUJK},01/02/2010 07:37:41,MOH0273,PC-6699,Disconnect
2,{U1V9-Z7XT67KV-5649MYHI},01/02/2010 07:59:11,HPH0075,PC-2417,Connect
3,{H0Z7-E6GB57XZ-1603MOXD},01/02/2010 07:59:49,IIW0249,PC-0843,Connect
4,{L7P2-G4PX02RX-7999GYOY},01/02/2010 08:04:26,IIW0249,PC-0843,Disconnect


In [3]:
device_df.isnull().sum()

id          0
date        0
user        0
pc          0
activity    0
dtype: int64

In [4]:

device_df['date'] = pd.to_datetime(device_df['date'], errors='coerce')



In [5]:
device_df.isnull().sum()

id          0
date        0
user        0
pc          0
activity    0
dtype: int64

In [6]:
device_df["logon_hour"] = device_df["date"].dt.hour
device_df["day_of_week"] = device_df["date"].dt.dayofweek  # Monday = 0, Sunday = 6


In [7]:
# Determine user's own PC as the first PC they connect to (based on the first 'Connect' activity)
user_pc_mapping = device_df[device_df['activity'] == 'Connect'].groupby('user')['pc'].first().to_dict()

# Add the mapping to the DataFrame
device_df["own_pc"] = device_df["user"].map(user_pc_mapping)


In [8]:
NORMAL_HOURS_START = 8  # 8 AM
NORMAL_HOURS_END = 18   # 6 PM

# Determine logon type: Own PC vs. Other PC
device_df["connect_on_own_pc"] = (device_df["pc"] == device_df["own_pc"]).astype(int)
device_df["connect_on_other_pc"] = (device_df["pc"] != device_df["own_pc"]).astype(int)

# Create the feature for device connection during normal hours and off-hours
device_df["device_connects_on_own_pc_normal_hour"] = (
    (device_df["logon_hour"] >= NORMAL_HOURS_START) & 
    (device_df["logon_hour"] < NORMAL_HOURS_END) & 
    (device_df["connect_on_own_pc"] == 1) & 
    (device_df["activity"] == "Connect")
).astype(int)

device_df["device_connects_on_other_pc_normal_hour"] = (
    (device_df["logon_hour"] >= NORMAL_HOURS_START) & 
    (device_df["logon_hour"] < NORMAL_HOURS_END) & 
    (device_df["connect_on_other_pc"] == 1) & 
    (device_df["activity"] == "Connect")
).astype(int)

device_df["device_connects_on_own_pc_off_hour"] = (
    (device_df["logon_hour"] < NORMAL_HOURS_START) | 
    (device_df["logon_hour"] >= NORMAL_HOURS_END) & 
    (device_df["connect_on_own_pc"] == 1) & 
    (device_df["activity"] == "Connect")
).astype(int)

device_df["device_connects_on_other_pc_off_hour"] = (
    (device_df["logon_hour"] < NORMAL_HOURS_START) | 
    (device_df["logon_hour"] >= NORMAL_HOURS_END) & 
    (device_df["connect_on_other_pc"] == 1) & 
    (device_df["activity"] == "Connect")
).astype(int)


In [9]:
device_df.drop(columns=['id','activity','logon_hour','own_pc','connect_on_own_pc','connect_on_other_pc'], inplace=True)

In [10]:
device_df.isnull().sum()

date                                       0
user                                       0
pc                                         0
day_of_week                                0
device_connects_on_own_pc_normal_hour      0
device_connects_on_other_pc_normal_hour    0
device_connects_on_own_pc_off_hour         0
device_connects_on_other_pc_off_hour       0
dtype: int64

In [11]:
device_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405380 entries, 0 to 405379
Data columns (total 8 columns):
 #   Column                                   Non-Null Count   Dtype         
---  ------                                   --------------   -----         
 0   date                                     405380 non-null  datetime64[ns]
 1   user                                     405380 non-null  object        
 2   pc                                       405380 non-null  object        
 3   day_of_week                              405380 non-null  int32         
 4   device_connects_on_own_pc_normal_hour    405380 non-null  int64         
 5   device_connects_on_other_pc_normal_hour  405380 non-null  int64         
 6   device_connects_on_own_pc_off_hour       405380 non-null  int64         
 7   device_connects_on_other_pc_off_hour     405380 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(4), object(2)
memory usage: 23.2+ MB


In [12]:
device_df.head()

Unnamed: 0,date,user,pc,day_of_week,device_connects_on_own_pc_normal_hour,device_connects_on_other_pc_normal_hour,device_connects_on_own_pc_off_hour,device_connects_on_other_pc_off_hour
0,2010-01-02 07:21:06,MOH0273,PC-6699,5,0,0,1,1
1,2010-01-02 07:37:41,MOH0273,PC-6699,5,0,0,1,1
2,2010-01-02 07:59:11,HPH0075,PC-2417,5,0,0,1,1
3,2010-01-02 07:59:49,IIW0249,PC-0843,5,0,0,1,1
4,2010-01-02 08:04:26,IIW0249,PC-0843,5,0,0,0,0


In [13]:

# Sort by user_id and logon_time
device_df = device_df.sort_values(by=['user', 'date']).reset_index(drop=True)

In [14]:
device_df.shape

(405380, 8)

In [15]:
import pandas as pd

# Count total duplicate rows
total_duplicates = device_df.duplicated().sum()
print(f"Total duplicate rows in the dataset: {total_duplicates}")

# Count duplicates based on 'user' and 'date'
column_duplicates = device_df.duplicated(subset=['user', 'date']).sum()
print(f"Total duplicate rows based on 'user' and 'date': {column_duplicates}")

# Display duplicate rows (full duplicates)
if total_duplicates > 0:
    print("\nSome fully duplicated rows:")
    #print(device_df[device_df.duplicated(keep=False)].head())

# Display duplicate rows based on 'user' and 'date'
if column_duplicates > 0:
    print("\nSome duplicate rows based on 'user' and 'date':")
    #print(device_df[device_df.duplicated(subset=['user', 'date'], keep=False)].head())

# Step 1: Remove duplicates based on 'user' and 'date' first
device_df = device_df.drop_duplicates(subset=['user', 'date'], keep='first')

# Step 2: Remove any remaining fully duplicated rows
device_df = device_df.drop_duplicates(keep='first')

# Reset index after dropping duplicates
device_df = device_df.reset_index(drop=True)

# Final dataset shape after cleaning
print("\nDuplicates removed. Updated dataset shape:", device_df.shape)


Total duplicate rows in the dataset: 88
Total duplicate rows based on 'user' and 'date': 361

Some fully duplicated rows:

Some duplicate rows based on 'user' and 'date':

Duplicates removed. Updated dataset shape: (405019, 8)


In [16]:
device_df.shape

(405019, 8)

In [18]:
device_df.to_csv('device_cleaned.csv', index=False)