In [1]:
import pandas as pd
email_df=pd.read_csv('email.csv')

In [2]:
email_df.isnull().sum()

id                   0
date                 0
user                 0
pc                   0
to                   0
cc             1617054
bcc            2212977
from                 0
size                 0
attachments          0
content              0
dtype: int64

In [3]:
email_df['date'] = pd.to_datetime(email_df['date'], errors='coerce')



In [4]:
email_df.isnull().sum()

id                   0
date                 0
user                 0
pc                   0
to                   0
cc             1617054
bcc            2212977
from                 0
size                 0
attachments          0
content              0
dtype: int64

In [5]:

# Define normal working hours (8 AM to 6 PM)
work_hour_start = 8
work_hour_end = 18

# Function to identify if an email is internal or external based on domain
def is_internal(email):
    # Assuming internal domain is "dtaa.com" (adjust as per your dataset's domain)
    internal_domain = "dtaa.com"
    return internal_domain in email.lower()

# Count total emails per user
email_df['total_emails'] = email_df.groupby('user')['user'].transform('count')

# Extract internal-to-internal, internal-to-external, external-to-internal, and external-to-external emails
email_df['int_to_int_mails'] = email_df.apply(lambda row: is_internal(row['from']) and all(is_internal(email) for email in row['to'].split(';') if email), axis=1).astype(int)
email_df['int_to_out_mails'] = email_df.apply(lambda row: is_internal(row['from']) and not all(is_internal(email) for email in row['to'].split(';') if email), axis=1).astype(int)
email_df['out_to_int_mails'] = email_df.apply(lambda row: not is_internal(row['from']) and all(is_internal(email) for email in row['to'].split(';') if email), axis=1).astype(int)
email_df['out_to_out_mails'] = email_df.apply(lambda row: not is_internal(row['from']) and not all(is_internal(email) for email in row['to'].split(';') if email), axis=1).astype(int)

# Count internal and external recipients
email_df['internal_recipients'] = email_df['to'].apply(lambda x: sum(1 for email in x.split(';') if is_internal(email)))
email_df['external_recipients'] = email_df['to'].apply(lambda x: sum(1 for email in x.split(';') if not is_internal(email)))

# Count distinct BCC recipients (if any)
email_df['distinct_bcc'] = email_df['bcc'].apply(lambda x: len(set(x.split(';'))) if pd.notna(x) else 0)

# Count emails with attachments (based on 'attachments' column)
email_df['mails_with_attachments'] = email_df['attachments'].apply(lambda x: 1 if x > 0 else 0)

# Check if the email was sent after work hours (before 9 AM or after 6 PM)
email_df['after_hour_mails'] = email_df['date'].dt.hour.apply(lambda x: 1 if x < work_hour_start or x >= work_hour_end else 0)

# Print or return the dataframe with the newly extracted features
print(email_df[['total_emails', 'int_to_int_mails', 'int_to_out_mails', 'out_to_int_mails', 'out_to_out_mails', 
                'internal_recipients', 'external_recipients', 'distinct_bcc', 'mails_with_attachments', 'after_hour_mails']])


         total_emails  int_to_int_mails  int_to_out_mails  out_to_int_mails  \
0                1420                 0                 1                 0   
1                3463                 0                 0                 0   
2                1420                 0                 0                 0   
3                1420                 0                 0                 0   
4                3463                 0                 0                 0   
...               ...               ...               ...               ...   
2629974          2988                 1                 0                 0   
2629975          3025                 1                 0                 0   
2629976          3025                 0                 0                 0   
2629977          1007                 1                 0                 0   
2629978          1021                 0                 0                 0   

         out_to_out_mails  internal_recipients  ext

In [6]:
email_df.drop(columns=['id','from','to','cc','bcc','attachments','content','total_emails','size'], inplace=True)

In [7]:
email_df.isnull().sum()

date                      0
user                      0
pc                        0
int_to_int_mails          0
int_to_out_mails          0
out_to_int_mails          0
out_to_out_mails          0
internal_recipients       0
external_recipients       0
distinct_bcc              0
mails_with_attachments    0
after_hour_mails          0
dtype: int64

In [8]:
email_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2629979 entries, 0 to 2629978
Data columns (total 12 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   date                    datetime64[ns]
 1   user                    object        
 2   pc                      object        
 3   int_to_int_mails        int64         
 4   int_to_out_mails        int64         
 5   out_to_int_mails        int64         
 6   out_to_out_mails        int64         
 7   internal_recipients     int64         
 8   external_recipients     int64         
 9   distinct_bcc            int64         
 10  mails_with_attachments  int64         
 11  after_hour_mails        int64         
dtypes: datetime64[ns](1), int64(9), object(2)
memory usage: 240.8+ MB


In [9]:

# Sort by user_id and logon_time
email_df = email_df.sort_values(by=['user', 'date']).reset_index(drop=True)

In [10]:
email_df.shape

(2629979, 12)

In [11]:


# Count total duplicate rows
total_duplicates =email_df.duplicated().sum()
print(f"Total duplicate rows in the dataset: {total_duplicates}")

# Count duplicates based on 'user' and 'date'
column_duplicates = email_df.duplicated(subset=['user', 'date']).sum()
print(f"Total duplicate rows based on 'user' and 'date': {column_duplicates}")

# Display duplicate rows (full duplicates)
if total_duplicates > 0:
    print("\nSome fully duplicated rows:")
    #print(device_df[device_df.duplicated(keep=False)].head())

# Display duplicate rows based on 'user' and 'date'
if column_duplicates > 0:
    print("\nSome duplicate rows based on 'user' and 'date':")
    #print(device_df[device_df.duplicated(subset=['user', 'date'], keep=False)].head())

# Step 1: Remove duplicates based on 'user' and 'date' first
email_df = email_df.drop_duplicates(subset=['user', 'date'], keep='first')

# Step 2: Remove any remaining fully duplicated rows
email_df = email_df.drop_duplicates(keep='first')

# Reset index after dropping duplicates
email_df = email_df.reset_index(drop=True)

# Final dataset shape after cleaning
print("\nDuplicates removed. Updated dataset shape:", email_df.shape)


Total duplicate rows in the dataset: 3938
Total duplicate rows based on 'user' and 'date': 17526

Some fully duplicated rows:

Some duplicate rows based on 'user' and 'date':

Duplicates removed. Updated dataset shape: (2612453, 12)


In [12]:
email_df.to_csv('email_cleaned.csv', index=False)