Link to dataset: https://www.kaggle.com/code/egenius/starter-cert-insider-threat-cfc11c55-1/input

<h1>Imported Libraries</h1>

In [35]:
import pandas as pd
import re
import string
import nltk
from scipy import stats
from nltk.tokenize import sent_tokenize
from sklearn.preprocessing import StandardScaler

# Download the Punkt Tokenizer Models (only need to do this once)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OnilChibaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<h1>Preprocessing</h1>

<h4>Import Dataset</h4>

In [36]:
dataset = pd.read_csv('email.csv')
dataset.head(5)

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010 07:11:45,LAP0338,PC-5758,Dean.Flynn.Hines@dtaa.com;Wade_Harrison@lockhe...,Nathaniel.Hunter.Heath@dtaa.com,,Lynn.Adena.Pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010 07:12:16,MOH0273,PC-6699,Odonnell-Gage@bellsouth.net,,,MOH68@optonline.net,29942,0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010 07:13:00,LAP0338,PC-5758,Penelope_Colon@netzero.com,,,Lynn_A_Pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010 07:13:17,LAP0338,PC-5758,Judith_Hayden@comcast.net,,,Lynn_A_Pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010 07:13:28,MOH0273,PC-6699,Bond-Raymond@verizon.net;Alea_Ferrell@msn.com;...,,Odonnell-Gage@bellsouth.net,MOH68@optonline.net,17319,0,this kmh october holliswood number advised unu...


<h4>Pre-preprocessing</h4>

Below we convert categorical data to numeric data and restructuring the dataset before diving in to the preprocessing of the dataset.

In [37]:
# Split the 'datetime' column into 'date' and 'time' columns
split_df = dataset['date'].str.split(' ', expand=True)
dataset['date'] = split_df[0]
dataset['time'] = split_df[1].fillna('00:00:00')  # Fill missing time values with a default time if needed
dataset.head()

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content,time
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010,LAP0338,PC-5758,Dean.Flynn.Hines@dtaa.com;Wade_Harrison@lockhe...,Nathaniel.Hunter.Heath@dtaa.com,,Lynn.Adena.Pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...,07:11:45
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010,MOH0273,PC-6699,Odonnell-Gage@bellsouth.net,,,MOH68@optonline.net,29942,0,the breaking called allied reservations former...,07:12:16
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010,LAP0338,PC-5758,Penelope_Colon@netzero.com,,,Lynn_A_Pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...,07:13:00
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010,LAP0338,PC-5758,Judith_Hayden@comcast.net,,,Lynn_A_Pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...,07:13:17
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010,MOH0273,PC-6699,Bond-Raymond@verizon.net;Alea_Ferrell@msn.com;...,,Odonnell-Gage@bellsouth.net,MOH68@optonline.net,17319,0,this kmh october holliswood number advised unu...,07:13:28


In [38]:
# Convert email addresses to the number of addresses in each row
dataset['to'] = dataset['to'].str.count(';') + 1
dataset.head()

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content,time
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010,LAP0338,PC-5758,2,Nathaniel.Hunter.Heath@dtaa.com,,Lynn.Adena.Pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...,07:11:45
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010,MOH0273,PC-6699,1,,,MOH68@optonline.net,29942,0,the breaking called allied reservations former...,07:12:16
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010,LAP0338,PC-5758,1,,,Lynn_A_Pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...,07:13:00
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010,LAP0338,PC-5758,1,,,Lynn_A_Pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...,07:13:17
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010,MOH0273,PC-6699,3,,Odonnell-Gage@bellsouth.net,MOH68@optonline.net,17319,0,this kmh october holliswood number advised unu...,07:13:28


In [39]:
# Convert email addresses to the number of addresses in each row
dataset['cc'] = dataset['cc'].apply(lambda x: x.count(';') + 1 if pd.notna(x) else 0)
dataset.head()

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content,time
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010,LAP0338,PC-5758,2,1,,Lynn.Adena.Pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...,07:11:45
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010,MOH0273,PC-6699,1,0,,MOH68@optonline.net,29942,0,the breaking called allied reservations former...,07:12:16
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010,LAP0338,PC-5758,1,0,,Lynn_A_Pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...,07:13:00
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010,LAP0338,PC-5758,1,0,,Lynn_A_Pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...,07:13:17
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010,MOH0273,PC-6699,3,0,Odonnell-Gage@bellsouth.net,MOH68@optonline.net,17319,0,this kmh october holliswood number advised unu...,07:13:28


In [40]:
# Convert email addresses to the number of addresses in each row
dataset['bcc'] = dataset['bcc'].apply(lambda x: x.count(';') + 1 if pd.notna(x) else 0)
dataset.head()

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content,time
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010,LAP0338,PC-5758,2,1,0,Lynn.Adena.Pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...,07:11:45
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010,MOH0273,PC-6699,1,0,0,MOH68@optonline.net,29942,0,the breaking called allied reservations former...,07:12:16
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010,LAP0338,PC-5758,1,0,0,Lynn_A_Pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...,07:13:00
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010,LAP0338,PC-5758,1,0,0,Lynn_A_Pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...,07:13:17
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010,MOH0273,PC-6699,3,0,1,MOH68@optonline.net,17319,0,this kmh october holliswood number advised unu...,07:13:28


In [41]:
# Convert the text in 'from' to lowercase
dataset['from'] = dataset['from'].str.lower()
dataset.head()

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content,time
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010,LAP0338,PC-5758,2,1,0,lynn.adena.pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...,07:11:45
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010,MOH0273,PC-6699,1,0,0,moh68@optonline.net,29942,0,the breaking called allied reservations former...,07:12:16
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010,LAP0338,PC-5758,1,0,0,lynn_a_pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...,07:13:00
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010,LAP0338,PC-5758,1,0,0,lynn_a_pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...,07:13:17
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010,MOH0273,PC-6699,3,0,1,moh68@optonline.net,17319,0,this kmh october holliswood number advised unu...,07:13:28


In [42]:
# Convert the text in 'content' to lowercase
dataset['content'] = dataset['content'].str.lower()
dataset.head()

Unnamed: 0,id,date,user,pc,to,cc,bcc,from,size,attachments,content,time
0,{R3I7-S4TX96FG-8219JWFF},01/02/2010,LAP0338,PC-5758,2,1,0,lynn.adena.pratt@dtaa.com,25830,0,middle f2 systems 4 july techniques powerful d...,07:11:45
1,{R0R9-E4GL59IK-2907OSWJ},01/02/2010,MOH0273,PC-6699,1,0,0,moh68@optonline.net,29942,0,the breaking called allied reservations former...,07:12:16
2,{G2B2-A8XY58CP-2847ZJZL},01/02/2010,LAP0338,PC-5758,1,0,0,lynn_a_pratt@earthlink.net,28780,0,slowly this uncinus winter beneath addition ex...,07:13:00
3,{A3A9-F4TH89AA-8318GFGK},01/02/2010,LAP0338,PC-5758,1,0,0,lynn_a_pratt@earthlink.net,21907,0,400 other difficult land cirrocumulus powered ...,07:13:17
4,{E8B7-C8FZ88UF-2946RUQQ},01/02/2010,MOH0273,PC-6699,3,0,1,moh68@optonline.net,17319,0,this kmh october holliswood number advised unu...,07:13:28


In [43]:
# Rearrange columns
dataset = dataset[['id', 'pc', 'user', 'date', 'time', 'from', 'to', 'cc', 'bcc', 'size', 'attachments', 'content']]
dataset.head()

Unnamed: 0,id,pc,user,date,time,from,to,cc,bcc,size,attachments,content
0,{R3I7-S4TX96FG-8219JWFF},PC-5758,LAP0338,01/02/2010,07:11:45,lynn.adena.pratt@dtaa.com,2,1,0,25830,0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},PC-6699,MOH0273,01/02/2010,07:12:16,moh68@optonline.net,1,0,0,29942,0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},PC-5758,LAP0338,01/02/2010,07:13:00,lynn_a_pratt@earthlink.net,1,0,0,28780,0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},PC-5758,LAP0338,01/02/2010,07:13:17,lynn_a_pratt@earthlink.net,1,0,0,21907,0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},PC-6699,MOH0273,01/02/2010,07:13:28,moh68@optonline.net,3,0,1,17319,0,this kmh october holliswood number advised unu...


<h1>Preprocessing</h1>

convert date and time to numerical data for the standardization and normalization process.

In [44]:
# Remove '/' from the 'date'
dataset['date'] = dataset['date'].str.replace('/', '')
dataset

Unnamed: 0,id,pc,user,date,time,from,to,cc,bcc,size,attachments,content
0,{R3I7-S4TX96FG-8219JWFF},PC-5758,LAP0338,01022010,07:11:45,lynn.adena.pratt@dtaa.com,2,1,0,25830,0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},PC-6699,MOH0273,01022010,07:12:16,moh68@optonline.net,1,0,0,29942,0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},PC-5758,LAP0338,01022010,07:13:00,lynn_a_pratt@earthlink.net,1,0,0,28780,0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},PC-5758,LAP0338,01022010,07:13:17,lynn_a_pratt@earthlink.net,1,0,0,21907,0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},PC-6699,MOH0273,01022010,07:13:28,moh68@optonline.net,3,0,1,17319,0,this kmh october holliswood number advised unu...
...,...,...,...,...,...,...,...,...,...,...,...,...
2629974,{L7U1-S3KK01XW-3123TPUC},PC-1117,HRL0540,05162011,20:54:18,hedwig.regina.livingston@dtaa.com,2,1,0,33088,0,history designed stephen degree ignore them ad...
2629975,{Q4Y3-X9HV77CL-3290SOZH},PC-4973,LAF0991,05162011,20:54:43,lucas.ahmed.ferrell@dtaa.com,1,2,0,33249,1,prince prince ahmose ahmose ankh prince prince...
2629976,{L6B7-F7RU89CD-0355JGGE},PC-4973,LAF0991,05162011,21:08:12,ferrell.lucas@sbcglobal.net,2,0,0,41336,1,lifted documents 65 declined revival 14 unprec...
2629977,{F4R9-W0OQ41HB-0157JSUE},PC-1397,JMW0638,05162011,21:15:35,jonah.merritt.wilder@dtaa.com,1,0,0,39908,0,their official holmes face arranged among priz...


In [45]:
# Remove ':' from the 'time'
dataset['time'] = dataset['time'].str.replace(':', '')
dataset

Unnamed: 0,id,pc,user,date,time,from,to,cc,bcc,size,attachments,content
0,{R3I7-S4TX96FG-8219JWFF},PC-5758,LAP0338,01022010,071145,lynn.adena.pratt@dtaa.com,2,1,0,25830,0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},PC-6699,MOH0273,01022010,071216,moh68@optonline.net,1,0,0,29942,0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},PC-5758,LAP0338,01022010,071300,lynn_a_pratt@earthlink.net,1,0,0,28780,0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},PC-5758,LAP0338,01022010,071317,lynn_a_pratt@earthlink.net,1,0,0,21907,0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},PC-6699,MOH0273,01022010,071328,moh68@optonline.net,3,0,1,17319,0,this kmh october holliswood number advised unu...
...,...,...,...,...,...,...,...,...,...,...,...,...
2629974,{L7U1-S3KK01XW-3123TPUC},PC-1117,HRL0540,05162011,205418,hedwig.regina.livingston@dtaa.com,2,1,0,33088,0,history designed stephen degree ignore them ad...
2629975,{Q4Y3-X9HV77CL-3290SOZH},PC-4973,LAF0991,05162011,205443,lucas.ahmed.ferrell@dtaa.com,1,2,0,33249,1,prince prince ahmose ahmose ankh prince prince...
2629976,{L6B7-F7RU89CD-0355JGGE},PC-4973,LAF0991,05162011,210812,ferrell.lucas@sbcglobal.net,2,0,0,41336,1,lifted documents 65 declined revival 14 unprec...
2629977,{F4R9-W0OQ41HB-0157JSUE},PC-1397,JMW0638,05162011,211535,jonah.merritt.wilder@dtaa.com,1,0,0,39908,0,their official holmes face arranged among priz...


Converting the PC- column to niumeric by removing the PC- string and keeping the PC value only.

In [46]:
# Remove the "PC-" prefix
dataset['pc'] = dataset['pc'].str.replace('PC-', '')
dataset

Unnamed: 0,id,pc,user,date,time,from,to,cc,bcc,size,attachments,content
0,{R3I7-S4TX96FG-8219JWFF},5758,LAP0338,01022010,071145,lynn.adena.pratt@dtaa.com,2,1,0,25830,0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},6699,MOH0273,01022010,071216,moh68@optonline.net,1,0,0,29942,0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},5758,LAP0338,01022010,071300,lynn_a_pratt@earthlink.net,1,0,0,28780,0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},5758,LAP0338,01022010,071317,lynn_a_pratt@earthlink.net,1,0,0,21907,0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},6699,MOH0273,01022010,071328,moh68@optonline.net,3,0,1,17319,0,this kmh october holliswood number advised unu...
...,...,...,...,...,...,...,...,...,...,...,...,...
2629974,{L7U1-S3KK01XW-3123TPUC},1117,HRL0540,05162011,205418,hedwig.regina.livingston@dtaa.com,2,1,0,33088,0,history designed stephen degree ignore them ad...
2629975,{Q4Y3-X9HV77CL-3290SOZH},4973,LAF0991,05162011,205443,lucas.ahmed.ferrell@dtaa.com,1,2,0,33249,1,prince prince ahmose ahmose ankh prince prince...
2629976,{L6B7-F7RU89CD-0355JGGE},4973,LAF0991,05162011,210812,ferrell.lucas@sbcglobal.net,2,0,0,41336,1,lifted documents 65 declined revival 14 unprec...
2629977,{F4R9-W0OQ41HB-0157JSUE},1397,JMW0638,05162011,211535,jonah.merritt.wilder@dtaa.com,1,0,0,39908,0,their official holmes face arranged among priz...


In [47]:
# Rearrange columns
dataset = dataset[['id', 'user', 'from', 'pc','date', 'time', 'to', 'cc', 'bcc', 'size', 'attachments', 'content']]
dataset.head()

Unnamed: 0,id,user,from,pc,date,time,to,cc,bcc,size,attachments,content
0,{R3I7-S4TX96FG-8219JWFF},LAP0338,lynn.adena.pratt@dtaa.com,5758,1022010,71145,2,1,0,25830,0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},MOH0273,moh68@optonline.net,6699,1022010,71216,1,0,0,29942,0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},LAP0338,lynn_a_pratt@earthlink.net,5758,1022010,71300,1,0,0,28780,0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},LAP0338,lynn_a_pratt@earthlink.net,5758,1022010,71317,1,0,0,21907,0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},MOH0273,moh68@optonline.net,6699,1022010,71328,3,0,1,17319,0,this kmh october holliswood number advised unu...


Feature Engineering

In [48]:
# Character Count:
dataset['char_count'] = dataset['content'].apply(len)

# Word Count:
dataset['word_count'] = dataset['content'].apply(lambda x: len(x.split()))

# Average Word Length:
dataset['avg_word_length'] = dataset['content'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if len(x.split()) > 0 else 0)

# Counting the number of sentences
dataset['sentence_count'] = dataset['content'].apply(lambda x: len(sent_tokenize(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['char_count'] = dataset['content'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['word_count'] = dataset['content'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['avg_word_length'] = dataset['content'].apply(lambda x: sum(len(word

In [49]:
# Rearrange columns
dataset = dataset[['id', 'user', 'from', 'pc','date', 'time', 'to', 'cc', 'bcc', 'size', 'attachments', 'char_count', 'word_count', 'avg_word_length', 'sentence_count', 'content']]
dataset.head()

Unnamed: 0,id,user,from,pc,date,time,to,cc,bcc,size,attachments,char_count,word_count,avg_word_length,sentence_count,content
0,{R3I7-S4TX96FG-8219JWFF},LAP0338,lynn.adena.pratt@dtaa.com,5758,1022010,71145,2,1,0,25830,0,415,63,5.603175,1,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},MOH0273,moh68@optonline.net,6699,1022010,71216,1,0,0,29942,0,307,43,6.162791,1,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},LAP0338,lynn_a_pratt@earthlink.net,5758,1022010,71300,1,0,0,28780,0,279,40,6.0,1,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},LAP0338,lynn_a_pratt@earthlink.net,5758,1022010,71317,1,0,0,21907,0,359,49,6.346939,1,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},MOH0273,moh68@optonline.net,6699,1022010,71328,3,0,1,17319,0,364,50,6.3,1,this kmh october holliswood number advised unu...


Standardization

Before standardizing the data, we have to convert the pc, date and time varialbles to type int64

In [50]:
# Convert column 'pc', 'date' and time to int64
dataset['pc'] = dataset['pc'].astype('int64')

dataset['date'] = dataset['date'].astype('int64')

dataset['time'] = dataset['time'].astype('int64')
dataset.head()

Unnamed: 0,id,user,from,pc,date,time,to,cc,bcc,size,attachments,char_count,word_count,avg_word_length,sentence_count,content
0,{R3I7-S4TX96FG-8219JWFF},LAP0338,lynn.adena.pratt@dtaa.com,5758,1022010,71145,2,1,0,25830,0,415,63,5.603175,1,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},MOH0273,moh68@optonline.net,6699,1022010,71216,1,0,0,29942,0,307,43,6.162791,1,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},LAP0338,lynn_a_pratt@earthlink.net,5758,1022010,71300,1,0,0,28780,0,279,40,6.0,1,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},LAP0338,lynn_a_pratt@earthlink.net,5758,1022010,71317,1,0,0,21907,0,359,49,6.346939,1,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},MOH0273,moh68@optonline.net,6699,1022010,71328,3,0,1,17319,0,364,50,6.3,1,this kmh october holliswood number advised unu...


In [51]:
# Identify numeric columns
numeric_columns = dataset.select_dtypes(include=['float64', 'int64']).columns

# Standardize numeric columns
scaler = StandardScaler()
dataset[numeric_columns] = scaler.fit_transform(dataset[numeric_columns])
dataset.head()

Unnamed: 0,id,user,from,pc,date,time,to,cc,bcc,size,attachments,char_count,word_count,avg_word_length,sentence_count,content
0,{R3I7-S4TX96FG-8219JWFF},LAP0338,lynn.adena.pratt@dtaa.com,0.225155,-1.350869,-1.776521,0.391517,0.657642,-0.422412,-0.416497,-0.38441,0.799004,1.347242,-1.226924,0.0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},MOH0273,moh68@optonline.net,0.544371,-1.350869,-1.774182,-0.695123,-0.68733,-0.422412,-0.005035,-0.38441,-0.62978,-0.649797,-0.042513,0.0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},LAP0338,lynn_a_pratt@earthlink.net,0.225155,-1.350869,-1.771414,-0.695123,-0.68733,-0.422412,-0.121309,-0.38441,-1.000205,-0.949353,-0.387054,0.0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},LAP0338,lynn_a_pratt@earthlink.net,0.225155,-1.350869,-1.770854,-0.695123,-0.68733,-0.422412,-0.809047,-0.38441,0.058153,-0.050686,0.347231,0.0,400 other difficult land cirrocumulus powered ...
4,{E8B7-C8FZ88UF-2946RUQQ},MOH0273,moh68@optonline.net,0.544371,-1.350869,-1.770491,1.478157,-0.68733,2.101201,-1.268139,-0.38441,0.124301,0.049166,0.247887,0.0,this kmh october holliswood number advised unu...


Identify numerical anomalies.

In [57]:
# Calculate Z-scores
z_scores = stats.zscore(dataset.select_dtypes(include=['float64', 'int64']))

# Get boolean DataFrame indicating the presence of anomalies (using a threshold, e.g., 2)
anomalies_bool_df = pd.DataFrame(z_scores, columns=dataset.select_dtypes(include=['float64', 'int64']).columns, index=dataset.index)
anomalies_bool_df = (anomalies_bool_df > 2) | (anomalies_bool_df < -2)

# Extract anomalies
anomalies = dataset[anomalies_bool_df.any(axis=1)]

# Drop anomalies from the original dataset
df_no_anomalies = dataset.drop(anomalies.index)

# Display the datasets
print("Normal Dataset:")
print(df_no_anomalies)
print("\nAnomalous Dataset:")
print(anomalies)

Normal Dataset:
                               id     user                             from  \
0        {R3I7-S4TX96FG-8219JWFF}  LAP0338        lynn.adena.pratt@dtaa.com   
1        {R0R9-E4GL59IK-2907OSWJ}  MOH0273              moh68@optonline.net   
2        {G2B2-A8XY58CP-2847ZJZL}  LAP0338       lynn_a_pratt@earthlink.net   
3        {A3A9-F4TH89AA-8318GFGK}  LAP0338       lynn_a_pratt@earthlink.net   
5        {X8T7-A6BT54FP-7241DLBV}  HVB0037        hollee_becker@hotmail.com   
...                           ...      ...                              ...   
2629876  {P4V8-L7RO47MY-3213UULA}  AMR0400   aurelia.martina.reese@dtaa.com   
2629877  {O1Z8-G3PF06XP-5478BCXH}  IAR0694        imani.aubrey.ray@dtaa.com   
2629879  {N1S7-E0NN19WN-8990TPMW}  IYB0918  illana.y.bradshaw@earthlink.net   
2629880  {E6T8-T3NA73UB-5296YCSP}  KMB0922           benson-keegan@juno.com   
2629881  {C8H6-E1SY12PC-3605SXAV}  IYB0918  illana.yvette.bradshaw@dtaa.com   

               pc      date      ti

In [59]:
df_no_anomalies

Unnamed: 0,id,user,from,pc,date,time,to,cc,bcc,size,attachments,char_count,word_count,avg_word_length,sentence_count,content
0,{R3I7-S4TX96FG-8219JWFF},LAP0338,lynn.adena.pratt@dtaa.com,0.225155,-1.350869,-1.776521,0.391517,0.657642,-0.422412,-0.416497,-0.38441,0.799004,1.347242,-1.226924,0.0,middle f2 systems 4 july techniques powerful d...
1,{R0R9-E4GL59IK-2907OSWJ},MOH0273,moh68@optonline.net,0.544371,-1.350869,-1.774182,-0.695123,-0.687330,-0.422412,-0.005035,-0.38441,-0.629780,-0.649797,-0.042513,0.0,the breaking called allied reservations former...
2,{G2B2-A8XY58CP-2847ZJZL},LAP0338,lynn_a_pratt@earthlink.net,0.225155,-1.350869,-1.771414,-0.695123,-0.687330,-0.422412,-0.121309,-0.38441,-1.000205,-0.949353,-0.387054,0.0,slowly this uncinus winter beneath addition ex...
3,{A3A9-F4TH89AA-8318GFGK},LAP0338,lynn_a_pratt@earthlink.net,0.225155,-1.350869,-1.770854,-0.695123,-0.687330,-0.422412,-0.809047,-0.38441,0.058153,-0.050686,0.347231,0.0,400 other difficult land cirrocumulus powered ...
5,{X8T7-A6BT54FP-7241DLBV},HVB0037,hollee_becker@hotmail.com,0.978585,-1.350869,-1.695524,-0.695123,0.657642,-0.422412,1.436181,-0.38441,-0.523944,-0.150538,-1.268917,0.0,little equal k is group cannot though with lea...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2629876,{P4V8-L7RO47MY-3213UULA},AMR0400,aurelia.martina.reese@dtaa.com,-0.959435,-0.133405,1.982302,-0.695123,-0.687330,-0.422412,-0.018044,-0.38441,-0.047683,0.049166,-0.302396,0.0,ten classify huolongjing prematurely m198 13 r...
2629877,{O1Z8-G3PF06XP-5478BCXH},IAR0694,imani.aubrey.ray@dtaa.com,-0.776929,-0.133405,1.983620,-0.695123,-0.687330,-0.422412,0.670694,-0.38441,0.468267,-0.050686,1.686223,0.0,non ironically pay virtually amongst preventin...
2629879,{N1S7-E0NN19WN-8990TPMW},IYB0918,illana.y.bradshaw@earthlink.net,0.347617,-0.133405,1.991924,0.391517,-0.687330,-0.422412,-0.196257,-0.38441,-0.735616,-0.849501,0.284022,0.0,determined now 75 scene successful functioning...
2629880,{E6T8-T3NA73UB-5296YCSP},KMB0922,benson-keegan@juno.com,-1.142958,-0.133405,1.993472,-0.695123,0.657642,-0.422412,-0.667957,-0.38441,0.071383,0.149018,-0.221057,0.0,best than anton husbands drawn serf themes beh...
