In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

In [4]:
# Fetch the NLTK resources used by the preprocessing step: 'punkt' for
# nltk.word_tokenize and 'stopwords' for nltk.corpus.stopwords.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Akshay
[nltk_data]     Thorave\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Akshay
[nltk_data]     Thorave\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Create the Git repository in the current directory (re-initialises harmlessly if one exists).
!git init

Reinitialized existing Git repository in C:/Users/Akshay Thorave/Desktop/AML_Assignment_2/AML_Assignment2/.git/


In [8]:
# Set up DVC inside the Git repository; -f forces re-initialisation, replacing any existing .dvc/ directory.
!dvc init -f

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


In [9]:
# Let DVC automatically `git add` the files it creates or modifies (e.g. *.dvc, .gitignore).
!dvc config core.autostage true

In [10]:
# Check which DVC bookkeeping files were staged by the initialisation.
!git status

On branch dvc
Your branch is up to date with 'origin/dvc'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	new file:   .dvc/.gitignore
	new file:   .dvc/config
	new file:   .dvcignore

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.ipynb_checkpoints/
	Untitled.ipynb



In [5]:
# Load the raw email dataset from the notebook's working directory;
# later cells rely on its 'text' and 'spam' columns.
data = pd.read_csv('emails.csv')

In [6]:
# Preview the first few rows of the raw data.
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [11]:
# Count missing values per column.
data.isnull().sum()

text    0
spam    0
dtype: int64

In [12]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

33

In [13]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The original index labels are kept (the index is not reset).
data = data.drop_duplicates(keep = 'first')

In [14]:
# Class balance of the label column after de-duplication
# (presumably 1 = spam, 0 = not spam — confirm against the dataset description).
data['spam'].value_counts()

0    4327
1    1368
Name: spam, dtype: int64

## Data Preprocessing

In [15]:
def text_processing(text):
    """Normalise a raw email string into a space-separated string of tokens.

    The text is lowercased, stripped of every character that is not a letter,
    digit or whitespace, stripped of digits, whitespace-collapsed, tokenised
    with ``nltk.word_tokenize`` and filtered against NLTK's English stopword
    list. A leading ``subject`` token (the email header prefix) is removed.

    Parameters
    ----------
    text : str
        Raw email text; in this dataset every row starts with ``Subject:``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Use regex to remove all digits (numbers)
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = ' '.join(text.split())

    # Convert the normalised text into tokens
    tokens = nltk.word_tokenize(text)

    # Load the stopword list once per call and use a set for O(1) lookups,
    # instead of re-reading the corpus list for every single token.
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [token for token in tokens if token not in english_stopwords]

    # Every row starts with the header word 'subject', which carries no signal.
    # Drop it only when it is actually there, so text without that prefix does
    # not silently lose its first real word.
    if kept_tokens and kept_tokens[0] == 'subject':
        kept_tokens = kept_tokens[1:]

    return " ".join(kept_tokens)


In [16]:
# Attach the cleaned text as a new column. assign() returns a new DataFrame,
# so the column is added to a fresh copy rather than to the de-duplicated frame.
data = data.assign(processed_text=data['text'].apply(text_processing))

In [17]:
# The raw text is no longer needed once the processed column exists.
data = data.drop(columns=['text'])
data

Unnamed: 0,spam,processed_text
0,1,naturally irresistible corporate identity lt r...
1,1,stock trading gunslinger fanny merrill muzo co...
2,1,unbelievable new homes made easy im wanting sh...
3,1,color printing special request additional info...
4,1,money get software cds software compatibility ...
...,...,...
5723,0,research development charges gpg forwarded shi...
5724,0,receipts visit jim thanks invitation visit lsu...
5725,0,enron case study update wow day super thank mu...
5726,0,interest david please call shirley crenshaw as...


## Creating First Version of data

In [18]:
# Features are the cleaned email text; the target is the spam flag.
X, y = data['processed_text'], data['spam']

# Hold out 20% of the rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=25
)

# ... then take 25% of the remaining 80% for validation, which gives a
# 60/20/20 train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=50
)

In [19]:
# Combine the training texts and labels into one frame (both share the same index).
X_train_df = pd.DataFrame({'Email': X_train, 'Spam': y_train})
X_train_df

Unnamed: 0,Email,Spam
5469,gas model chaim received number phone messages...,0
4315,backtesting different percentiles vlady enclos...,0
3452,year end performance feedback note receive mes...,0
693,neugierig content type text plain content tran...,1
3698,garp frank reviewed materials garp find inform...,0
...,...,...
1534,friend mine shirley please arrange phone inter...,0
5460,var cob nd aug hi vince waiting comment email ...,0
3831,willow pathstar evaluations please respond mik...,0
125,cool medz hello welcome medzonli decapitation ...,1


In [20]:
# Combine the test texts and labels into one frame (both share the same index).
X_test_df = pd.DataFrame({'Email': X_test, 'Spam': y_test})
X_test_df

Unnamed: 0,Email,Spam
1615,forwarded vince j kaminski hou ect gould aaron...,0
611,new love tabs shop visit llcensed online drags...,1
4247,additional e mail addresses vince three new st...,0
3662,synfuel option valuation lenny believe must do...,0
3849,missing prc information vince following inform...,0
...,...,...
2533,congratulations dear vince soooo gland see get...,0
4515,additional attachments vince forgot attach fin...,0
4223,visit enron professor nalin kulatilaka boston ...,0
4011,subscription renewal barbara yes would like re...,0


In [21]:
# Combine the validation texts and labels into one frame (both share the same index).
X_val_df = pd.DataFrame({'Email': X_val, 'Spam': y_val})
X_val_df

Unnamed: 0,Email,Spam
2074,fyi enron best hi vince spoke molly mcgee hr g...,0
4264,fw gmm mar jeff newsletter addressed wide audi...,0
5586,russian investment climate multimedia playback...,0
627,professional advertising dear projecthoneypot ...,1
3773,supply rebound beginning update cera outlook u...,0
...,...,...
5656,new eprm speakers vince thanks much help helen...,0
3808,technical writer position note confirm cease r...,0
2692,calling pm pm hi vince thank allowing call spe...,0
2720,fw london wish list oops sent previous email a...,0


In [22]:
# Write each split to its CSV file, without the index column.
for split_frame, csv_name in (
    (X_train_df, 'train.csv'),
    (X_test_df, 'test.csv'),
    (X_val_df, 'validation.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [23]:
# Put the three split files under DVC control; DVC writes a small .dvc pointer file for each.
!dvc add train.csv validation.csv test.csv

In [24]:
# Stage the DVC pointer files in Git (Git versions the pointers, not the CSV contents).
# NOTE(review): likely redundant while core.autostage is enabled — confirm.
!git add train.csv.dvc validation.csv.dvc test.csv.dvc

In [25]:
# Record the staged pointer files as the first data version.
!git commit -m "First version of data"

[dvc 461455a] First version of data
 7 files changed, 24 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


In [26]:
# Confirm the commit went through and see what is still unstaged or untracked.
!git status

On branch dvc
Your branch is ahead of 'origin/dvc' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.ipynb_checkpoints/
	Untitled.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


In [27]:
# List the commit history; the commit hashes are used later to switch between data versions.
!git log

commit 461455a817a6b3fb4520dd36b2c5162dc15280a4
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 22:39:28 2024 +0530

    First version of data

commit 8906d3ffe6b1a27ae6ec05a08b55127a1c47a635
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 22:12:56 2024 +0530

    All files are removed except email

commit 0494e06f85a2e8115217afaf78a3ba30cf0d17d1
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 21:29:02 2024 +0530

    updated

commit 42dd711aee358a7dfb2db8e9c087e19c7cbc6955
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 20:07:57 2024 +0530

    Jupyter notebook added

commit ac0af98a77942db778ad656ecaec9cb117e47cbe
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 14:43:14 2024 +0530

    Dataset uploaded


In [28]:
# Publish the local commit to the remote branch.
# NOTE(review): this pushes only the Git-tracked .dvc pointer files; no `dvc push`
# appears in the visible cells, so the CSV contents presumably exist only in the
# local DVC cache — confirm whether a DVC remote is intended.
!git push

To https://github.com/akshaythorave/AML_Assignment2.git
   8906d3f..461455a  dvc -> dvc


## Updated Version of data

In [29]:
# Features are the cleaned email text; the target is the spam flag.
X, y = data['processed_text'], data['spam']

# Same 20% test hold-out as the first data version, but with a different
# random seed, so rows are assigned to the splits differently.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=525
)

# 25% of the remaining 80% goes to validation (60/20/20 overall), again with a new seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=550
)

In [30]:
# Combine the re-sampled training texts and labels into one frame (both share the same index).
X_train_df = pd.DataFrame({'Email': X_train, 'Spam': y_train})
X_train_df

Unnamed: 0,Email,Spam
3743,candlestick charts fyi fallout forwarded mike ...,0
1991,fyi,0
990,take action immediately miss attention valued ...,1
2422,ameriflash newsletter business highlights coal...,0
929,viagrra scores hello welcome pharmonlin purita...,1
...,...,...
11,save money buy getting thing tried cialls yet ...,1
4667,programming rdi model michelle met cecil chris...,0
1307,wish dd tried sooner save supper medlcations r...,1
3902,summer internship hi vince writing time inquir...,0


In [31]:
# Combine the re-sampled test texts and labels into one frame (both share the same index).
X_test_df = pd.DataFrame({'Email': X_test, 'Spam': y_test})
X_test_df

Unnamed: 0,Email,Spam
2112,storage book ravi samer met morning sara ledbe...,0
3717,meeting wharton office next week please send d...,0
4795,enron recruiting mscf speaker series vince pie...,0
2421,research sign steve sign research group someth...,0
2097,f u iris mack mba phd enron vince iris receive...,0
...,...,...
3402,visit enron frank great idea think opportunity...,0
1472,meeting bob butts scheduled pm thursday th off...,0
1746,visit vince kaminski enron corp research dear ...,0
4206,resume mark giancola attached resume mark gian...,0


In [32]:
# Combine the re-sampled validation texts and labels into one frame (both share the same index).
X_val_df = pd.DataFrame({'Email': X_val, 'Spam': y_val})
X_val_df

Unnamed: 0,Email,Spam
134,get latest version cds download wide range sof...,1
3837,yana kristal rotation hi vince already spoke m...,0
61,select small cap astute investors momentum ale...,1
4173,power plant model jeff comments model reservat...,0
3753,bullet points hi vince thanks bullets regardin...,0
...,...,...
138,make dialup go faster visioson hpp za net find...,1
99,save money buy getting thing tried cialls yet ...,1
661,investor insight oil gas advisory oi gas enter...,1
999,percent life insurance get free quote instantl...,1


In [33]:
# Overwrite the three CSV files with the re-sampled splits, without the index column.
for split_frame, csv_name in (
    (X_train_df, 'train.csv'),
    (X_test_df, 'test.csv'),
    (X_val_df, 'validation.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [34]:
# Re-register the overwritten split files with DVC, which updates their .dvc pointer files.
!dvc add train.csv validation.csv test.csv

In [35]:
# Stage the updated DVC pointer files in Git.
# NOTE(review): likely redundant while core.autostage is enabled — confirm.
!git add train.csv.dvc validation.csv.dvc test.csv.dvc

In [36]:
# Record the updated pointer files as the second data version.
!git commit -m "Updated version of data"

[dvc 6292915] Updated version of data
 3 files changed, 6 insertions(+), 6 deletions(-)


In [37]:
# List the history again; it should now contain both data-version commits.
!git log

commit 62929150a13323fa2e0738e53041e75de6b0a564
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 23:03:15 2024 +0530

    Updated version of data

commit 461455a817a6b3fb4520dd36b2c5162dc15280a4
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 22:39:28 2024 +0530

    First version of data

commit 8906d3ffe6b1a27ae6ec05a08b55127a1c47a635
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 22:12:56 2024 +0530

    All files are removed except email

commit 0494e06f85a2e8115217afaf78a3ba30cf0d17d1
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 21:29:02 2024 +0530

    updated

commit 42dd711aee358a7dfb2db8e9c087e19c7cbc6955
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 20:07:57 2024 +0530

    Jupyter notebook added

commit ac0af98a77942db778ad656ecaec9cb117e47cbe
Author: Akshay Thorave <thorave@cmi.ac.in>
Date:   Tue Feb 20 14:43:14 2024 +0530

    Dataset uploaded


In [38]:
# Switch the Git working tree to the "First version of data" commit.
# Checking out by hash leaves HEAD detached.
!git checkout 461455a817a6b3fb4520dd36b2c5162dc15280a4

M	.dvc/config


Note: switching to '461455a817a6b3fb4520dd36b2c5162dc15280a4'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 461455a First version of data


In [39]:
# Make the CSV files in the workspace match the .dvc pointer files of the commit just checked out.
!dvc checkout

M       validation.csv
M       train.csv
M       test.csv


In [40]:
def _report_spam_counts(heading, csv_path):
    """Print `heading`, load `csv_path` and print its 'Spam' value counts.

    Returns the loaded DataFrame so the caller can keep a reference to it.
    """
    print(heading)
    split_frame = pd.read_csv(csv_path)
    print(split_frame['Spam'].value_counts())
    return split_frame


# Class distribution of each split for the first data version restored by the checkout cells.
train_data = _report_spam_counts(
    "Distribution of response variable in train.csv (First version of data):",
    'train.csv',
)

validation_data = _report_spam_counts(
    "Distribution of response variable in validation.csv (First version of data):",
    'validation.csv',
)

test_data = _report_spam_counts(
    "Distribution of response variable in test.csv (First version of data):",
    'test.csv',
)

Distribution of response variable in train.csv (First version of data):
0    2605
1     812
Name: Spam, dtype: int64
Distribution of response variable in validation.csv (First version of data):
0    860
1    279
Name: Spam, dtype: int64
Distribution of response variable in test.csv (First version of data):
0    862
1    277
Name: Spam, dtype: int64


In [41]:
# Switch the Git working tree to the "Updated version of data" commit.
# NOTE(review): checking out by hash keeps HEAD detached; `git switch dvc` would
# return to the branch instead — confirm which is intended.
!git checkout 62929150a13323fa2e0738e53041e75de6b0a564

M	.dvc/config


Previous HEAD position was 461455a First version of data
HEAD is now at 6292915 Updated version of data


In [42]:
# Make the CSV files in the workspace match the .dvc pointer files of the commit just checked out.
!dvc checkout

M       validation.csv
M       train.csv
M       test.csv


In [43]:
# Class distribution of each split for the commit checked out just before this
# cell ("Updated version of data"). The headings name that version so the
# printed report is not mistaken for the first data version.
print("Distribution of response variable in train.csv (Updated version of data):")
train_data = pd.read_csv('train.csv')
print(train_data['Spam'].value_counts())


print("Distribution of response variable in validation.csv (Updated version of data):")
validation_data = pd.read_csv('validation.csv')
print(validation_data['Spam'].value_counts())


print("Distribution of response variable in test.csv (Updated version of data):")
test_data = pd.read_csv('test.csv')
print(test_data['Spam'].value_counts())

Distribution of response variable in train.csv (First version of data):
0    2595
1     822
Name: Spam, dtype: int64
Distribution of response variable in validation.csv (First version of data):
0    864
1    275
Name: Spam, dtype: int64
Distribution of response variable in test.csv (First version of data):
0    868
1    271
Name: Spam, dtype: int64
