In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split



In [2]:
# Load the raw e-mail dataset from the working directory; the following cells
# read and rewrite its 'text' and 'spam' columns.
df = pd.read_csv("emails.csv")

In [3]:
# Coerce the message body to strings and the label to floats in a single pass.
df = df.astype({'text': str, 'spam': float})

### Preprocessing: 

In [4]:
# Fetch the NLTK resources used by preprocess_text. 'punkt_tab' is included
# because recent NLTK releases make word_tokenize look up that resource instead
# of the older pickled 'punkt' models; downloading both keeps the cell working
# across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Compiled once: matches every character that is neither an ASCII letter nor
# whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the stop-word corpus is read once, not once per row.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one e-mail body into a space-separated string of tokens.

    Steps, in order: lowercase the text, delete every character that is not an
    ASCII letter or whitespace, tokenize with ``word_tokenize``, drop tokens
    found in NLTK's English stop-word list, and join the rest with single
    spaces.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text; empty if no token survives.
    """
    text = text.lower()
    # Matching characters are deleted, not replaced by a space, so words joined
    # only by punctuation are fused (e.g. "free-money" becomes "freemoney").
    # NOTE(review): confirm this is intended before relying on token counts.
    text = _NON_LETTER_RE.sub('', text)
    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df['text'] = df['text'].apply(preprocess_text)

### Splitting and saving data with random seed 42:

In [6]:
def split_dataframe(dataframe, random_state, test_size=0.3, validation_size=0.5,
                    stratify_col=None):
    """Split a DataFrame into train, test and validation subsets.

    The split is done in two stages: ``test_size`` of the rows are first held
    out, then that held-out part is divided again, with ``validation_size`` of
    it becoming the validation set and the remainder the test set. With the
    defaults this gives roughly 70% / 15% / 15%.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split.
    random_state : int
        Seed passed to both ``train_test_split`` calls.
    test_size : float, default 0.3
        Fraction of ``dataframe`` held out from training.
    validation_size : float, default 0.5
        Fraction of the held-out rows assigned to the validation set.
    stratify_col : str or None, default None
        Name of a column whose class proportions should be preserved in every
        subset. ``None`` keeps the original, non-stratified behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set, validation_set)``.
    """
    # Stage 1: training set versus everything held out.
    stratify_all = dataframe[stratify_col] if stratify_col is not None else None
    train_set, temp_set = train_test_split(
        dataframe,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_all,
    )

    # Stage 2: divide the held-out rows into test and validation sets.
    stratify_temp = temp_set[stratify_col] if stratify_col is not None else None
    test_set, validation_set = train_test_split(
        temp_set,
        test_size=validation_size,
        random_state=random_state,
        stratify=stratify_temp,
    )

    return train_set, test_set, validation_set
    

In [7]:
# First split of the preprocessed frame, using seed 42 and the default
# test_size / validation_size of split_dataframe.
train, test, validation = split_dataframe(df, random_state = 42)

In [8]:
# Write each subset to its own CSV in the working directory, without the index
# column.
for csv_name, subset in (
    ('train.csv', train),
    ('validation.csv', validation),
    ('test.csv', test),
):
    subset.to_csv(csv_name, index=False)

### Implementing DVC:

In [9]:
!dvc init --no-scm --f
!git init

Initialized DVC repository.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>
Initialized empty Git repository in D:/Cmi/Applied ML/Assignment_2/.git/


In [10]:
# Stage the three split files and record them as the seed-42 snapshot.
!git add train.csv test.csv validation.csv
!git commit -m "Added train, test and validation data after splitting with random seed = 42"


[master (root-commit) 27e0ade] Added train, test and validation data after splitting with random seed = 42
 3 files changed, 5731 insertions(+)
 create mode 100644 test.csv
 create mode 100644 train.csv
 create mode 100644 validation.csv


### Splitting with random seed 100:

In [11]:
# Second split of the same preprocessed frame, this time with seed 100; this
# rebinds train, test and validation.
train, test, validation = split_dataframe(df, random_state = 100)

In [12]:
# Overwrite the three CSV files with the current subsets, without the index
# column.
for csv_name, subset in (
    ('train.csv', train),
    ('validation.csv', validation),
    ('test.csv', test),
):
    subset.to_csv(csv_name, index=False)

In [14]:
! git add train.csv test.csv validation.csv
! git commit -m "Added new train, test and validation data with random seed = 150"

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/
	.dvcignore
	Assignment_2.ipynb
	emails.csv

nothing added to commit but untracked files present (use "git add" to track)


In [15]:
# List the commits, one per line, to see the two data snapshots.
! git log --oneline 

9c482c5 Added new train, test and validation data with random seed = 100
27e0ade Added train, test and validation data after splitting with random seed = 42


### Distribution of target variables:

#### Random seed = 42:

In [16]:
! git checkout 27e0ade

Note: switching to '27e0ade'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 27e0ade Added train, test and validation data after splitting with random seed = 42


In [17]:
# Read the three split files currently in the working tree back into frames.
train, test, validation = map(
    pd.read_csv, ("train.csv", "test.csv", "validation.csv")
)

In [18]:
def display_spam_counts(df, name):
    """Print how many rows of ``df`` fall under each value of its 'spam' column.

    The output is a header line ``"<name> DataFrame:"``, the result of
    ``value_counts()`` on the 'spam' column, and a blank separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'spam' column.
    name : str
        Label used in the header line.
    """
    header = f"{name} DataFrame:"
    spam_tally = df['spam'].value_counts()
    # The trailing empty string produces the blank line after the counts.
    print(header, spam_tally, "", sep="\n")

In [19]:
# Report the spam / non-spam counts of each subset, in train, test, validation
# order.
for label, subset in (('Train', train), ('Test', test), ('Validation', validation)):
    display_spam_counts(subset, label)

Train DataFrame:
spam
0.0    3082
1.0     927
Name: count, dtype: int64

Test DataFrame:
spam
0.0    645
1.0    214
Name: count, dtype: int64

Validation DataFrame:
spam
0.0    633
1.0    227
Name: count, dtype: int64



#### Random seed = 100:

In [20]:
! git checkout 9c482c5

Previous HEAD position was 27e0ade Added train, test and validation data after splitting with random seed = 42
HEAD is now at 9c482c5 Added new train, test and validation data with random seed = 100


In [21]:
# Read the three split files currently in the working tree back into frames
# (t = train, te = test, va = validation).
t, te, va = map(pd.read_csv, ("train.csv", "test.csv", "validation.csv"))

In [22]:
# Report the spam / non-spam counts of each subset, in train, test, validation
# order.
for label, subset in (('Train', t), ('Test', te), ('Validation', va)):
    display_spam_counts(subset, label)

Train DataFrame:
spam
0.0    3029
1.0     980
Name: count, dtype: int64

Test DataFrame:
spam
0.0    665
1.0    194
Name: count, dtype: int64

Validation DataFrame:
spam
0.0    666
1.0    194
Name: count, dtype: int64

