# Import Necessary Libraries

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import dvc

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# from google.colab import drive
# drive.mount('/content/drive')

# Define Necessary Functions

In [35]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

def preprocess_data(data):
    """Return a copy of ``data`` whose ``text`` column has been normalized.

    Every entry of ``data['text']`` goes through the same steps, in order:
    lower-casing, NLTK ``word_tokenize``, removal of English stop words,
    ``WordNetLemmatizer`` lemmatization, and re-joining the surviving tokens
    with single spaces.

    The input frame is not modified, so the raw text stays available to the
    caller and re-running the cell always starts from the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``text`` column. Missing values are treated as
        empty strings and other non-string values are converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``data`` and the cleaned ``text``.
    """
    # Build these once; both are reused for every row.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        # Single pass per document instead of materializing a token-list column per step.
        tokens = word_tokenize(text.lower())
        return ' '.join(
            lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
        )

    # fillna/astype keep NaN or numeric cells from raising inside ``_clean``.
    cleaned_text = data['text'].fillna('').astype(str).apply(_clean)
    # ``assign`` returns a new DataFrame, leaving the caller's object untouched.
    return data.assign(text=cleaned_text)

def split_data(data, r_state, test_size=0.2, val_size=0.25):
    """Split ``data`` into train, validation and test subsets.

    Two chained ``train_test_split`` calls are made with the same seed:

    1. ``test_size`` of the rows is held out; the rest is the training set.
    2. Of the held-out rows, ``val_size`` becomes the *test* set and the
       remainder becomes the *validation* set.

    With the defaults this yields roughly 80% train, 15% validation, 5% test.

    NOTE(review): the name ``val_size`` suggests it sizes the validation set,
    but it is forwarded as ``test_size`` of the second split. Confirm the
    intended proportions before changing this, since stored splits depend on it.

    Parameters
    ----------
    data : DataFrame (or any input accepted by ``train_test_split``)
    r_state : seed forwarded as ``random_state`` to both splits
    test_size : fraction of ``data`` held out from training
    val_size : fraction of the held-out rows assigned to the test set

    Returns
    -------
    tuple
        ``(train, validation, test)``
    """
    train_part, holdout = train_test_split(
        data, test_size=test_size, random_state=r_state
    )
    validation_part, test_part = train_test_split(
        holdout, test_size=val_size, random_state=r_state
    )
    return train_part, validation_part, test_part



def store_splits(train_data, validation_data, test_data, output_path):
    """Write the three splits as ``train.csv``, ``validation.csv`` and ``test.csv``.

    ``output_path`` is treated as a directory. It is created (including any
    missing parents) when it does not exist yet, and it works with or without
    a trailing path separator.

    Parameters
    ----------
    train_data, validation_data, test_data : pandas.DataFrame
        Frames to persist; the index is not written.
    output_path : str or os.PathLike
        Target directory. An empty string means the current directory.
    """
    # os.makedirs('') raises, and the current directory always exists anyway.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    splits = (
        ('train.csv', train_data),
        ('validation.csv', validation_data),
        ('test.csv', test_data),
    )
    for file_name, frame in splits:
        # os.path.join inserts the separator only when it is missing.
        frame.to_csv(os.path.join(output_path, file_name), index=False)


def label_dist(path, label_col="spam"):
    """Print the class counts of the label column of a stored split.

    Parameters
    ----------
    path : str
        CSV file to read (e.g. train.csv / validation.csv / test.csv).
    label_col : str, default "spam"
        Name of the column holding the target variable.

    Returns
    -------
    None
        The counts are printed, not returned, so a call placed last in a
        notebook cell does not render a second copy of the table.
    """
    split_frame = pd.read_csv(path)
    print(split_frame[label_col].value_counts())



# Load data

In [36]:
# Location of the raw e-mail CSV; update this if the dataset lives elsewhere.
file_path = r'Data/emails.csv'
data = load_data(file_path)


# Preprocess data

In [37]:
# Lower-case, tokenize, drop stop words and lemmatize the `text` column.
processed_data = preprocess_data(data)


# First split of data using seed=42

In [38]:
# Seed 42: 80% train; the held-out 20% is divided 75/25 into validation/test.
train_data, validation_data, test_data = split_data(
    processed_data,
    r_state=42,
    test_size=0.2,
    val_size=0.25,
)


# Store the split data

In [39]:
# Persist the seed-42 split as train.csv / validation.csv / test.csv.
store_splits(
    train_data,
    validation_data,
    test_data,
    output_path=r'Data/Splited_Data_Seed_42/',
)

## Initialize DVC to track the split data

In [40]:
!dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------


- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


Adding all three split CSV files to DVC

In [41]:
!dvc add Data/Splited_Data_Seed_42/train.csv
!dvc add Data/Splited_Data_Seed_42/validation.csv
!dvc add Data/Splited_Data_Seed_42/test.csv




⠋ Checking graph

To track the changes with git, run:











	git add 'Data\Splited_Data_Seed_42\.gitignore' 'Data\Splited_Data_Seed_42\train.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph


To track the changes with git, run:







	git add 'Data\Splited_Data_Seed_42\.gitignore' 'Data\Splited_Data_Seed_42\validation.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true



⠋ Checking graph


To track the changes with git, run:






	git add 'Data\Splited_Data_Seed_42\.gitignore' 'Data\Splited_Data_Seed_42\test.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


In [42]:
!dvc config core.autostage true

Adding a Google Drive folder as the remote data storage

In [43]:
!dvc remote add --default myremote gdrive://14FcFV3GhBnOIiSWJAKgCglVlrAtVXxqp

Setting 'myremote' as a default remote.


In [44]:
!dvc remote modify myremote gdrive_acknowledge_abuse true

Pushing the DVC-tracked files to remote storage

In [None]:
!dvc push

3 files pushed


In [None]:
!dvc status

Data and pipelines are up to date.


Checking out the 1st version (seed 42)

In [47]:
!dvc checkout Data/Splited_Data_Seed_42/train.csv.dvc
!dvc checkout Data/Splited_Data_Seed_42/test.csv.dvc
!dvc checkout Data/Splited_Data_Seed_42/validation.csv.dvc

### Distribution of the target variable in the split data
For the 1st split (random seed: 42)

In [48]:
train_path = r'Data/Splited_Data_Seed_42/train.csv'
val_path = r'Data/Splited_Data_Seed_42/validation.csv'
test_path = r'Data/Splited_Data_Seed_42/test.csv'

print("First Split (Random Seed: 42)")
# One heading + class-count table per stored split, in train/validation/test order.
for heading, csv_path in (
    ("\nTraining dataset:", train_path),
    ("\nValidation dataset:", val_path),
    ("\nTesting dataset:", test_path),
):
    print(heading)
    label_dist(csv_path)

First Split (Random Seed: 42)

Training dataset:
0    3504
1    1078
Name: spam, dtype: int64

Validation dataset:
0    645
1    214
Name: spam, dtype: int64

Testing dataset:
0    211
1     76
Name: spam, dtype: int64



# Second split of data using seed=476

In [50]:
# Seed 476: same proportions as the first split (80% train, hold-out divided 75/25).
train_data_2, validation_data_2, test_data_2 = split_data(
    processed_data,
    r_state=476,
    test_size=0.2,
    val_size=0.25,
)


# Store the split data

In [51]:
# Persist the seed-476 split as train.csv / validation.csv / test.csv.
store_splits(
    train_data_2,
    validation_data_2,
    test_data_2,
    output_path=r'Data/Splited_Data_Seed_476/',
)

Adding all three split CSV files to DVC

In [52]:
!dvc add Data/Splited_Data_Seed_476/train.csv
!dvc add Data/Splited_Data_Seed_476/validation.csv
!dvc add Data/Splited_Data_Seed_476/test.csv

⠋ Checking graph

⠋ Checking graph

⠋ Checking graph



In [53]:
!dvc config core.autostage true

Pushing the DVC-tracked files to remote storage

In [None]:
!dvc push

3 files pushed


In [None]:
!dvc status

Data and pipelines are up to date.


Checking out the 2nd version (seed 476)

In [56]:
!dvc checkout Data/Splited_Data_Seed_476/train.csv.dvc
!dvc checkout Data/Splited_Data_Seed_476/test.csv.dvc
!dvc checkout Data/Splited_Data_Seed_476/validation.csv.dvc

### Distribution of the target variable in the split data
For the 2nd split (random seed: 476)

In [57]:
train_path = r'Data/Splited_Data_Seed_476/train.csv'
val_path = r'Data/Splited_Data_Seed_476/validation.csv'
test_path = r'Data/Splited_Data_Seed_476/test.csv'

# This cell reports the seed-476 split, i.e. the second one; the banner
# previously said "First Split", carried over from the seed-42 cell.
print("Second Split (Random Seed: 476)")
# One heading + class-count table per stored split, in train/validation/test order.
for heading, csv_path in (
    ("\nTraining dataset:", train_path),
    ("\nValidation dataset:", val_path),
    ("\nTesting dataset:", test_path),
):
    print(heading)
    label_dist(csv_path)

First Split (Random Seed: 476)

Training dataset:
0    3503
1    1079
Name: spam, dtype: int64

Validation dataset:
0    635
1    224
Name: spam, dtype: int64

Testing dataset:
0    222
1     65
Name: spam, dtype: int64
