# Import Necessary Libraries

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Mount Google Drive

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

# Define Necessary Functions

In [8]:
# Function to load data from a given file path
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)

# Function to preprocess the data
def preprocess_data(data):
    """Return a copy of ``data`` whose 'text' column has been normalised.

    Each value in ``data['text']`` goes through the following steps, in order:
    lowercasing, tokenization with ``word_tokenize``, removal of English
    stopwords, lemmatization with ``WordNetLemmatizer`` and finally joining
    the remaining tokens back into a single space-separated string.

    The input frame is left untouched, so re-running the calling cell always
    starts from the raw text instead of re-processing already cleaned text.

    Args:
        data: DataFrame containing a 'text' column.

    Returns:
        A new DataFrame with the same columns as ``data`` and the cleaned
        'text' column. Missing text values become the empty string.

    Raises:
        KeyError: If ``data`` has no 'text' column.
    """
    # Build these once; they are reused for every row.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        """Apply the full cleaning pipeline to a single cell value."""
        if not isinstance(text, str):
            # Missing values (NaN/None) would otherwise crash on .lower().
            if pd.isna(text):
                return ''
            text = str(text)
        tokens = word_tokenize(text.lower())
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        )

    # Work on a copy so the caller's DataFrame is not mutated.
    processed = data.copy()
    processed['text'] = processed['text'].apply(_clean)
    return processed

# Function to split the data into train/validation/test sets
def split_data(data, r_state, test_size=0.2, val_size=0.25):
    """Split ``data`` into train, validation and test subsets.

    Two chained ``train_test_split`` calls are made, both seeded with
    ``r_state``:

    1. A fraction ``test_size`` of ``data`` is held out; the rest is the
       training set.
    2. The hold-out is split again with ``test_size=val_size``: the first
       part returned becomes the validation set and the second part (the
       ``val_size`` fraction) becomes the test set.

    NOTE(review): despite its name, ``val_size`` therefore controls the share
    of the hold-out that goes to the *test* set. With the defaults the result
    is roughly 80% train / 15% validation / 5% test. Confirm this is intended.

    Args:
        data: Dataset to split (anything ``train_test_split`` accepts).
        r_state: Value passed as ``random_state`` to both splits.
        test_size: Fraction of ``data`` held out from training.
        val_size: ``test_size`` argument of the second split.

    Returns:
        Tuple ``(train_data, validation_data, test_data)``.
    """
    # First cut: training rows versus a hold-out pool.
    train_part, holdout = train_test_split(
        data, test_size=test_size, random_state=r_state
    )
    # Second cut: divide the hold-out pool into validation and test rows.
    validation_part, test_part = train_test_split(
        holdout, test_size=val_size, random_state=r_state
    )
    return train_part, validation_part, test_part



# Function to store the splits at train.csv/validation.csv/test.csv
def store_splits(train_data, validation_data, test_data, output_path):
    """Write the three splits as CSV files inside the ``output_path`` directory.

    The files are named ``train.csv``, ``validation.csv`` and ``test.csv`` and
    are written without the DataFrame index.

    Args:
        train_data: DataFrame written to ``train.csv``.
        validation_data: DataFrame written to ``validation.csv``.
        test_data: DataFrame written to ``test.csv``.
        output_path: Target directory, with or without a trailing separator.
            It is created (including parents) if it does not exist yet. An
            empty string means the current working directory.
    """
    # Create the target folder up front so to_csv cannot fail on a missing
    # directory (e.g. a fresh 'Data_2/').
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    splits = {
        'train.csv': train_data,
        'validation.csv': validation_data,
        'test.csv': test_data,
    }
    for file_name, frame in splits.items():
        # os.path.join tolerates a missing trailing slash, unlike plain
        # string concatenation.
        frame.to_csv(os.path.join(output_path, file_name), index=False)

# Load data

In [12]:
# Relative path of the input CSV, resolved against the notebook's working directory.
file_path = r'Data/emails.csv'  # Update with the actual path
data = load_data(file_path)


# Preprocess data

In [13]:
# Normalise the 'text' column: lowercase, tokenize, drop stopwords, lemmatize, re-join.
processed_data= preprocess_data(data)


# Split data

In [14]:
# First version of the split (random_state=121): 20% of the rows are held out,
# and 25% of that hold-out is passed as the test_size of the second split.
train_data, validation_data, test_data = split_data(processed_data,r_state=121, test_size=0.2, val_size=0.25)


# Store Split Data

In [16]:
# Write train.csv, validation.csv and test.csv under Data/.
store_splits(train_data, validation_data, test_data, output_path=r'Data/')

In [17]:
# NOTE(review): the `dvc` module is not referenced by any cell in view; the DVC
# steps below go through the `!dvc` command line. Consider moving this import to
# the first cell or dropping it.
import dvc

In [18]:
# Initialise a DVC repository in the current working directory (one-off step).
!dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?


------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


Adding all three split CSV files to DVC

In [20]:
# Remove each split from git's index (if git was tracking it) before handing
# the file over to DVC.
# NOTE(review): in the recorded output every `git rm --cached` below fails with
# "pathspec ... did not match any files" and each commit reports that nothing
# was added, i.e. git was not tracking these CSVs when the cell ran.
!git rm -r --cached Data/train.csv
!git commit -m "stop tracking train.csv"
!git rm -r --cached Data/validation.csv
!git commit -m "stop tracking validation.csv"
!git rm -r --cached Data/test.csv
!git commit -m "stop tracking test.csv"

# Put the three split files under DVC control. Per the recorded output, DVC
# then asks for the matching Data\<name>.csv.dvc file and Data\.gitignore to
# be added to git.
!dvc add Data/train.csv
!dvc add Data/validation.csv
!dvc add Data/test.csv

fatal: pathspec 'Data/train.csv' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    .gitignore
	deleted:    emails.csv
	modified:   prepare.ipynb
	deleted:    test.csv.dvc
	deleted:    train.csv.dvc
	deleted:    validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	Data/
	mlruns/
	prep_dvc.ipynb
	prepare_SB.ipynb
	train_mlflow.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


fatal: pathspec 'Data/validation.csv' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    .gitignore
	deleted:    emails.csv
	modified:   prepare.ipynb
	deleted:    test.csv.dvc
	deleted:    train.csv.dvc
	deleted:    validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	Data/
	mlruns/
	prep_dvc.ipynb
	prepare_SB.ipynb
	train_mlflow.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


fatal: pathspec 'Data/test.csv' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    .gitignore
	deleted:    emails.csv
	modified:   prepare.ipynb
	deleted:    test.csv.dvc
	deleted:    train.csv.dvc
	deleted:    validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	Data/
	mlruns/
	prep_dvc.ipynb
	prepare_SB.ipynb
	train_mlflow.ipynb

no changes added to commit (use "git add" and/or "git commit -a")



⠋ Checking graph

To track the changes with git, run:











	git add 'Data\.gitignore' 'Data\train.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph







To track the changes with git, run:






	git add 'Data\validation.csv.dvc' 'Data\.gitignore'

To enable auto staging, run:

	dvc config core.autostage true



⠋ Checking graph

To track the changes with git, run:











	git add 'Data\.gitignore' 'Data\test.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


In [21]:
# Turn on DVC auto-staging, as suggested by the `dvc add` output above.
!dvc config core.autostage true

Adding a Google Drive folder as remote data storage

In [22]:
# Register a Google Drive folder as the DVC remote `myremote_4` and make it the default.
# NOTE(review): the Drive folder ID is hard-coded; anyone re-running this needs
# access to that folder or must substitute their own ID.
!dvc remote add --default myremote_4 gdrive://14FcFV3GhBnOIiSWJAKgCglVlrAtVXxqp

Setting 'myremote_4' as a default remote.


In [23]:
!dvc remote modify myremote_4 gdrive_acknowledge_abuse true

Pushing DVC-tracked files to remote storage

In [24]:
!dvc push

3 files pushed


In [25]:
!dvc status

Data and pipelines are up to date.


In [26]:
!dvc push

Everything is up to date.


Checking out the different versions of the data split

In [27]:
!git log

commit 7953ecbdfc9277f957f0c62c95485562efadfb6c
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:44:18 2024 +0530

    stop tracking train.csv

commit 10f8a696a6cd61f209b1e167a46129ac2e0ae1d1
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:31 2024 +0530

    stop tracking test.csv

commit 9bcdb7c4f409b89876e164f66e8614a3f6d0f262
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:31 2024 +0530

    stop tracking validation.csv

commit 9528539ada14ae2a3dc538d23cac849b9eee24de
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:30 2024 +0530

    stop tracking train.csv

commit 3305a71763c14a67ffda9f6c26eb4287ad9d671d


Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:14:47 2024 +0530

    stop tracking train.csv

commit 4328bb467eef37b7644efcc24e123e6cd64a9a87
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 23:53:57 2024 +0530

    stop tracking validation.csv

commit 698a4fdc086ab6e15135efc36625083d321f3b23
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:54:02 2024 +0530

    Assignment 2 Updates

commit bed73835f10b538ea57c1cffcd002be4637d4afc
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:46:24 2024 +0530

    Assignment 2 Updates

commit a9c739d3e4422cd78c2a7f23416f3745d299304e
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:43:46 2024 +0530

    Update

commit e990de5bf6a61e96b1e9caaf5911d8377b7aa95f
Merge: fbac1f5 0b7478d
Author: akashdas2110 <112683602+akashdas2110@users.noreply.github.com>
Date:   Sat Feb 17 14:35:20 2024 +0530

    Merge pull request #1 from akashdas2110/akashdas2110-patch-1
    
   

Check out the first version

In [28]:
# !git checkout fbac1f5a0a02afe132bed3cbc293ec2bde2d40f9 

In [29]:
# Ask DVC to sync the workspace copies of the three Data/ splits with their
# .dvc pointer files.
!dvc checkout Data/train.csv.dvc
!dvc checkout Data/test.csv.dvc
!dvc checkout Data/validation.csv.dvc

In [30]:
!git push

Everything up-to-date



# Split data (second version)

In [31]:
# Second split of the same preprocessed data: identical size parameters, but a
# different random_state (476 instead of 121).
train_data_2, validation_data_2, test_data_2 = split_data(processed_data,r_state=476, test_size=0.2, val_size=0.25)


# Store Split Data

In [32]:
# Write the second set of splits to Data_2/ so the first set in Data/ is kept.
store_splits(train_data_2, validation_data_2, test_data_2, output_path=r'Data_2/')

In [33]:
# NOTE(review): repeated import; `dvc` was already imported in an earlier cell
# and the module is not referenced by any cell in view.
import dvc

In [34]:
# !dvc init

Adding all three split CSV files to DVC

In [35]:
# Same procedure as for Data/, applied to the second set of splits in Data_2/.
# NOTE(review): in the recorded output every `git rm --cached` below fails with
# "pathspec ... did not match any files" and each commit reports that nothing
# was added, i.e. git was not tracking these CSVs when the cell ran.
!git rm -r --cached Data_2/train.csv
!git commit -m "stop tracking train.csv"
!git rm -r --cached Data_2/validation.csv
!git commit -m "stop tracking validation.csv"
!git rm -r --cached Data_2/test.csv
!git commit -m "stop tracking test.csv"

# Put the three Data_2/ split files under DVC control.
!dvc add Data_2/train.csv
!dvc add Data_2/validation.csv
!dvc add Data_2/test.csv

fatal: pathspec 'Data_2/train.csv' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config
	deleted:    .gitignore
	deleted:    emails.csv
	modified:   prepare.ipynb
	deleted:    test.csv.dvc
	deleted:    train.csv.dvc
	deleted:    validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	Data/
	Data_2/
	mlruns/
	prep_dvc.ipynb
	prepare_SB.ipynb
	train_mlflow.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


fatal: pathspec 'Data_2/validation.csv' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config
	deleted:    .gitignore
	deleted:    emails.csv
	modified:   prepare.ipynb
	deleted:    test.csv.dvc
	deleted:    train.csv.dvc
	deleted:    validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	Data/
	Data_2/
	mlruns/
	prep_dvc.ipynb
	prepare_SB.ipynb
	train_mlflow.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


fatal: pathspec 'Data_2/test.csv' did not match any files


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config
	deleted:    .gitignore
	deleted:    emails.csv
	modified:   prepare.ipynb
	deleted:    test.csv.dvc
	deleted:    train.csv.dvc
	deleted:    validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	Data/
	Data_2/
	mlruns/
	prep_dvc.ipynb
	prepare_SB.ipynb
	train_mlflow.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


⠋ Checking graph

⠋ Checking graph

⠋ Checking graph



In [36]:
!dvc config core.autostage true

Adding a Google Drive folder as remote data storage

In [37]:
# !dvc remote add --default myremote_4 gdrive://14FcFV3GhBnOIiSWJAKgCglVlrAtVXxqp

In [38]:
# !dvc remote modify myremote_4 gdrive_acknowledge_abuse true

Pushing DVC-tracked files to remote storage

In [39]:
!dvc push

3 files pushed


In [40]:
!dvc status

Data and pipelines are up to date.


In [41]:
!dvc push

Everything is up to date.


Checking out the different versions of the data split

In [42]:
!git log

commit 7953ecbdfc9277f957f0c62c95485562efadfb6c
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:44:18 2024 +0530

    stop tracking train.csv

commit 10f8a696a6cd61f209b1e167a46129ac2e0ae1d1
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:31 2024 +0530

    stop tracking test.csv

commit 9bcdb7c4f409b89876e164f66e8614a3f6d0f262
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:31 2024 +0530

    stop tracking validation.csv

commit 9528539ada14ae2a3dc538d23cac849b9eee24de
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:30 2024 +0530

    stop tracking train.csv

commit 3305a71763c14a67ffda9f6c26eb4287ad9d671d
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:14:47 2024 +0530



    stop tracking train.csv

commit 4328bb467eef37b7644efcc24e123e6cd64a9a87
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 23:53:57 2024 +0530

    stop tracking validation.csv

commit 698a4fdc086ab6e15135efc36625083d321f3b23
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:54:02 2024 +0530

    Assignment 2 Updates

commit bed73835f10b538ea57c1cffcd002be4637d4afc
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:46:24 2024 +0530

    Assignment 2 Updates

commit a9c739d3e4422cd78c2a7f23416f3745d299304e
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:43:46 2024 +0530

    Update

commit e990de5bf6a61e96b1e9caaf5911d8377b7aa95f
Merge: fbac1f5 0b7478d
Author: akashdas2110 <112683602+akashdas2110@users.noreply.github.com>
Date:   Sat Feb 17 14:35:20 2024 +0530

    Merge pull request #1 from akashdas2110/akashdas2110-patch-1
    
    ass_2

commit 0b7478d8c6b4ca9b1908e115c496c86515128648
Author: akashdas2110 <1126836

Check out the first version

In [43]:
# !git checkout fbac1f5a0a02afe132bed3cbc293ec2bde2d40f9 

In [44]:
# Ask DVC to sync the workspace copies of the three Data_2/ splits with their
# .dvc pointer files.
!dvc checkout Data_2/train.csv.dvc
!dvc checkout Data_2/test.csv.dvc
!dvc checkout Data_2/validation.csv.dvc

In [51]:
!git push

Everything up-to-date


In [52]:
!git log

commit 7953ecbdfc9277f957f0c62c95485562efadfb6c
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:44:18 2024 +0530

    stop tracking train.csv

commit 10f8a696a6cd61f209b1e167a46129ac2e0ae1d1
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:31 2024 +0530

    stop tracking test.csv

commit 9bcdb7c4f409b89876e164f66e8614a3f6d0f262
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:31 2024 +0530

    stop tracking validation.csv

commit 9528539ada14ae2a3dc538d23cac849b9eee24de
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:20:30 2024 +0530

    stop tracking train.csv

commit 3305a71763c14a67ffda9f6c26eb4287ad9d671d


Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sun Feb 18 18:14:47 2024 +0530

    stop tracking train.csv

commit 4328bb467eef37b7644efcc24e123e6cd64a9a87
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 23:53:57 2024 +0530

    stop tracking validation.csv

commit 698a4fdc086ab6e15135efc36625083d321f3b23
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:54:02 2024 +0530

    Assignment 2 Updates

commit bed73835f10b538ea57c1cffcd002be4637d4afc
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:46:24 2024 +0530

    Assignment 2 Updates

commit a9c739d3e4422cd78c2a7f23416f3745d299304e
Author: akashdas2110 <iamakash476@gmail.com>
Date:   Sat Feb 17 14:43:46 2024 +0530

    Update

commit e990de5bf6a61e96b1e9caaf5911d8377b7aa95f
Merge: fbac1f5 0b7478d
Author: akashdas2110 <112683602+akashdas2110@users.noreply.github.com>
Date:   Sat Feb 17 14:35:20 2024 +0530

    Merge pull request #1 from akashdas2110/akashdas2110-patch-1
    
   