In [1]:
!pip install dvc

Collecting dvc
  Downloading dvc-3.59.1-py3-none-any.whl.metadata (18 kB)
Collecting celery (from dvc)
  Downloading celery-5.4.0-py3-none-any.whl.metadata (21 kB)
Collecting colorama>=0.3.9 (from dvc)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting configobj>=5.0.9 (from dvc)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting dpath<3,>=2.1.0 (from dvc)
  Downloading dpath-2.2.0-py3-none-any.whl.metadata (15 kB)
Collecting dulwich (from dvc)
  Downloading dulwich-0.22.8-py3-none-any.whl.metadata (4.9 kB)
Collecting dvc-data<3.17,>=3.16.2 (from dvc)
  Downloading dvc_data-3.16.9-py3-none-any.whl.metadata (5.0 kB)
Collecting dvc-http>=2.29.0 (from dvc)
  Downloading dvc_http-2.32.0-py3-none-any.whl.metadata (1.3 kB)
Collecting dvc-objects (from dvc)
  Downloading dvc_objects-5.1.0-py3-none-any.whl.metadata (3.7 kB)
Collecting dvc-render<2,>=1.0.1 (from dvc)
  Downloading dvc_render-1.0.2-py3-none-any.whl.metadata (5.4 kB)
Collect

In [2]:
!git init
!dvc init

Initialized empty Git repository in /Users/ananyasinha/Desktop/AML/AppliedMachineLearning/Assignment 2/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mht

In [3]:
import os
import pandas as pd

# Define paths
os.makedirs("data/raw", exist_ok=True)
raw_data_path = os.path.join("data", "raw", "raw_data.csv")

def load_and_save_raw_data(file_path, save_path):
    df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'text'])
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})  # Convert labels to binary (0/1)
    
    # Save raw data
    df.to_csv(save_path, index=False)
    return df

data_file_path = os.path.join("data", "raw", "SMSSpamCollection.txt")  # Update path if needed
df = load_and_save_raw_data(data_file_path, raw_data_path)

print(f"Raw data saved at: {raw_data_path}")

Raw data saved at: data/raw/raw_data.csv


In [4]:
from sklearn.model_selection import train_test_split

# Create directories
os.makedirs("data/processed", exist_ok=True)

# Define paths
train_path = os.path.join("data", "processed", "train.csv")
val_path = os.path.join("data", "processed", "val.csv")
test_path = os.path.join("data", "processed", "test.csv")

def split_and_save_data(df, train_path, val_path, test_path, seed=42):
    X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['label'], test_size=0.3, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed)

    pd.concat([X_train, y_train], axis=1).to_csv(train_path, index=False)
    pd.concat([X_val, y_val], axis=1).to_csv(val_path, index=False)
    pd.concat([X_test, y_test], axis=1).to_csv(test_path, index=False)

    print(f"Data split completed with random state {seed}")

split_and_save_data(df, train_path, val_path, test_path, seed=42)

Data split completed with random state 42


In [5]:
!dvc add data/processed/train.csv
!dvc add data/processed/val.csv
!dvc add data/processed/test.csv

!git add data/processed/train.csv.dvc
!git add data/processed/val.csv.dvc
!git add data/processed/test.csv.dvc
!git add .gitignore

!git commit -m "Initial data split with random_state=42"

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/processed/train.csv |0.00 [00:00, [A
                                                                                [A
![A
  0% Checking cache in '/Users/ananyasinha/Desktop/AML/AppliedMachineLearning/As[A
                                                                                [A
![A
  0%|          |Adding data/processed/train.csv to cac0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /Users/ananyasinha/Deskto0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 14.37file/s][A

To track the changes with git, run:

	git add data/processed/train.csv.dvc data/processed/.gitignore

To enable auto staging, run:

	dvc config c

In [6]:
def print_target_distribution(file_path):
    df = pd.read_csv(file_path)
    print(f"\nDistribution in {file_path}:")
    print(df['label'].value_counts())

print_target_distribution(train_path)
print_target_distribution(val_path)
print_target_distribution(test_path)


Distribution in data/processed/train.csv:
label
0    3377
1     523
Name: count, dtype: int64

Distribution in data/processed/val.csv:
label
0    724
1    112
Name: count, dtype: int64

Distribution in data/processed/test.csv:
label
0    724
1    112
Name: count, dtype: int64


In [None]:
# Change the random seed and update the dataset
new_seed = 4200
split_and_save_data(df, train_path, val_path, test_path, seed=new_seed)

# Track new version
!dvc add data/processed/train.csv
!dvc add data/processed/val.csv
!dvc add data/processed/test.csv

!git add data/processed/train.csv.dvc
!git add data/processed/val.csv.dvc
!git add data/processed/test.csv.dvc

!git commit -m "Updated data split with random_state=4200"

Data split completed with random state 4200
[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/processed/train.csv |0.00 [00:00, [A
                                                                                [A
![A
  0% Checking cache in '/Users/ananyasinha/Desktop/AML/AppliedMachineLearning/As[A
                                                                                [A
![A
  0%|          |Adding data/processed/train.csv to cac0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /Users/ananyasinha/Deskto0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 32.77file/s][A

To track the changes with git, run:

	git add data/processed/train.csv.dvc

To enable auto staging, r

In [8]:
# Checkout the first version (random_state=42)
!git checkout HEAD~1  # Moves one commit back
!dvc checkout

# Print the distribution of the first version
print_target_distribution(train_path)
print_target_distribution(val_path)
print_target_distribution(test_path)

# Checkout the updated version (random_state=4200)
!git checkout main
!dvc checkout

# Print the distribution of the updated version
print_target_distribution(train_path)
print_target_distribution(val_path)
print_target_distribution(test_path)

Note: switching to 'HEAD~1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at b71c80a Initial data split with random_state=42
Building workspace index                              |5.00 [00:00,  245entry/s]
Comparing indexes                                    |6.00 [00:00, 1.08kentry/s]
Applying changes                                      |3.00 [00:00, 1.05kfile/s]
[33mM[0m       data/processed/test.csv
[33mM[0m       data/processed/train.csv
[33mM[0m       data/processed/val.csv
[0m
Distribution in data

In [9]:
!dvc status

Data and pipelines are up to date.                                              
[0m

In [10]:
!git add prepare.ipynb
!git commit -m "Added data preparation notebook"

[main e6d9d47] Added data preparation notebook
 Committer: Ananya Sinha <ananyasinha@Ananyas-MacBook-Pro.local>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 1 file changed, 377 insertions(+)
 create mode 100644 prepare.ipynb
