In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
!git init
!dvc init

Reinitialized existing Git repository in D:/AppliedML/Assignment2/.git/
Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/treeverse/dvc>


In [5]:
import urllib.request
import zipfile

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = "sms.zip"

urllib.request.urlretrieve(url, zip_path)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall("data_raw")

In [2]:
data = pd.read_csv(
    "data_raw/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"]
)

data["label"] = data["label"].map({"ham": 0, "spam": 1})

os.makedirs("Data", exist_ok=True)
data.to_csv("Data/raw_data.csv", index=False)

data.head()


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:

import re


def load_data(path):
    df = pd.read_csv(path, sep="\t", header=None, names=["label", "text"])
    return df


def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def preprocess(df):
    df = df.copy()
    df["text"] = df["text"].apply(clean_text)
    df["label"] = df["label"].map({"ham": 0, "spam": 1})
    return df


In [28]:
def split_and_save(df, seed=42):
    train, temp = train_test_split(
        df, train_size=0.7, random_state=seed
    )

    val, test = train_test_split(
        temp,
        train_size=0.5,
        random_state=seed,
    )

    os.makedirs("Data", exist_ok=True)

    df.to_csv("Data/raw_data.csv", index=False)
    train.to_csv("Data/train.csv", index=False)
    val.to_csv("Data/validation.csv", index=False)
    test.to_csv("Data/test.csv", index=False)

    return train, val, test


In [29]:
df = load_data("Data/SMSSpamCollection")
df = preprocess(df)

train_df, val_df, test_df = split_and_save(df, seed=77)

print("Version 1 created")


Version 1 created


In [30]:
!dvc add Data/raw_data.csv
!dvc add Data/train.csv Data/validation.csv Data/test.csv

!git add .
!git commit -m "dataset version 1"


To track the changes with git, run:

	git add 'Data\raw_data.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph




To track the changes with git, run:

	git add 'Data\train.csv.dvc' 'Data\test.csv.dvc' 'Data\validation.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph

The file will have its original line endings in your working directory


[master f53eeee] dataset version 1
 4 files changed, 210 insertions(+), 30 deletions(-)


In [31]:
train_df, val_df, test_df = split_and_save(df, seed=21)

print("Version 2 created")

Version 2 created


In [32]:
!dvc add Data/train.csv Data/validation.csv Data/test.csv
!git add .
!git commit -m "dataset version 2"


To track the changes with git, run:

	git add 'Data\test.csv.dvc' 'Data\train.csv.dvc' 'Data\validation.csv.dvc'

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph

The file will have its original line endings in your working directory


[master 337d0fa] dataset version 2
 4 files changed, 42 insertions(+), 43 deletions(-)


In [33]:
!dvc status

Data and pipelines are up to date.


In [34]:
!git stash
!git checkout HEAD~1
!dvc checkout

Saved working directory and index state WIP on master: 337d0fa dataset version 2


The file will have its original line endings in your working directory
Note: switching to 'HEAD~1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at f53eeee dataset version 1


M       Data\test.csv
M       Data\train.csv
M       Data\validation.csv


In [35]:
for f in ["train", "validation", "test"]:
    d = pd.read_csv(f"Data/{f}.csv")
    print("OLD", f, d["label"].value_counts().to_dict())

OLD train {0: 3389, 1: 511}
OLD validation {0: 713, 1: 123}
OLD test {0: 723, 1: 113}


In [36]:
!git stash

Saved working directory and index state WIP on (no branch): f53eeee dataset version 1


The file will have its original line endings in your working directory


In [37]:
!git checkout master
!dvc checkout

Previous HEAD position was f53eeee dataset version 1
Switched to branch 'master'


M       Data\test.csv
M       Data\train.csv
M       Data\validation.csv


In [38]:
for f in ["train", "validation", "test"]:
    d = pd.read_csv(f"Data/{f}.csv")
    print("NEW", f, d["label"].value_counts().to_dict())

NEW train {0: 3374, 1: 526}
NEW validation {0: 723, 1: 113}
NEW test {0: 728, 1: 108}
