In [2]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [3]:
def load_data(csv_file):
    """
    Load a CSV file into a DataFrame and print a short preview of it.

    Parameters
    ----------
    csv_file : str, path-like or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. ``None`` is returned (after printing the error)
        when the file is missing, empty, or cannot be parsed.
    """
    # Only the read itself is guarded: these are the errors read_csv raises
    # for a missing, empty, or malformed file.
    try:
        df = pd.read_csv(csv_file)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error: {e}")
        return None

    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so cap the preview at the number of rows available.
    preview_rows = min(5, len(df))

    print("Sample of 5 rows from the DataFrame:")
    print(df.sample(preview_rows))

    print("\nShape of the DataFrame:", df.shape)

    return df

In [4]:
def split_data(raw_data, random_state=101, test_size=0.15, val_size=0.15):
    """
    Split the data into Training, Validation, and Test sets.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must expose a ``text`` column (features) and a ``spam`` column (labels).
    random_state : int, default 101
        Seed forwarded to both ``train_test_split`` calls.
    test_size : float or int, default 0.15
        Size of the test set, taken from the full data set.
    val_size : float or int, default 0.15
        Size of the validation set, taken from what remains AFTER the test
        split. With the defaults this is 0.15 * 0.85 = 12.75% of all rows.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # First cut: hold out the test set from the full data.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        raw_data.text,
        raw_data['spam'],
        test_size=test_size,
        random_state=random_state,
    )
    # Second cut: carve the validation set out of the remaining rows.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=val_size,
        random_state=random_state,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [5]:
def save_data(X_train, X_val, X_test, y_train, y_val, y_test):
    """
    Save the split datasets to CSV files under ``Data/``.

    Each split is written as a two-column CSV (``text``, ``spam``) without
    the index:

    * ``Data/Validation Data.csv``
    * ``Data/Test Data.csv``
    * ``Data/Training Data.csv``

    Parameters
    ----------
    X_train, X_val, X_test : iterable
        Text values of each split.
    y_train, y_val, y_test : iterable
        Labels of each split, paired positionally with the matching ``X_*``.
    """
    output_dir = 'Data'
    # to_csv does not create missing parent directories; without this a
    # fresh checkout fails with OSError on the first write.
    os.makedirs(output_dir, exist_ok=True)

    # Same write order as before: validation, test, training.
    splits = {
        'Validation Data.csv': (X_val, y_val),
        'Test Data.csv': (X_test, y_test),
        'Training Data.csv': (X_train, y_train),
    }
    for file_name, (texts, labels) in splits.items():
        frame = pd.DataFrame(list(zip(texts, labels)), columns=["text", 'spam'])
        frame.to_csv(os.path.join(output_dir, file_name), index=False)

In [6]:
def git_dvc_setup():
    """
    Set up Git and DVC to track data.
    """
    ! pip install dvc
    ! git init
    ! dvc init

In [None]:
def track_data(commit_message):
    """
    Track data using DVC and Git.
    """
    ! dvc add Data
    ! git add Data.dvc
    ! git commit -m "{commit_message}"
    ! dvc remote add -d storage gdrive://1egzIHd_vdcJDZBtnH3VlRhVcnwazkPvs
    ! dvc push

In [7]:
def checkout_version(version_number):
    """
    Checkout a specific version of the data using DVC.

    Restores ``Data.dvc`` from Git ``HEAD`` and then runs ``dvc checkout``
    (presumably to bring the ``Data`` directory in line with that pointer
    file). Runs shell commands, so it only works under IPython.

    NOTE(review): ``version_number`` is accepted but never used. The Git
    revision is hard-coded to ``HEAD``, so every call checks out the same
    version regardless of the argument. The intended mapping from
    ``version_number`` to a Git revision is not visible here -- confirm and
    wire it in.
    """
    ! git checkout HEAD Data.dvc 
    ! dvc checkout

In [9]:
def load_and_check_data(train_path, val_path, test_path):
    '''
    Load data from CSV files and print the counts of each label in the training, validation, and test sets.

    Parameters
    ----------
    train_path, val_path, test_path : str or path-like
        CSV files that each contain a ``spam`` column, where 0 marks
        non-spam and 1 marks spam.

    Returns
    -------
    None
        The counts are printed; the loaded frames are not returned.
    '''
    splits = (
        ('Train', train_path),
        ('Validation', val_path),
        ('Test', test_path),
    )
    for split_name, csv_path in splits:
        print(f'\n{split_name} set:')
        labels = pd.read_csv(csv_path)['spam']
        # Count by comparison rather than value_counts()[label]: the latter
        # raises KeyError when a split contains no rows of one class.
        non_spam_count = int((labels == 0).sum())
        spam_count = int((labels == 1).sum())
        print('no. of non-spam:', non_spam_count, '\nno. of spam:', spam_count)