In [1]:
import csv
from pathlib import Path

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
def load_data(path):
    """Read a header-less, tab-separated label/message file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the file. Each line holds a label, a tab character,
        and the message text.

    Returns
    -------
    pandas.DataFrame
        Two columns named ``label`` and ``message``, one row per input line.
    """
    df = pd.read_csv(
        path,
        delimiter='\t',
        names=['label', 'message'],
        # The input is raw text, not quoted CSV. With the default quote
        # handling, a message that starts with a double quote opens a quoted
        # field that swallows the following lines, silently merging rows.
        quoting=csv.QUOTE_NONE,
    )
    return df
    

def preprocess_data(df):
    """Return a copy of ``df`` whose ``label`` column is numerically encoded.

    ``'ham'`` becomes ``0`` and ``'spam'`` becomes ``1``; any other value
    ends up as NaN because it has no entry in the mapping. The input frame
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and the encoded ``label`` column.
    """
    label_codes = {'ham': 0, 'spam': 1}
    # assign() builds a new frame, so the caller's frame is never mutated.
    return df.assign(label=df['label'].map(label_codes))
    

def split_data(df, holdout_size=0.3, test_share=0.5, random_state=42):
    """Split ``df`` into train, validation and test frames, stratified on ``label``.

    The split is done in two stages: first a holdout portion is separated
    from the training rows, then that holdout is divided into validation
    and test rows. With the defaults this gives a 70/15/15 split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column used for stratification.
    holdout_size : float, default 0.3
        Fraction of ``df`` set aside for validation and test together.
    test_share : float, default 0.5
        Fraction of the holdout rows that goes to the test frame; the
        remainder becomes the validation frame.
    random_state : int, default 42
        Seed passed to both splitters so the result is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``; rows keep the index values they had in ``df``.
    """
    holdout_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=holdout_size, random_state=random_state
    )
    # n_splits=1 yields exactly one (train, test) pair of positional indices.
    train_idx, holdout_idx = next(holdout_splitter.split(df, df["label"]))
    train = df.iloc[train_idx]
    holdout = df.iloc[holdout_idx]

    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_share, random_state=random_state
    )
    # Indices from this second split are positions within ``holdout``, not ``df``.
    val_idx, test_idx = next(eval_splitter.split(holdout, holdout["label"]))
    val = holdout.iloc[val_idx]
    test = holdout.iloc[test_idx]

    return train, val, test


def save_splits(train, val, test, output_dir='.'):
    """Write the three splits to CSV files without the index column.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames written to ``train.csv``, ``validation.csv`` and ``test.csv``
        respectively.
    output_dir : str or path-like, default '.'
        Directory that receives the files. It is created if it does not
        exist. The default keeps the files in the current working directory.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    outputs = (
        ('train.csv', train),
        ('validation.csv', val),
        ('test.csv', test),
    )
    for filename, frame in outputs:
        frame.to_csv(target_dir / filename, index=False)

In [3]:
# Run the pipeline end to end: load -> encode labels -> stratified split -> save.
loaded_df = load_data('SMSSpamCollection')
preprocessed_df = preprocess_data(loaded_df)

# Labels other than 'ham'/'spam' have no entry in the encoding map and come
# out as NaN; stop here rather than stratifying on missing values.
assert preprocessed_df['label'].notna().all(), "unexpected label values in input file"

train, val, test = split_data(preprocessed_df)

# The three partitions together must account for every preprocessed row.
assert len(train) + len(val) + len(test) == len(preprocessed_df), "splits do not cover every row"

save_splits(train, val, test)