In prepare.ipynb, write functions to:
- load the data from a given file path
- preprocess the data (if needed)
- split the data into train/validation/test sets
- store the splits as train.csv, validation.csv and test.csv


In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk


def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)

def preprocess_data(data):
    """Normalise the ``text`` column of ``data``.

    Every entry is lowercased, tokenised with ``word_tokenize``, lemmatised
    token by token with ``WordNetLemmatizer`` and joined back into one
    space-separated string. After that, every character that is neither a
    word character nor whitespace is removed, followed by all digit runs.

    The ``text`` column is overwritten on the DataFrame that was passed in,
    and that same DataFrame is returned.

    NOTE(review): this function does not download any NLTK resources; the
    tokenizer and WordNet data are presumably installed elsewhere -- confirm.

    Args:
        data: DataFrame with a ``text`` column of strings.

    Returns:
        ``data`` itself, with the cleaned ``text`` column.
    """
    lemmatizer = WordNetLemmatizer()

    def _normalise(text):
        # Lowercase -> tokenise -> lemmatise -> re-join, done in one pass
        # per row instead of four separate passes over the column.
        tokens = word_tokenize(text.lower())
        return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

    cleaned = data['text'].apply(_normalise)

    # Raw strings are required here: '\w', '\s' and '\d' are not valid Python
    # string escapes and trigger a SyntaxWarning on recent interpreters.
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)  # punctuation
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)  # numbers

    # Assign once at the end so a failure above cannot leave the column
    # half-processed (e.g. as token lists).
    data['text'] = cleaned
    return data

def split_data(data, holdout_size=0.3, test_share=0.5, random_state=42):
    """Split ``data`` into train, validation and test DataFrames.

    The split is done in two stratified steps with ``train_test_split``:
    first ``holdout_size`` of the rows is set aside, then that hold-out part
    is divided into validation and test, with ``test_share`` of it going to
    the test set. With the defaults this is the original 0.3 / 0.5 split
    using seed 42.

    Args:
        data: DataFrame with a ``text`` column (features) and a ``spam``
            column (labels used for stratification).
        holdout_size: Passed as ``test_size`` to the first split; the part
            of ``data`` reserved for validation and test together.
        test_share: Passed as ``test_size`` to the second split; the part
            of the hold-out set that becomes the test set.
        random_state: Seed passed to both splits.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of DataFrames, each
        with the columns ``text`` and ``label`` (the ``spam`` values).
    """
    features = data['text']
    labels = data['spam']

    # Step 1: separate the training rows from the hold-out rows.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features,
        labels,
        test_size=holdout_size,
        random_state=random_state,
        stratify=labels,
    )

    # Step 2: divide the hold-out rows into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=test_share,
        random_state=random_state,
        stratify=y_holdout,
    )

    train_data = pd.DataFrame({'text': X_train, 'label': y_train})
    val_data = pd.DataFrame({'text': X_val, 'label': y_val})
    test_data = pd.DataFrame({'text': X_test, 'label': y_test})
    return train_data, val_data, test_data

def store_data(train_data, val_data, test_data):
    """Save the three splits as CSV files.

    The files are written, in this order, to the relative paths
    ``train.csv``, ``validation.csv`` and ``test.csv``. The row index is not
    written.

    Args:
        train_data: DataFrame saved to ``train.csv``.
        val_data: DataFrame saved to ``validation.csv``.
        test_data: DataFrame saved to ``test.csv``.
    """
    destinations = (
        ('train.csv', train_data),
        ('validation.csv', val_data),
        ('test.csv', test_data),
    )
    for filename, frame in destinations:
        frame.to_csv(filename, index=False)

# Read the raw e-mails from emails.csv and clean their text column.
data = preprocess_data(load_data('emails.csv'))

# Divide the cleaned data into train / validation / test subsets.
train_data, val_data, test_data = split_data(data)

# Write the subsets to train.csv, validation.csv and test.csv.
store_data(train_data, val_data, test_data)
