# Import Necessary Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources the preprocessing step relies on: the Punkt
# tokenizer models (word_tokenize), the English stop-word list and WordNet
# (WordNetLemmatizer).
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the tokenizer from 'punkt_tab' and raise LookupError without it; on an
# older NLTK that does not know this package, the call only reports that it was
# not found and the remaining downloads still run.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Mount Google Drive

In [2]:
# Colab-specific step: mount Google Drive at /content/drive so that the
# input CSV and the output splits referenced by the cells below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define Necessary Functions

In [3]:
# Function to load data from a given file path
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV; handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``read_csv`` defaults.
    """
    return pd.read_csv(file_path)

# Function to preprocess the data
def preprocess_data(data):
    """Return a copy of ``data`` whose 'text' column has been normalised.

    Each entry of the 'text' column is lower-cased, tokenised with
    ``word_tokenize``, stripped of English stop words, lemmatised with
    ``WordNetLemmatizer`` and finally re-joined into a single
    space-separated string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'text' column. It is NOT modified; the work is
        done on a copy so the raw frame stays available and re-running the
        cell gives the same result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data`` and the
        cleaned 'text' column.

    Raises
    ------
    KeyError
        If ``data`` has no 'text' column.
    """
    # Built once up front: the stop-word set and lemmatizer are reused for every row.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        # Lower-case -> tokenise -> drop stop words -> lemmatise -> re-join.
        tokens = word_tokenize(text.lower())
        return ' '.join(
            lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
        )

    processed = data.copy()
    # Missing values become empty strings (and any non-string value is
    # stringified) so that .lower() cannot fail on NaN/None entries.
    processed['text'] = processed['text'].fillna('').astype(str).apply(_clean)
    return processed

# Function to split the data into train/validation/test sets
def split_data(data, test_size=0.2, val_size=0.25, random_state=42):
    """Split ``data`` into train, validation and test subsets.

    The test set is held out first, then the validation set is carved out of
    what remains. With the defaults this yields 60% train, 20% validation
    and 20% test (0.25 of the remaining 80% is 20% of the whole).

    Previously ``val_size`` was passed as the *test* share of a 20% holdout,
    so the defaults produced 80% / 15% / 5% and neither parameter described
    the fraction its name suggests.

    Parameters
    ----------
    data
        Any object accepted by ``train_test_split`` (e.g. a DataFrame).
    test_size
        Size of the test set relative to the whole of ``data``; forwarded as
        ``test_size`` to the first ``train_test_split`` call.
    val_size
        Size of the validation set relative to the data left after the test
        set has been removed; forwarded as ``test_size`` to the second call.
    random_state
        Seed used for both splits so the partition is reproducible.

    Returns
    -------
    tuple
        ``(train_data, validation_data, test_data)``.
    """
    # Hold out the test set first so test_size refers to the full data set.
    remaining_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    # Split the remainder into train and validation; val_size is relative to it.
    train_data, validation_data = train_test_split(
        remaining_data, test_size=val_size, random_state=random_state
    )
    return train_data, validation_data, test_data



# Function to store the splits at train.csv/validation.csv/test.csv
def store_splits(train_data, validation_data, test_data, output_path):
    """Write the three splits as CSV files inside ``output_path``.

    The files are named ``train.csv``, ``validation.csv`` and ``test.csv``
    and are written without the index column.

    Parameters
    ----------
    train_data, validation_data, test_data : pandas.DataFrame
        The splits to persist.
    output_path : str or os.PathLike
        Target directory. A trailing separator is optional, and the
        directory is created if it does not exist yet. An empty string
        means the current working directory.
    """
    # An empty path means "current directory"; os.makedirs('') would raise.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    named_splits = (
        ('train.csv', train_data),
        ('validation.csv', validation_data),
        ('test.csv', test_data),
    )
    for file_name, split in named_splits:
        # os.path.join inserts the separator only when needed, so both
        # '/some/dir' and '/some/dir/' end up writing inside the directory.
        split.to_csv(os.path.join(output_path, file_name), index=False)

# Load data

In [4]:
# Location of the raw e-mail CSV on the mounted Drive; edit this if the file
# lives somewhere else, then read it into a DataFrame.
file_path = '/content/drive/MyDrive/Applied_ML_A1/emails.csv'  # Update with the actual path
data = load_data(file_path)


# Preprocess data

In [5]:
# Normalise the 'text' column of the loaded frame (see preprocess_data).
processed_data = preprocess_data(data)


# Split data

In [6]:
# Partition the preprocessed frame into the three subsets; the fractions are
# passed explicitly so they are visible at the call site.
train_data, validation_data, test_data = split_data(processed_data, test_size=0.2, val_size=0.25)


# Store Split Data

In [7]:
# Persist the three splits as train.csv / validation.csv / test.csv in the
# project folder on the mounted Drive.
store_splits(train_data, validation_data, test_data, output_path='/content/drive/MyDrive/Applied_ML_A1/')