In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scikit-learn preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.porter import PorterStemmer

# Visualization
import seaborn as sns

# String module
import string

import os

# Fetch the NLTK data this notebook relies on: the stopword lists read via
# stopwords.words('english') and the 'punkt' tokenizer data.
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' for
# word_tokenize / sent_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paulanwesha01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulanwesha01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def load_data(csv_file):
    """Load a CSV file into a DataFrame and print a short preview of it.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. ``None`` is returned (after printing the error)
        when the file is missing, empty, or cannot be parsed.
    """
    # Only the read itself can raise the errors handled here, so keep the
    # try block limited to it.
    try:
        df = pd.read_csv(csv_file)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error: {e}")
        return None

    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so cap the preview size for small (or empty) files.
    preview_rows = min(5, len(df))
    print(f"Sample of {preview_rows} rows from the DataFrame:")
    print(df.sample(preview_rows))

    print("\nShape of the DataFrame:", df.shape)

    return df


In [3]:
def data_cleaning(df):
    """Encode the label column and drop duplicate rows.

    The 'spam' column is label-encoded with ``LabelEncoder``, the per-column
    missing-value counts and the number of duplicate rows are printed, and
    duplicate rows are removed (the first occurrence is kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'spam' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded 'spam' column and duplicates removed.
    """
    # Work on a copy so the caller's frame is not silently altered: the
    # original code overwrote 'spam' on the input and then returned a
    # different (deduplicated) object, leaving two half-consistent frames.
    df = df.copy()

    # Encode the 'spam' column using LabelEncoder
    encoder = LabelEncoder()
    df['spam'] = encoder.fit_transform(df['spam'])

    # Report missing values per column
    print("Missing values:\n", df.isnull().sum())

    # Report the number of duplicate rows, then drop them
    print("\nDuplicate rows:", df.duplicated().sum())
    df = df.drop_duplicates(keep='first')

    # Report the shape of the DataFrame after cleaning
    print("Shape of the DataFrame after cleaning:", df.shape)

    return df

In [4]:
def EDA(df):
    """Print length statistics and plot their distributions per class.

    Adds the following columns to ``df`` in place: 'tokens',
    'stemmed_tokens', 'num_characters', 'num_words' and 'num_sentences'.
    Descriptive statistics of the three count columns are printed for the
    whole frame, for rows with ``spam == 0`` and for rows with ``spam == 1``,
    and one overlaid histogram per count column is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column and a 'spam' column holding 0/1.

    Returns
    -------
    None
        Any exception raised along the way is caught and reported with
        ``print`` rather than propagated.
    """
    count_columns = ['num_characters', 'num_words', 'num_sentences']

    # Tokenize the input text using nltk's word_tokenize.
    def tokenize_text(text):
        return nltk.word_tokenize(text)

    # Calculate the number of sentences in the input text.
    def calculate_num_sentences(text):
        return len(nltk.sent_tokenize(text))

    def visualize_distribution(df, column, xlabel):
        """Overlay the histograms of ``column`` for non-spam and spam rows."""
        plt.figure(figsize=(8, 4))

        sns.histplot(df[df['spam'] == 0][column], label='Non-Spam', color='steelblue', alpha=0.7)
        sns.histplot(df[df['spam'] == 1][column], label='Spam', color='red', alpha=0.7)

        plt.legend()
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        plt.title(f'Distribution of {xlabel} in Spam and Non-Spam Emails')

        plt.show()

    try:
        # One stemmer for the whole column instead of a new one per row.
        ps = PorterStemmer()

        # Tokenize and stem the text column
        df['tokens'] = df['text'].apply(tokenize_text)
        df['stemmed_tokens'] = df['tokens'].apply(
            lambda tokens: [ps.stem(token) for token in tokens]
        )

        # Calculate number of characters, words, and sentences.
        # The word count reuses the tokens computed above; re-tokenizing
        # every text would give the same number at twice the cost.
        df['num_characters'] = df['text'].apply(len)
        df['num_words'] = df['tokens'].apply(len)
        df['num_sentences'] = df['text'].apply(calculate_num_sentences)

        # Display descriptive statistics
        print("\nOverall Descriptive Statistics:")
        print(df[count_columns].describe())

        print("\nDescriptive Statistics for Non-Spam Emails:")
        print(df[df['spam'] == 0][count_columns].describe())

        print("\nDescriptive Statistics for Spam Emails:")
        print(df[df['spam'] == 1][count_columns].describe())

        # Visualize distributions
        visualize_distribution(df, 'num_characters', "Number of Characters")
        visualize_distribution(df, 'num_words', "Number of Words")
        visualize_distribution(df, 'num_sentences', "Number of Sentences")

    except Exception as e:
        # Best-effort by design: report the problem and keep the notebook going.
        print(f"Error occurred during EDA: {e}")


In [5]:
def data_preprocessing(df):
    """Add a normalized 'transformed_text' column built from 'text'.

    Each text is lowercased, tokenized with ``nltk.word_tokenize``, reduced
    to alphanumeric tokens that are not English stopwords, stemmed with
    ``PorterStemmer`` and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column of strings. The new column is added to
        this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying 'transformed_text'.
    """
    ps = PorterStemmer()

    # Load the stopword list once and keep it as a set: the original looked
    # it up again (and scanned it as a list) for every single token.
    english_stopwords = set(stopwords.words('english'))

    def transform_text(text):
        # Convert to lowercase, then tokenize
        tokens = nltk.word_tokenize(text.lower())

        # Keep alphanumeric, non-stopword tokens and stem them. A separate
        # string.punctuation check is unnecessary: a token that passed
        # isalnum() cannot consist of punctuation characters.
        return " ".join(
            ps.stem(token)
            for token in tokens
            if token.isalnum() and token not in english_stopwords
        )

    df['transformed_text'] = df['text'].apply(transform_text)
    return df


**NOTE:** Functions for the train-test split or the train-validation-test split are defined as nested functions inside the relevant functions in `train.ipynb`.