## Installing Packages and Importing data

In [1]:

import re

import nltk
import pandas as pd
import tensorflow as tf
from keras import layers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amuly\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amuly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amuly\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Importing the data
data = pd.read_csv("../data/data_cleaned.csv")
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Transformation

This section contains the steps to transforming so that it will be easier to feed into the model

In [3]:
# Step 1: Remove Punctuation Marks and URLs
def remove_punctuation_and_urls(text):
    # Remove punctuation marks
    text = re.sub(r'[^\w\s]', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)

    return text


data['text'] = data['text'].apply(remove_punctuation_and_urls)

# Step 2: Remove Stop Words and Lowercase
stop_words = set(stopwords.words('english'))


def remove_stopwords_and_lowercase(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and lowercase the tokens
    filtered_tokens = [word.lower()
                       for word in tokens if word.lower() not in stop_words]

    return ' '.join(filtered_tokens)


data['text'] = data['text'].apply(remove_stopwords_and_lowercase)

# Step 3: Tokenization, Stemming, and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def tokenize_stem_lemmatize(text):
    # Tokenize the cleaned text
    tokens = word_tokenize(text)

    # Apply stemming and lemmatization
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]

    return ' '.join(lemmatized_tokens)


data['text'] = data['text'].apply(tokenize_stem_lemmatize)

In [4]:
data.head()

Unnamed: 0,class,text
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [5]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.apply(self._preprocess_text)
        return X_transformed

    def _preprocess_text(self, text):
        # Remove punctuation marks and URLs
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'http\S+|www.\S+', '', text)

        # Tokenize the text
        tokens = word_tokenize(text)

        # Remove stopwords and lowercase the tokens
        filtered_tokens = [word.lower()
                           for word in tokens if word.lower() not in self.stop_words]

        # Apply stemming and lemmatization
        stemmed_tokens = [self.stemmer.stem(word) for word in filtered_tokens]
        lemmatized_tokens = [self.lemmatizer.lemmatize(
            word) for word in stemmed_tokens]

        return ' '.join(lemmatized_tokens)

In [6]:
# Fit and transform the 'text' column using the pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', TextPreprocessor())
])

# Fit and transform the 'text' column using the pipeline
X_preprocessed = preprocessing_pipeline.fit_transform(data['text'])

In [7]:
# Transform the labels
enc = LabelEncoder()
data["class"] = enc.fit_transform(data["class"])
enc.classes_

array(['ham', 'spam'], dtype=object)

In [8]:
data["text"] = X_preprocessed

In [9]:
# splitting the data into features and labels 
features = data["text"]
labels = data["class"]

# splitting the data into train and test sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(features, labels, test_size=0.2,
                                                                              random_state=42)


## Modelling

### Tokenizing the data

In [10]:
max_vocab_length = 10000  # max number of words to have in our vocabulary
max_length = 15  # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = layers.TextVectorization(max_tokens=max_vocab_length,
                                           output_mode="int",
                                           output_sequence_length=max_length)

In [11]:
# Fit the vectorizer on train sentences
text_vectorizer.adapt(train_sentences)

In [12]:
# Create sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1,    1, 5383,    1,    1,  946,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

Create an Embedding layer

In [13]:
tf.random.set_seed(42)
embedding = layers.Embedding(input_dim=max_vocab_length,  # set input shape
                             output_dim=128,  # set size of embedding vector
                             embeddings_initializer="uniform",  # default, intialize randomly
                             input_length=max_length,  # how long is each input
                             name="embedding_1")

embedding

<keras.src.layers.core.embedding.Embedding at 0x2172d82fee0>

Create the first model

In [14]:
# Build model with the Functional API
inputs = layers.Input(shape=(1,), dtype="string")  # inputs are 1-dimensional strings
x = text_vectorizer(inputs)  # turn the input text into numbers
x = embedding(x)  # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(
    x)  # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(
    x)  # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")  # construct the model

In [15]:
# Compile model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [18]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [16]:
# Fit the model
model_1.fit(train_sentences,
            train_labels,
            epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2172d823640>

In [19]:
# Test on the test data
model_1.evaluate(test_sentences, test_labels)



[0.06188231706619263, 0.9758220314979553]

97% accuracy is pretty good so let's convert the above code into modular programming