In [None]:
import pandas as pd
# Load spam.csv into a DataFrame, decoding the file as ISO-8859-1.
data = pd.read_csv("spam.csv",encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the freshly loaded DataFrame.
data.head()

In [None]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
data.isna().sum()

# Remove Punctuation

In [None]:
import string
# Display the characters that the punctuation-removal step filters out.
string.punctuation

In [None]:
punctuationfree = ' '  # module-level placeholder; remove_punctuation never reads it


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` dropped.

    Args:
        text: Iterable of characters (normally a ``str``).

    Returns:
        A new string made of the characters that are not punctuation.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [None]:
# Preview the DataFrame again; no column has been added since the load.
data.head()

# Lowercase Conversion

In [None]:
# Strip punctuation from the raw 'v2' messages, then lowercase the stripped text.
# The functions are passed directly; wrapping them in a lambda adds nothing.
data['v2_no_punctuation'] = data['v2'].apply(remove_punctuation)
data['msg_lower'] = data['v2_no_punctuation'].apply(str.lower)

In [None]:
# Preview with the new 'v2_no_punctuation' and 'msg_lower' columns.
data.head()

# Word Tokenization

In [None]:
import nltk
def tokenization(text):
  """Split ``text`` into a list of tokens with ``nltk.word_tokenize``."""
  return nltk.word_tokenize(text)

In [None]:
# Fetch the 'punkt_tab' resource before word_tokenize is first called below.
nltk.download('punkt_tab')

In [None]:
# Tokenize each lowercased message; the column name spelling is kept because later cells read it.
data['msg_tokenzied'] = data['msg_lower'].apply(tokenization)

In [None]:
# Preview with the new 'msg_tokenzied' column of token lists.
data.head()

# Removal of Stop Words

In [None]:
# Fetch the stop-word corpus read in the next cell.
nltk.download('stopwords')

In [None]:
# English stop-word list consulted by remove_stopwords.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_stopwords(text):
  """Return the tokens of ``text`` that do not appear in the global ``stopwords``.

  Args:
    text: Iterable of tokens.

  Returns:
    A list of the retained tokens, in their original order.
  """
  kept_tokens = []
  for token in text:
    if token not in stopwords:
      kept_tokens.append(token)
  return kept_tokens

In [None]:
# Drop stop words from each token list.
data['no_stopwords'] = data['msg_tokenzied'].apply(remove_stopwords)

In [None]:
# Preview with the new 'no_stopwords' column.
data.head()

In [None]:
#print(stopwords)

# Stemming

In [None]:
from nltk.stem import PorterStemmer
# Shared stemmer instance used by the stemming function below.
ps = PorterStemmer()

In [None]:
def stemming(text):
  """Return a list holding the Porter stem (via ``ps``) of each token in ``text``."""
  return list(map(ps.stem, text))

In [None]:
# Stem every token that survived stop-word removal.
data['msg_stemmed'] = data['no_stopwords'].apply(stemming)

In [None]:
# Preview with the new 'msg_stemmed' column.
data.head()

# Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the WordNet data before the lemmatizer is first used.
nltk.download('wordnet')

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()


def lemma(text):
  """Return a list with ``wordnet_lemmatizer.lemmatize`` applied to each token of ``text``."""
  lemmatized_tokens = []
  for token in text:
    lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token))
  return lemmatized_tokens

In [None]:
# Lemmatize the already-stemmed tokens (input column is 'msg_stemmed').
data['msg_lemmatized'] = data['msg_stemmed'].apply(lemma)

In [None]:
# Preview with the new 'msg_lemmatized' column.
data.head()

In [None]:
# Re-join each token list into one space-separated string for the vectorizers.
data['clean_txt'] = data['msg_lemmatized'].apply(" ".join)

In [None]:
# Show only the first row, which now includes the 'clean_txt' column.
data.head(1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words vectorizer on every cleaned message and encode them in one step.
count_vectorizer = CountVectorizer()
cout_vec = count_vectorizer.fit_transform(data['clean_txt'])

In [None]:
# `vocabulary` is only the constructor argument (left unset above, so it is None).
# The term -> column-index mapping learned by fit_transform lives in `vocabulary_`.
count_vectorizer.vocabulary_

In [None]:
# Densify only the first few rows for display; converting the whole sparse
# document-term matrix allocates documents x vocabulary cells just to be truncated on screen.
cout_vec[:5].toarray()

# BoW Example

In [None]:
documents = ["This is the first document.",
             "This document is the second document.",
             "And this is the third "]

# Lowercase and whitespace-split each document once; the tokens are reused
# for both the vocabulary and the per-document vectors.
tokenized_documents = [document.lower().split() for document in documents]

vocabulary = set()
for tokens in tokenized_documents:
  vocabulary.update(tokens)

# A set has no stable order, so fix the column order by sorting. The same
# order is used for the printed vocabulary and for every row below.
ordered_vocabulary = sorted(vocabulary)

bow_representation = []
for tokens in tokenized_documents:
  present = set(tokens)
  # One entry per vocabulary word: 1 if the word occurs in the document, else 0.
  document_bow = [1 if word in present else 0 for word in ordered_vocabulary]
  # Append the finished row exactly once per document.
  bow_representation.append(document_bow)

print("vocabulary:", ordered_vocabulary)
for i, doc_bow in enumerate(bow_representation):
  print(f"Document {i+1}:", doc_bow)


# Task
Apply a classical machine learning model and a deep learning model to the NLP spam/non-spam example. Train and evaluate both models, and select the one with the best accuracy. Convert all preprocessing steps into a single function. Demonstrate the model output for a sample test input.

In [None]:
# Separate the features (clean text) and the target variable (spam/ham). Encode the target variable.
# Split the data into training and testing sets.

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Features are the cleaned message strings; the target is the 'v1' label
# column converted to integer codes by the LabelEncoder.
X = data['clean_txt']

le = LabelEncoder()
y = le.fit_transform(data['v1'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a classical machine learning model using the Count Vectorizer output as features.
# This involves initializing a CountVectorizer, fitting and transforming the data, initializing a Naive Bayes model, and training the model.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Rebinds `count_vectorizer` to a fresh instance whose vocabulary is learned from X_train only.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
# Test messages are encoded with the training vocabulary (transform only, no refit).
X_test_count = count_vectorizer.transform(X_test)

model_nb = MultinomialNB()
# Last expression of the cell, so the fitted estimator is also displayed.
model_nb.fit(X_train_count, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred_nb = model_nb.predict(X_test_count)

# Score the test-set predictions with each metric, in the order they are reported.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_nb)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

In [None]:
# Build and train a simple deep learning model for text classification using TensorFlow/Keras.

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Initialize Tokenizer and fit on training data
# Seed TensorFlow's global RNG so repeated runs are more repeatable
# (same value as the train/test split's random_state).
tf.random.set_seed(42)

# Single source of truth for the vocabulary size: the tokenizer cap and the
# embedding table must agree, otherwise token ids can fall outside the table.
VOCAB_SIZE = 5000
EMBEDDING_DIM = 16

# Initialize Tokenizer and fit on training data only
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>") # Limit vocabulary size and handle out-of-vocabulary words
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integer token ids
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) every sequence at the end to a common length
max_length = 100 # Define maximum sequence length
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')

# Build the deep learning model
model_dl = Sequential()
model_dl.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=max_length)) # Embedding layer
model_dl.add(LSTM(32)) # LSTM layer
model_dl.add(Dense(1, activation='sigmoid')) # Output layer for binary classification

# Compile the model
model_dl.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training rows are held out for validation.
# Kept as the last expression so the returned History object is displayed.
model_dl.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2)

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy) on the held-out test set.
loss, accuracy_dl = model_dl.evaluate(X_test_padded, y_test, verbose=0)
print(f"Deep Learning Model Accuracy: {accuracy_dl:.4f}")

In [None]:
#  Compare the performance metrics of the Naive Bayes model and the deep learning model.

# Naive Bayes metrics computed earlier, in reporting order.
nb_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
)

print("Classical Machine Learning Model (Naive Bayes) Performance:")
for metric_label, metric_value in nb_report:
    print(f"{metric_label}: {metric_value:.4f}")

# Only accuracy was recorded for the deep learning model.
print("\nDeep Learning Model Performance:")
print(f"Accuracy: {accuracy_dl:.4f}")

In [None]:
# Define a function that encapsulates all the preprocessing steps.

import string
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Imported under an alias so the module-level `stopwords` list that
# remove_stopwords relies on is not rebound to the corpus reader.
from nltk.corpus import stopwords as stopwords_corpus

# word_tokenize needs 'punkt_tab' on current NLTK releases (the tokenization
# cell earlier downloads it); 'punkt' is still requested for older releases.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Shared helpers read by preprocess_text.
ps = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_english = stopwords_corpus.words('english')

def preprocess_text(text):
    """Clean one raw message with the same steps used to build ``clean_txt``.

    Steps, in order: strip punctuation, lowercase, tokenize, drop English
    stop words, Porter-stem, lemmatize, and join tokens with single spaces.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message as one space-separated string.
    """
    # Remove punctuation
    punctuationfree = ''.join([i for i in text if i not in string.punctuation])

    # Lowercase Conversion
    msg_lower = punctuationfree.lower()

    # Word Tokenization
    words = nltk.word_tokenize(msg_lower)

    # Removal of stop words
    output = [i for i in words if i not in stopwords_english]

    # Stemming: the training column was stemmed before it was lemmatized, so
    # the same must happen here or the tokens will not match the vocabulary
    # the vectorizer was fitted on.
    stem_text = [ps.stem(word) for word in output]

    # Lemmatization
    lemma_text = [wordnet_lemmatizer.lemmatize(x) for x in stem_text]

    # Join back into a string
    clean_text = " ".join(lemma_text)

    return clean_text

In [None]:
# Smoke-check preprocess_text on a sentence containing punctuation and stop words.
sample_text = "Hello! This is a test message, with some stop words and punctuation."
processed_sample_text = preprocess_text(sample_text)
print(processed_sample_text)

In [None]:
sample_test_input = "Claim your free prize now! Click here."
# Clean the text, then encode it with the vectorizer fitted on the training split.
processed_sample_input = preprocess_text(sample_test_input)
sample_input_count = count_vectorizer.transform([processed_sample_input])
prediction = model_nb.predict(sample_input_count)

# Decode the numeric class through the fitted LabelEncoder instead of
# hard-coding which integer means "spam".
predicted_label = le.inverse_transform(prediction)[0]

print(f"Original Text: {sample_test_input}")
print(f"Predicted Class: {predicted_label}")