In [1]:
# Install and import libraries
import nltk
import pandas as pd
import numpy as np
import re
import tensorflow as tf


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Download required resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Load the dataset
file_path = "/content/spam_sms.csv"  # Replace with your actual file path
data = pd.read_csv(file_path, encoding='latin-1')


In [3]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Rename columns for clarity
data.columns = ['Label', 'Message']

First few rows of the dataset:
     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Check for missing values
print("\nChecking for missing values:")
print(data.isnull().sum())

# Drop missing values if any
data.dropna(inplace=True)

# Map labels to binary values
data['Label'] = data['Label'].map({'ham': 0, 'spam': 1})


Checking for missing values:
Label      0
Message    0
dtype: int64


In [5]:
# Clean the text data
def clean_text(text):
    # Remove special characters, numbers, and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['Cleaned_Message'] = data['Message'].apply(clean_text)

In [6]:
# Tokenize the text
data['Tokenized_Message'] = data['Cleaned_Message'].apply(word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))
data['Tokenized_Message'] = data['Tokenized_Message'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# Apply stemming
stemmer = PorterStemmer()
data['Stemmed_Message'] = data['Tokenized_Message'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
data['Lemmatized_Message'] = data['Tokenized_Message'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data['Lemmatized_Message'], data['Label'], test_size=0.2, random_state=42
)

# Display final processed data
print("\nSample of processed data:")
print(data.head())



Sample of processed data:
   Label                                            Message  \
0      0  Go until jurong point, crazy.. Available only ...   
1      0                      Ok lar... Joking wif u oni...   
2      1  Free entry in 2 a wkly comp to win FA Cup fina...   
3      0  U dun say so early hor... U c already then say...   
4      0  Nah I don't think he goes to usf, he lives aro...   

                                     Cleaned_Message  \
0  go until jurong point crazy available only in ...   
1                            ok lar joking wif u oni   
2  free entry in  a wkly comp to win fa cup final...   
3        u dun say so early hor u c already then say   
4  nah i dont think he goes to usf he lives aroun...   

                                   Tokenized_Message  \
0  [go, jurong, point, crazy, available, bugis, n...   
1                     [ok, lar, joking, wif, u, oni]   
2  [free, entry, wkly, comp, win, fa, cup, final,...   
3      [u, dun, say, early, hor, 

In [8]:
# Convert lemmatized messages back into strings for vectorization
data['Processed_Message'] = data['Lemmatized_Message'].apply(lambda tokens: ' '.join(tokens))

# Apply TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features if needed
tfidf_features = tfidf_vectorizer.fit_transform(data['Processed_Message'])


# Convert the sparse matrices to DataFrames for easier manipulation
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


# Add the labels back to the data
tfidf_df['Label'] = data['Label'].values


# Save the datasets to CSV files
tfidf_df.to_csv('tfidf_sms_dataset.csv', index=False)


print("TF-IDF datasets saved as 'tfidf_sms_dataset.csv'")


TF-IDF datasets saved as 'tfidf_sms_dataset.csv'


In [9]:

# Load the TF-IDF dataset
tfidf_df = pd.read_csv('tfidf_sms_dataset.csv')

# Separate features and labels
X = tfidf_df.drop(columns=['Label'])
y = tfidf_df['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Model Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Performance Metrics:
Accuracy: 0.9480
Precision: 0.9600
Recall: 0.6400
F1-Score: 0.7680

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.64      0.77       150

    accuracy                           0.95      1115
   macro avg       0.95      0.82      0.87      1115
weighted avg       0.95      0.95      0.94      1115



In [10]:
# Load the TF-IDF dataset
tfidf_df = pd.read_csv('tfidf_sms_dataset.csv')

# Separate features and labels
X = tfidf_df.drop(columns=['Label']).values
y = tfidf_df['Label'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network model
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Evaluate performance
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
112/112 - 5s - 40ms/step - accuracy: 0.8898 - loss: 0.2882 - val_accuracy: 0.9652 - val_loss: 0.1176
Epoch 2/10
112/112 - 1s - 10ms/step - accuracy: 0.9818 - loss: 0.0609 - val_accuracy: 0.9798 - val_loss: 0.0834
Epoch 3/10
112/112 - 2s - 15ms/step - accuracy: 0.9969 - loss: 0.0122 - val_accuracy: 0.9776 - val_loss: 0.0769
Epoch 4/10
112/112 - 2s - 20ms/step - accuracy: 0.9992 - loss: 0.0052 - val_accuracy: 0.9776 - val_loss: 0.0801
Epoch 5/10
112/112 - 2s - 21ms/step - accuracy: 0.9994 - loss: 0.0030 - val_accuracy: 0.9776 - val_loss: 0.0817
Epoch 6/10
112/112 - 1s - 13ms/step - accuracy: 0.9994 - loss: 0.0018 - val_accuracy: 0.9787 - val_loss: 0.0887
Epoch 7/10
112/112 - 3s - 25ms/step - accuracy: 0.9994 - loss: 0.0020 - val_accuracy: 0.9776 - val_loss: 0.0912
Epoch 8/10
112/112 - 1s - 12ms/step - accuracy: 0.9997 - loss: 0.0011 - val_accuracy: 0.9753 - val_loss: 0.0858
Epoch 9/10
112/112 - 2s - 22ms/step - accuracy: 1.0000 - loss: 5.9912e-04 - val_accuracy: 0.9742 - val_l