<a href="https://colab.research.google.com/github/abdullahwarraichh/Machine-Learning-Projects/blob/main/sms_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries
import pandas as pd  # For handling data in tabular format
import tensorflow_datasets as tfds  # For accessing datasets (not used much here)
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For visualizing data (not used here)
from tensorflow.keras.preprocessing.text import one_hot  # For converting words to numbers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For ensuring equal length of input sequences
from tensorflow.keras.models import Sequential  # For building a sequential neural network
from tensorflow.keras.layers import Flatten, Embedding, Dense  # For building different layers of the neural network
from tensorflow.keras.callbacks import EarlyStopping  # To stop training early if performance doesn't improve

# Step 1: Downloading and setting file paths for training and testing data
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv -O train_data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv -O test_data.tsv

train_data_path = "train_data.tsv"
test_data_path = "test_data.tsv"

# Step 2: Reading the data into pandas DataFrames
column_names = ["label", "text_message"]  # Columns: one for the label (ham/spam), one for the actual message
train_data = pd.read_csv(train_data_path, sep='\t', names=column_names)
test_data = pd.read_csv(test_data_path, sep='\t', names=column_names)

# Step 3: Extracting messages and labels from the data
train_texts = train_data["text_message"].tolist()  # List of all the messages from training data
train_labels = np.array([0 if label == "ham" else 1 for label in train_data['label']])  # 0 for ham, 1 for spam

test_texts = test_data["text_message"].tolist()  # List of all the messages from test data
test_labels = np.array([0 if label == "ham" else 1 for label in test_data['label']])  # 0 for ham, 1 for spam

# Step 4: Count how often each whitespace-separated token occurs in the training messages
vocab_dict = {}
for message in train_texts:
    for token in message.split():
        # Start unseen tokens at 0 so the same expression handles first and repeat sightings
        vocab_dict[token] = vocab_dict.get(token, 0) + 1

# Step 5: Encoding messages into numbers and padding them to have the same length
VOCAB_SIZE = len(vocab_dict)  # Number of unique words; used as the hash space for one_hot

# Hash every word of every message to an integer in [1, VOCAB_SIZE)
encoded_train_texts = [one_hot(msg, VOCAB_SIZE) for msg in train_texts]
encoded_test_texts = [one_hot(msg, VOCAB_SIZE) for msg in test_texts]

# Measure the longest training message AFTER encoding. one_hot tokenizes with
# Keras' text_to_word_sequence, which treats punctuation as a separator, so a
# str.split() word count can under-count tokens (e.g. "hi,there" is one split
# word but two tokens). An under-counted maxlen makes pad_sequences silently
# drop the leading tokens of the longest messages.
MAX_MESSAGE_LENGTH = max(len(seq) for seq in encoded_train_texts)

# Right-pad with zeros so every sequence has exactly MAX_MESSAGE_LENGTH entries
padded_train_texts = pad_sequences(encoded_train_texts, maxlen=MAX_MESSAGE_LENGTH, padding='post')
padded_test_texts = pad_sequences(encoded_test_texts, maxlen=MAX_MESSAGE_LENGTH, padding='post')

# Step 6: Define the network as a single layer stack
model = Sequential([
    # Map each integer word id to a 100-dimensional vector
    Embedding(input_dim=VOCAB_SIZE, output_dim=100, input_length=MAX_MESSAGE_LENGTH),
    # Collapse the per-word vectors into one flat feature vector per message
    Flatten(),
    # Single sigmoid unit: output in (0, 1) for the binary ham/spam decision
    Dense(1, activation='sigmoid'),
])

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train until validation accuracy stops improving
# Stop after 25 epochs without a better val_accuracy, then roll back to the
# weights of the best epoch; verbose=1 makes the callback report when it fires.
early_stopping_monitor = EarlyStopping(monitor='val_accuracy', patience=25,
                                       restore_best_weights=True, verbose=1)

# 1000 epochs is only an upper bound; the callback above normally ends training sooner.
# The second TSV file serves as the validation set; verbose=2 logs one line per epoch.
model.fit(
    x=padded_train_texts,
    y=train_labels,
    validation_data=(padded_test_texts, test_labels),
    epochs=1000,
    callbacks=[early_stopping_monitor],
    verbose=2,
)

# Step 8: Function to predict whether a new message is ham or spam
def predict_text_message(message):
    """Classify a single SMS message as ham or spam.

    The message is encoded and padded the same way as the training data,
    then scored by the module-level ``model``.

    Args:
        message: The raw message text.

    Returns:
        A two-element list ``[score, label]`` where ``score`` is the model's
        output for the message and ``label`` is ``"ham"`` when the score is
        below 0.5 and ``"spam"`` otherwise.
    """
    # Build a batch containing just this one encoded, right-padded message
    batch = pad_sequences(
        [one_hot(message, VOCAB_SIZE)],
        maxlen=MAX_MESSAGE_LENGTH,
        padding='post',
    )

    # predict returns one row per input and one column per output unit
    spam_score = model.predict(batch)[0][0]

    if spam_score < 0.5:
        verdict = "ham"
    else:
        verdict = "spam"
    return [spam_score, verdict]

# Step 9: Function to test the model with a set of sample messages
def test_model():
    """Check the classifier against seven hand-labelled sample messages.

    Every sample is run through ``predict_text_message``; a pass message is
    printed only when all predicted labels match, otherwise a failure message
    is printed. Nothing is returned.
    """
    # Each entry pairs a message with the label the classifier should produce
    labelled_samples = [
        ("how are you doing today", "ham"),
        ("sale today! to stop texts call 98912460324", "spam"),
        ("i dont want to go. can we try it a different day? available sat", "ham"),
        ("our new mobile video service is live. just install on your phone to start watching.", "spam"),
        ("you have won £1000 cash! call to claim your prize.", "spam"),
        ("i'll bring it tomorrow. don't forget the milk.", "ham"),
        ("wow, is your arm alright. that happened to me one time too", "ham"),
    ]

    # Deliberately no short-circuit: every sample is predicted even after a miss
    misclassified = [
        text
        for text, expected in labelled_samples
        if predict_text_message(text)[1] != expected
    ]

    if misclassified:
        print("Some predictions were incorrect. Keep trying.")
    else:
        print("You passed the challenge. Well done!")

# Run the test
test_model()


--2024-11-03 22:07:06--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train_data.tsv’


2024-11-03 22:07:06 (7.07 MB/s) - ‘train_data.tsv’ saved [358233/358233]

--2024-11-03 22:07:06--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘test_data.tsv’


2024-11-03 22:07:06 (5.08 MB/s) - ‘test_data.tsv’ saved [118774/118774]





Epoch 1/1000
131/131 - 9s - 69ms/step - accuracy: 0.9186 - loss: 0.2314 - val_accuracy: 0.9691 - val_loss: 0.0989
Epoch 2/1000
131/131 - 4s - 27ms/step - accuracy: 0.9844 - loss: 0.0567 - val_accuracy: 0.9835 - val_loss: 0.0549
Epoch 3/1000
131/131 - 4s - 30ms/step - accuracy: 0.9916 - loss: 0.0284 - val_accuracy: 0.9856 - val_loss: 0.0452
Epoch 4/1000
131/131 - 3s - 20ms/step - accuracy: 0.9964 - loss: 0.0145 - val_accuracy: 0.9864 - val_loss: 0.0413
Epoch 5/1000
131/131 - 2s - 17ms/step - accuracy: 0.9986 - loss: 0.0082 - val_accuracy: 0.9878 - val_loss: 0.0368
Epoch 6/1000
131/131 - 3s - 26ms/step - accuracy: 0.9998 - loss: 0.0050 - val_accuracy: 0.9864 - val_loss: 0.0420
Epoch 7/1000
131/131 - 4s - 32ms/step - accuracy: 0.9998 - loss: 0.0039 - val_accuracy: 0.9871 - val_loss: 0.0366
Epoch 8/1000
131/131 - 4s - 31ms/step - accuracy: 0.9998 - loss: 0.0028 - val_accuracy: 0.9892 - val_loss: 0.0340
Epoch 9/1000
131/131 - 3s - 25ms/step - accuracy: 0.9998 - loss: 0.0024 - val_accuracy: 