<a href="https://colab.research.google.com/github/Zeinab13Rm/NLP_Project/blob/main/NLP_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
    !git clone https://github.com/Zeinab13Rm/NLP_Project

fatal: destination path 'NLP_Project' already exists and is not an empty directory.


# SMS Spam Detection using NLP

## Setup

Importing Libraries

In [22]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Importing Data

In [23]:
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
text_messages = df[df.columns[1]]

## Data Preprocessing

Expanding Contractions

In [25]:
# Dictionary of common contractions and their expansions
CONTRACTIONS_DICT = {
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
}

def expand_contractions(text):
    """
    Expand common contractions in text
    """
    # Create a regex pattern that matches any of the contractions
    contractions_pattern = re.compile('({})'.format('|'.join(CONTRACTIONS_DICT.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Handle case sensitivity
        expanded = CONTRACTIONS_DICT.get(match.lower())
        if expanded is None:
            expanded = CONTRACTIONS_DICT.get(match)
        if expanded:
            # Preserve the original case as much as possible
            if match[0].isupper():
                expanded = expanded[0].upper() + expanded[1:]
            return expanded
        return match

    return contractions_pattern.sub(expand_match, text)


Using regular expressions to replace email adresses , urls , numbers , etc.

In [26]:

def clean_text(text):
    """
    Cleans a single text string by applying a series of re.sub rules.
    """
    # First expand contractions
    text = expand_contractions(text)

    # replacing email addresses with 'emailaddr'
    text = re.sub(r'\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}', 'emailaddr', text)

    # replacing urls with 'webaddress'
    text = re.sub(r'http[s]?\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?', 'webaddress', text)

    # replacing money symbols with 'moneysymb'
    text = re.sub(r'£|\$', 'moneysymb', text)

    # replacing 10+ digit phone numbers with 'phonenum' (improved for general numbers)
    text = re.sub(r'\d{10,}', 'phonenum', text)

    # replacing normal numbers with 'num'
    text = re.sub(r'\d+(\.\d+)?', 'num', text)

    # remove punctuation
    text = re.sub(r'[^\w\d\s]', ' ', text)

    # replace whitespace between terms with a single space
    text = re.sub(r'\s+', ' ', text)

    # remove leading and trailing whitespace
    text = re.sub(r'^\s+|\s+$', '', text)

    return text

# Apply the function to every message
processed = text_messages.apply(clean_text)


Changing words to lowercase

In [27]:
# change all words to lower case
processed = processed.str.lower()
# Add the processed text as a new column to the DataFrame to see the comparison
df['processed_message'] = processed

In [28]:
# Print the first 5 original and processed messages to see the changes
print(df[['message', 'processed_message']].head(5))

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                   processed_message  
0  go until jurong point crazy available only in ...  
1                            ok lar joking wif u oni  
2  free entry in num a wkly comp to win fa cup fi...  
3        u dun say so early hor u c already then say  
4  nah i do not think he goes to usf he lives aro...  


Tokenization

In [29]:
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
print(all_words)



Removing Stop words

In [30]:
# Get English stop words
stop_words = set(stopwords.words('english'))

# Remove stop words from all_words
filtered_words = [word for word in all_words if word.lower() not in stop_words]

print(f"Original number of words: {len(all_words)}")
print(f"Number of words after removing stop words: {len(filtered_words)}")
print("\nSample of filtered words:")
print(filtered_words[:20])
tokens = filtered_words

Original number of words: 90312
Number of words after removing stop words: 53351

Sample of filtered words:
['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat', 'ok', 'lar', 'joking', 'wif']


## Stemming vs. Lemmatization

### Stemmers

In [31]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

sample_words = ['studies', 'studying', 'running', 'organization', 'traditionally', 'better', 'calls', 'calling']

print(f"{'Word':<15} | {'Porter':<15} | {'Lancaster':<15} | {'Snowball':<15}")
print("-" * 65)

for word in sample_words:
    p_stem = porter.stem(word)
    l_stem = lancaster.stem(word)
    s_stem = snowball.stem(word)
    print(f"{word:<15} | {p_stem:<15} | {l_stem:<15} | {s_stem:<15}")


Word            | Porter          | Lancaster       | Snowball       
-----------------------------------------------------------------
studies         | studi           | study           | studi          
studying        | studi           | study           | studi          
running         | run             | run             | run            
organization    | organ           | org             | organ          
traditionally   | tradit          | tradit          | tradit         
better          | better          | bet             | better         
calls           | call            | cal             | call           
calling         | call            | cal             | call           


### Lemmatization

In [32]:
lemmatizer = WordNetLemmatizer()

print(f"{'Word':<15} | {'Lemma':<15}")
print("-" * 35)
for word in sample_words:
    print(f"{word:<15} | {lemmatizer.lemmatize(word):<15}")

Word            | Lemma          
-----------------------------------
studies         | study          
studying        | studying       
running         | running        
organization    | organization   
traditionally   | traditionally  
better          | better         
calls           | call           
calling         | calling        


**Conclusion:** For this spam classification project, stemming is the more appropriate and efficient choice.

**Justification:**

1. **Goal of the Task:** Our goal is to classify spam. A machine learning model (like NB or a SVM) doesn't need to understand the semantic meaning of a sentence. It just needs to identify patterns of spam features.
For a model, the feature studi (from stemming) is just as useful as study (from lemmatization) for grouping studies, studying, and study. The fact that studi isn't a real English word is irrelevant to the model.

2. **Speed and Performance:**
Lemmatization is computationally expensive. To work correctly (e.g., to turn better into good or running into run), it requires Part-of-Speech (POS) tagging. This means we must run a second model (a POS-tagger) on every single word just to prepare it for the first model.
Stemming is way faster as it uses a simple set of rules to chop off ends. This is far more efficient for large datasets.

Given that stemming provides 90% of the benefit (feature reduction) for a fraction of the computational cost, it is the chosen method for a task like spam filtering.

###Chosen Method

**Conclusion**: after testing different stemmers on the sample data , the SnowballStemmer was selected.

**Justification**:

* **Lancaster** is Overly Aggressive: this stemmer was found to be too aggressive. It reduces words so much that they can become ambiguous or lose their meaning (e.g., calls $\rightarrow$ cal, organization $\rightarrow$ org). This "over-stemming" could mistakenly group unrelated words, which would harm our model's accuracy.

* **Porter** is a Good Baseline: The original Porter stemmer is a classic and much safer than Lancaster. However, it's older and has known problems.

* **Snowball** is the ***Best*** Balance: The SnowballStemmer is a more modern, refined version of the Porter algorithm. It is slightly more aggressive and consistent than Porter (handling words Porter misses) but avoids the destructive over-stemming of Lancaster. It is widely considered the best stemmer and provides the ideal balance of speed and effective feature reduction for this task.

In [33]:
stemmed_tokens = [snowball.stem(t) for t in tokens]

print(f"Original token count: {len(tokens)}")
print(f"Stemmed token count:  {len(stemmed_tokens)}")
print("\nSample of stemmed tokens:")
print(stemmed_tokens[:20])

Original token count: 53351
Stemmed token count:  53351

Sample of stemmed tokens:
['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat', 'ok', 'lar', 'joke', 'wif']


## N-grams

In [34]:
all_message_bigrams = []
all_message_trigrams = []

#here we recreated the whole pipeline after choosing method
for message in processed:
    # 1. Tokenize
    words = word_tokenize(message)
    # 2. Remove Stopwords
    filtered = [w for w in words if w not in stop_words]
    # 3. Stem
    stemmed = [snowball.stem(w) for w in filtered]

    # 4. Generate N-grams
    # We use list() to convert the ngrams generator to a list
    bigrams = list(ngrams(stemmed, 2))
    trigrams = list(ngrams(stemmed, 3))

    all_message_bigrams.extend(bigrams)
    all_message_trigrams.extend(trigrams)
print(f"Total number of Bigrams found: {len(all_message_bigrams)}")
print(f"Total number of Trigrams found: {len(all_message_trigrams)}")

print("\nSample Bigrams:")
print(all_message_bigrams[:15])

print("\nSample Trigrams:")
print(all_message_trigrams[:15])


Total number of Bigrams found: 47786
Total number of Trigrams found: 42323

Sample Bigrams:
[('go', 'jurong'), ('jurong', 'point'), ('point', 'crazi'), ('crazi', 'avail'), ('avail', 'bugi'), ('bugi', 'n'), ('n', 'great'), ('great', 'world'), ('world', 'la'), ('la', 'e'), ('e', 'buffet'), ('buffet', 'cine'), ('cine', 'got'), ('got', 'amor'), ('amor', 'wat')]

Sample Trigrams:
[('go', 'jurong', 'point'), ('jurong', 'point', 'crazi'), ('point', 'crazi', 'avail'), ('crazi', 'avail', 'bugi'), ('avail', 'bugi', 'n'), ('bugi', 'n', 'great'), ('n', 'great', 'world'), ('great', 'world', 'la'), ('world', 'la', 'e'), ('la', 'e', 'buffet'), ('e', 'buffet', 'cine'), ('buffet', 'cine', 'got'), ('cine', 'got', 'amor'), ('got', 'amor', 'wat'), ('ok', 'lar', 'joke')]
