# CCS-249 Exercise Unit 2: ELIZA and RegEx NLP

## Exercise 1: Updating ELIZA
Builds a chatbot that recognizes patterns using regex and responds to specific questions. Includes 5 emotional patterns and sarcastic responses for repeated questions.

In [None]:
# 1. Updating ELIZA
import re
import random

def reflect(fragment):
    """Swap pronouns to reflect user input back to them.

    The fragment is split on whitespace and each word is looked up
    case-insensitively in a first-person <-> second-person table. Words
    that are not in the table are returned unchanged.

    Trailing punctuation (``. , ! ? ; :``) is set aside before the lookup
    and re-attached afterwards, so ``"me."`` becomes ``"you."``. A
    typographic apostrophe (U+2019) is treated like a plain ``'`` for the
    lookup, so ``"I’m"`` is handled the same as ``"I'm"``.

    Args:
        fragment: Text captured from the user's sentence.

    Returns:
        The reflected words joined by single spaces.
    """
    reflections = {
        "am": "are",
        "was": "were",
        "i": "you",
        "i'd": "you would",
        "i've": "you have",
        "i'll": "you will",
        "i'm": "you are",
        "my": "your",
        "mine": "yours",
        "myself": "yourself",
        "are": "am",
        "you've": "I have",
        "you'll": "I will",
        "you're": "I am",
        "your": "my",
        "yours": "mine",
        "yourself": "myself",
        "you": "me",
        "me": "you",
    }

    def swap(token):
        # Separate trailing punctuation so it does not defeat the lookup.
        core = token.rstrip(".,!?;:")
        tail = token[len(core):]
        # Normalise curly apostrophes only for the lookup key; an
        # unmatched word is returned exactly as the user typed it.
        key = core.lower().replace("\u2019", "'")
        return reflections.get(key, core) + tail

    return ' '.join(swap(word) for word in fragment.split())

def eliza_response(user_input):
    """Generate ELIZA response based on pattern matching.

    Rules are tried in order using ``re.match`` (anchored at the start of
    the input, case-insensitive) and the first rule that matches wins.
    Templates containing ``{0}`` are filled with the reflected text of the
    first capture group; templates without a placeholder are returned
    verbatim. Trailing sentence punctuation is removed from the captured
    text before it is inserted, so "I need help." yields
    "Why do you need help?" rather than "Why do you need help.?".

    Args:
        user_input: The sentence typed by the user.

    Returns:
        The reply string, or a generic prompt when no rule matches.
    """
    # ['\u2019] accepts both the plain and the typographic apostrophe.
    patterns = [
        (r"I need (.*)", "Why do you need {0}?"),
        (r"Why don['\u2019]t you (.*)", "Do you really think I don't {0}?"),
        (r"I feel (.*)", "Tell me more about feeling {0}."),
        (r"I want to know the reasons why I am feeling depressed all the time\.?",
         "Why do you want to know the reasons for feeling depressed all the time?"),
        (r"I am feeling stressed\.?", "Tell me more about why you're feeling stressed."),
        (r"My feelings towards my crush are invalidated\.?",
         "Why do you think your feelings towards your crush are invalidated?"),
        (r"You (don['\u2019]t|do not) understand me\.?",
         "Why do you think I don't understand you?"),
        (r"I (can['\u2019]t|cannot) focus on my studies\.?",
         "What is making it difficult for you to focus on your studies?"),
    ]

    for pattern, response in patterns:
        match = re.match(pattern, user_input, re.IGNORECASE)
        if not match:
            continue
        if "{0}" not in response:
            # Fixed reply: any capture group here is only an alternation.
            return response
        # The template supplies its own end punctuation, so drop the user's.
        fragment = match.group(1).rstrip(".!?,;: \t")
        return response.format(reflect(fragment))
    return "Tell me more about that."

# Every distinct input that ELIZA has already answered, in the order seen.
previous_questions = []

# Canned replies used when the user repeats an earlier input.
sarcasm = [
    "We've been through this already. Try something new.",
    "Seriously? Again? I thought we moved on.",
    "Oh wow, you're stuck on that? Let's call it a day.",
    "Asking me the same thing twice won't change anything.",
    "What, did you forget I already answered that?",
    "Congratulations, you've discovered the loop function.",
    "I'm not a broken record, even if you're testing me."
]

# Interactive loop: runs until the user types "quit" or "exit".
print("ELIZA: Hello! How can I help you today?")
while True:
    user_input = input("You: ").strip()
    lowered = user_input.lower()

    if lowered in ("quit", "exit"):
        print("ELIZA: Goodbye!")
        break

    # Blank lines are ignored and never recorded.
    if user_input == "":
        continue

    # Repeats are detected case-insensitively against everything answered so far.
    if any(lowered == earlier.lower() for earlier in previous_questions):
        print(f"ELIZA: {random.choice(sarcasm)}\n")
        continue

    response = eliza_response(user_input)
    print(f"ELIZA: {response}\n")
    previous_questions.append(user_input)

## Exercise 2: Implementing RegEx on NLP
Applies regex patterns to extract and process text data.

### Part A: Extract Capitalized Words
Use regex to find all words that start with a capital letter in the text.

In [None]:
# Part A: Extract words starting with uppercase
import re

banner = "=" * 50
print(banner)
print("PART A: Extract Capitalized Words")
print(banner)

# Sample passage to scan.
text = """Alice was beginning to get very tired of sitting by her sister on the bank,
and of having nothing to do. Once or twice she had peeped into the book
her sister was reading, but it had no pictures or conversations in it, "and
what is the use of a book," thought Alice, "without pictures or conversations?"""

# A word boundary, one ASCII uppercase letter, then any further word characters.
pattern = r'\b[A-Z]\w*'
words = [found.group(0) for found in re.finditer(pattern, text)]

print(f"Pattern: {pattern}")
print(f"Capitalized words: {words}\n")

### Part B: Extract and Replace from Literary Text
Read the Moby Dick text file, find every occurrence of "Whale", "Whales", "whale", or "whales", and replace the first 10 occurrences with "leviathan".

In [None]:
# Part B: Extract and replace Whale/whales in Moby Dick
print("=" * 50)
print("PART B: Extract and Replace Whale/Whales")
print("=" * 50)

try:
    # NOTE(review): absolute, machine-specific path; on any other machine
    # this falls through to the FileNotFoundError branch below.
    with open(r'c:\Users\DELL\Desktop\CCS-249_25-26_Activities\BSCS 3A\KYLA_ELIJAH_RAMIRO\melville-moby_dick.txt', 'r', encoding='utf-8') as f:
        text = f.read()
except FileNotFoundError:
    print("Error: melville-moby_dick.txt not found\n")
else:
    # Whole-word match for the singular and plural, capitalised or lowercase.
    pattern = r'\b(Whale|Whales|whale|whales)\b'
    matches = re.findall(pattern, text)

    print(f"Pattern: {pattern}")
    print(f"Total matches: {len(matches)}")
    print(f"First 10: {matches[:10]}\n")

    # Replace first 10 instances; `count` caps the number of substitutions,
    # so no hand-written counter callback is needed.
    modified = re.sub(pattern, "leviathan", text, count=10)
    print("First 10 instances replaced with 'leviathan'\n")

### Part C: Extract Character Dialogue from NLTK Corpus
Use NLTK to load the pirates.txt file and extract all lines spoken by Jack Sparrow.

In [None]:
# Part C: Extract Jack Sparrow lines from NLTK Pirates corpus
print("=" * 50)
print("PART C: Extract Jack Sparrow Dialogue")
print("=" * 50)

try:
    import nltk
    from nltk.corpus import webtext

    # Fetch the corpus if it is not already present (quiet: no progress output).
    nltk.download('webtext', quiet=True)

    text = webtext.raw('pirates.txt')
    # Capture lazily from just after the speaker label up to the next
    # newline that starts a "<letters/whitespace>:" label, or the end of
    # the text (DOTALL lets "." cross newlines).
    # NOTE(review): re.IGNORECASE also makes [A-Z] match lowercase letters,
    # so the lookahead is not limited to upper-case speaker labels - confirm
    # this is intended.
    pattern = r'JACK SPARROW:\s*(.+?)(?=\n[A-Z\s]+:|$)'

    lines = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

    print(f"Pattern: {pattern}")
    print(f"Total Jack Sparrow lines: {len(lines)}\n")
    print("First 5 lines:")
    for i, line in enumerate(lines[:5], 1):
        # Flatten to a single line and cap the preview at 80 characters.
        clean = line.strip().replace('\n', ' ')[:80]
        print(f"{i}. {clean}...\n")

except ImportError:
    print("Error: NLTK not installed. Run: pip install nltk")
except LookupError:
    # NLTK signals a missing corpus (e.g. the download could not complete)
    # with LookupError when the corpus is first accessed.
    print("Error: NLTK 'webtext' corpus not available. Run: nltk.download('webtext')")

## Exercise 3: Naïve Bayes Implementation
Manual implementation of a Naïve Bayes classifier for spam detection without using external packages/libraries.

### Dataset:
| Document | Class |
|----------|-------|
| Free money now!!! | SPAM |
| Hi mom, how are you? | HAM |
| Lowest price for your meds | SPAM |
| Are we still on for dinner? | HAM |
| Win a free iPhone today | SPAM |
| Let's catch up tomorrow at the office | HAM |
| Meeting at 3 PM tomorrow | HAM |
| Get 50% off, limited time! | SPAM |
| Team meeting in the office | HAM |
| Click here for prizes! | SPAM |
| Can you send the report? | HAM |

In [None]:
# Dataset: List of documents with their corresponding class labels
# Labelled training messages as (text, class) pairs; class is "SPAM" or "HAM".
documents = [
    ("Free money now!!!", "SPAM"),
    ("Hi mom, how are you?", "HAM"),
    ("Lowest price for your meds", "SPAM"),
    ("Are we still on for dinner?", "HAM"),
    ("Win a free iPhone today", "SPAM"),
    ("Let's catch up tomorrow at the office", "HAM"),
    ("Meeting at 3 PM tomorrow", "HAM"),
    ("Get 50% off, limited time!", "SPAM"),
    ("Team meeting in the office", "HAM"),
    ("Click here for prizes!", "SPAM"),
    ("Can you send the report?", "HAM"),
]

print("Dataset loaded successfully!")
print(f"Total documents: {len(documents)}")
print("\nFirst 3 documents:")
# Numbered preview of the first three entries.
for number, (message, category) in enumerate(documents[:3], start=1):
    print(f"{number}. '{message}' - {category}")

### Part A: Generate Bag of Words (Word Frequency)
Creates a bag of words representation to count word frequencies across all documents.

In [None]:
def tokenize(text):
    """Split text into lowercase alphanumeric tokens.

    The text is lowercased, every character that is not alphanumeric
    (per ``str.isalnum``) acts as a separator, and empty pieces are
    discarded. For example, "Let's go!" yields ["let", "s", "go"].
    """
    # Blank out every separator character, then let split() collapse the runs.
    spaced = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    return spaced.split()

def create_bag_of_words(documents):
    """Count how often each token occurs across all documents.

    ``documents`` is an iterable of (text, label) pairs; the label is
    ignored. Returns a dict mapping each token produced by ``tokenize``
    to its total number of occurrences, in first-seen order.
    """
    frequencies = {}
    for content, _label in documents:
        for word in tokenize(content):
            frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies

# Generate bag of words
bag_of_words = create_bag_of_words(documents)

rule = "="*60
print(rule)
print("PART A: Bag of Words (Word Frequency)")
print(rule)
print(f"\nTotal unique words in vocabulary: {len(bag_of_words)}")
print("\nWord frequencies (sorted by frequency):")

# Highest counts first; ties keep their first-seen order.
sorted_words = sorted(bag_of_words.items(), key=lambda entry: entry[1], reverse=True)
for token, occurrences in sorted_words[:15]:
    print(f"  '{token}': {occurrences}")

# Report how many entries were left out of the top-15 listing.
remaining = len(sorted_words) - 15
if remaining > 0:
    print(f"\n... and {remaining} more words")
else:
    print("")

### Part B: Calculate Prior Probabilities
Calculates the prior probability for each class (HAM and SPAM).

In [None]:
def calculate_prior(documents):
    """Compute the prior probability of every class label.

    ``documents`` is a sequence of (text, label) pairs. Returns a tuple
    ``(prior, class_counts)`` where ``class_counts`` maps each label to
    its number of documents and ``prior`` maps each label to that count
    divided by the total number of documents.
    """
    total_docs = len(documents)

    class_counts = {}
    for _text, label in documents:
        class_counts[label] = class_counts.get(label, 0) + 1

    prior = {label: seen / total_docs for label, seen in class_counts.items()}
    return prior, class_counts

# Calculate priors
prior, class_counts = calculate_prior(documents)

print("="*60)
print("PART B: Prior Probabilities")
print("="*60)
print(f"\nTotal documents: {len(documents)}")
print("\nClass distribution:")
for label, count in class_counts.items():
    print(f"  {label}: {count} documents")

print("\nPrior probabilities:")
for label, prob in prior.items():
    # Look up this label's own count; reusing the loop variable left over
    # from the listing above would show the same numerator for every class.
    print(f"  P({label}) = {class_counts[label]}/{len(documents)} = {prob:.4f} ({prob*100:.2f}%)")

### Part C: Calculate Likelihood of Tokens
Calculates the likelihood of each token in the vocabulary with respect to each class using Laplace smoothing.

In [None]:
def calculate_likelihood(documents, vocabulary):
    """Estimate P(word | class) for every vocabulary word with add-one smoothing.

    ``documents`` is an iterable of (text, label) pairs and ``vocabulary``
    is a sized, re-iterable collection of words. Returns a tuple
    ``(likelihood, word_count_by_class, total_words_by_class)``:

    * ``likelihood[label][word]`` is
      (count of word in class + 1) / (total tokens in class + vocabulary size);
    * ``word_count_by_class[label][word]`` is the raw token count;
    * ``total_words_by_class[label]`` is the number of tokens in that class.
    """
    per_class_counts = {}
    per_class_totals = {}

    # Tally tokens per class, registering each label the first time it appears.
    for content, label in documents:
        tally = per_class_counts.setdefault(label, {})
        per_class_totals.setdefault(label, 0)
        for token in tokenize(content):
            tally[token] = tally.get(token, 0) + 1
            per_class_totals[label] += 1

    vocab_size = len(vocabulary)

    # Laplace smoothing (alpha = 1): unseen words still get a non-zero probability.
    likelihood = {}
    for label, tally in per_class_counts.items():
        denominator = per_class_totals[label] + vocab_size
        likelihood[label] = {
            word: (tally.get(word, 0) + 1) / denominator for word in vocabulary
        }

    return likelihood, per_class_counts, per_class_totals

# Calculate likelihood
likelihood, word_count_by_class, total_words_by_class = calculate_likelihood(
    documents, bag_of_words.keys()
)

rule = "="*60
print(rule)
print("PART C: Likelihood of Tokens")
print(rule)
print(f"\nVocabulary size: {len(bag_of_words)}")
print("\nTotal words per class:")
for label in total_words_by_class:
    print(f"  {label}: {total_words_by_class[label]} words")

print("\nLikelihood with Laplace smoothing (alpha=1):")
print("Formula: P(word|class) = (count(word in class) + 1) / (total words in class + vocab_size)")

# Worked examples; words that are not in the vocabulary are skipped.
example_words = ['free', 'money', 'meeting', 'office', 'spam', 'ham']
print("\nExample likelihood values:")
for word in example_words:
    if word not in bag_of_words:
        continue
    print(f"\n  Word: '{word}'")
    for label, table in likelihood.items():
        count = word_count_by_class[label].get(word, 0)
        prob = table[word]
        print(f"    P('{word}'|{label}) = ({count} + 1) / ({total_words_by_class[label]} + {len(bag_of_words)}) = {prob:.6f}")

# Five highest raw counts per class; ties keep their first-seen order.
print("\nTop 5 most frequent words per class:")
for label, tally in word_count_by_class.items():
    print(f"\n  {label}:")
    sorted_words = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)
    for word, count in sorted_words[:5]:
        prob = likelihood[label][word]
        print(f"    '{word}': count={count}, P('{word}'|{label})={prob:.6f}")

### Summary
This implementation demonstrates a manual Naïve Bayes classifier with:
1. **Bag of Words**: Tracks word frequencies across all documents
2. **Prior Probabilities**: P(HAM) and P(SPAM) based on document counts
3. **Likelihood**: P(word|class) for each word in vocabulary using Laplace smoothing

No external libraries were used - all calculations are done manually using basic Python data structures.