## Section 1: Environment Setup and Package Installation

Before we begin our NLP journey, let's ensure all required packages are installed and working correctly.

In [None]:
# Install required packages (run this cell if packages are not already installed)
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip and print whether it succeeded.

    pip is run as ``<current interpreter> -m pip install <package>`` so the
    package lands in the environment this code is executing in.

    Args:
        package (str): Requirement specifier passed straight to pip,
            e.g. ``"nltk==3.8.1"``.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        # pip exited with a non-zero status
        print(f"✗ Failed to install {package}")
    else:
        print(f"✓ Successfully installed {package}")

# Core NLP packages, each pinned to an exact version
_pinned_versions = {
    "nltk": "3.8.1",
    "spacy": "3.7.2",
    "scikit-learn": "1.3.2",
    "transformers": "4.35.2",
    "datasets": "2.14.6",
    "torch": "2.1.1",
    "pandas": "2.0.3",
    "matplotlib": "3.7.2",
    "seaborn": "0.12.2",
    "wordcloud": "1.9.2",
    "textblob": "0.17.1",
}

# pip requirement specifiers of the form "name==version", in the order above
packages = [f"{name}=={version}" for name, version in _pinned_versions.items()]

# Uncomment the next lines to install packages
# for package in packages:
#     install_package(package)

print(
    "Package installation section complete!",
    "Note: Uncomment the installation loop above if you need to install packages.",
    sep="\n",
)

In [None]:
# Verify imports and download NLTK data
import nltk
import spacy
import sklearn
import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
warnings.filterwarnings('ignore')

print("✓ All core packages imported successfully!")

# Download required NLTK data
nltk_downloads = [
    'punkt',
    'stopwords',
    'vader_lexicon',
    'wordnet',
    'averaged_perceptron_tagger',
    'omw-1.4',
]

print("\nDownloading NLTK data...")
for data in nltk_downloads:
    try:
        # nltk.download() signals most failures (unknown id, no network, ...)
        # by returning False instead of raising, and quiet=True hides its own
        # error output -- so the return value has to be checked explicitly.
        downloaded = nltk.download(data, quiet=True)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long-running download.
        print(f"✗ Failed to download {data}: {exc}")
        continue

    if downloaded:
        print(f"✓ Downloaded {data}")
    else:
        print(f"✗ Failed to download {data}")

print("\n🎉 Environment setup complete!")

## Section 2: Introduction to Natural Language Processing

### What is NLP?
Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between computers and human language. It enables machines to:

- **Understand** human language (reading comprehension)
- **Generate** human-like text (text generation)
- **Translate** between languages
- **Extract insights** from text data
- **Classify** and **analyze** text content


In [None]:
# A first, deliberately simple example to demonstrate basic NLP concepts

# Sample text data: five short product reviews
sample_texts = [
    "I love this product! It's amazing and works perfectly.",
    "This is the worst purchase I've ever made. Terrible quality!",
    "The product is okay, nothing special but gets the job done.",
    "Absolutely fantastic! Would recommend to everyone.",
    "Not bad, could be better but decent for the price.",
]

print("Sample Texts:")
print("=" * 50)
for i, text in enumerate(sample_texts, 1):
    print(f"{i}. {text}")

# Basic text analysis (character counts include spaces and punctuation)
total_chars = sum(map(len, sample_texts))
print("\nBasic Statistics:")
print(f"Number of texts: {len(sample_texts)}")
print(f"Total characters: {total_chars}")
print(f"Average text length: {total_chars / len(sample_texts):.1f} characters")

# Word frequency analysis
# Simple tokenization: lowercase each text and keep every run of word
# characters, which drops punctuation (so "It's" becomes "it" and "s")
all_words = [
    word
    for text in sample_texts
    for word in re.findall(r'\b\w+\b', text.lower())
]

word_freq = Counter(all_words)
print("\nTop 10 Most Common Words:")
for word, count in word_freq.most_common(10):
    print(f"{word}: {count}")

# Basic sentiment indicators (tiny hand-picked lexicons)
positive_words = ['love', 'amazing', 'perfectly', 'fantastic', 'recommend']
negative_words = ['worst', 'terrible', 'bad']

print("\nBasic Sentiment Analysis:")
for i, text in enumerate(sample_texts, 1):
    text_lower = text.lower()
    # Substring test: each lexicon word counts at most once per text
    pos_count = sum(word in text_lower for word in positive_words)
    neg_count = sum(word in text_lower for word in negative_words)

    if pos_count == neg_count:
        sentiment = "Neutral"
    elif pos_count > neg_count:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    print(f"Text {i}: {sentiment} (pos: {pos_count}, neg: {neg_count})")

## Section 3: Loading and Inspecting Datasets

Throughout this course, we'll work with several real-world datasets. Let's load and inspect some of the key datasets we'll be using.

In [None]:
# Create sample datasets for the course
import os

# Make sure the output folder exists before any CSV file is written
os.makedirs('data', exist_ok=True)

# 1. Sample SMS Spam Dataset -- (label, message) pairs
sms_data = [
    ("ham", "Hey, are we still on for lunch today?"),
    ("spam", "URGENT! You've won $1000! Click here now!"),
    ("ham", "Can you pick up milk on your way home?"),
    ("spam", "FREE iPhone! Limited time offer! Call now!"),
    ("ham", "Meeting moved to 3pm tomorrow"),
    ("spam", "Congratulations! You've been selected for a special offer!"),
    ("ham", "Thanks for the birthday wishes!"),
    ("spam", "SALE ALERT: 90% off everything! Don't miss out!"),
    ("ham", "Running late, be there in 10 minutes"),
    ("spam", "You owe $500 in taxes. Pay immediately or face legal action!"),
]

sms_df = pd.DataFrame(sms_data, columns=['label', 'message'])
sms_df.to_csv('data/sms_spam_sample.csv', index=False)

sms_label_counts = sms_df['label'].value_counts()
print("SMS Spam Dataset:", sms_df, sep="\n")
print(f"\nDataset shape: {sms_df.shape}")
print(f"Label distribution:\n{sms_label_counts}")

# 2. Sample Movie Reviews Dataset -- (sentiment, review) pairs
movie_reviews = [
    ("positive", "This movie was absolutely fantastic! Great acting and storyline."),
    ("negative", "Boring and predictable. Waste of time and money."),
    ("positive", "Brilliant cinematography and outstanding performances."),
    ("negative", "Poor script and terrible direction. Very disappointed."),
    ("positive", "Highly recommend! One of the best films this year."),
    ("negative", "Confusing plot and weak character development."),
    ("positive", "Amazing visual effects and compelling story."),
    ("negative", "Overrated and underwhelming. Expected much more."),
    ("positive", "Perfect blend of comedy and drama. Loved every minute!"),
    ("negative", "Slow paced and lacks substance. Not worth watching."),
]

reviews_df = pd.DataFrame(movie_reviews, columns=['sentiment', 'review'])
reviews_df.to_csv('data/movie_reviews_sample.csv', index=False)

review_sentiment_counts = reviews_df['sentiment'].value_counts()
print("\nMovie Reviews Dataset:", reviews_df, sep="\n")
print(f"\nDataset shape: {reviews_df.shape}")
print(f"Sentiment distribution:\n{review_sentiment_counts}")

# 3. Sample News Headlines Dataset -- unlabeled headlines in a single column
news_headlines = [
    "Apple announces new iPhone with revolutionary camera technology",
    "Stock market reaches all-time high amid economic recovery",
    "Scientists discover new species in Amazon rainforest",
    "Local restaurant wins prestigious culinary award",
    "Tech company Microsoft invests in renewable energy projects",
    "Weather forecast predicts heavy rainfall this weekend",
    "University researchers develop breakthrough medical treatment",
    "Amazon expands delivery services to rural areas",
    "New study reveals benefits of regular exercise",
    "Government announces new environmental protection policies",
]

news_df = pd.DataFrame(news_headlines, columns=['headline'])
news_df.to_csv('data/news_headlines_sample.csv', index=False)

print("\nNews Headlines Dataset:", news_df, sep="\n")
print(f"\nDataset shape: {news_df.shape}")

print("\n✅ Sample datasets created and saved to 'data/' directory!")

## Section 4: Quick Exercise - Your First NLP Task

Let's practice with a simple exercise to get you comfortable with basic text processing.

In [None]:
# Exercise 1: Text Statistics and Basic Analysis
# 
# Task: Write a function that takes a text string and returns:
# 1. Number of words
# 2. Number of sentences (assume sentences end with '.', '!', or '?')
# 3. Average word length
# 4. Most frequent word
# 5. Number of unique words

def analyze_text(text: str) -> dict:
    """
    Analyze basic statistics of a text string (exercise stub).

    This function is intentionally left unimplemented for you to complete.
    Until then its body is only ``pass``, so calling it returns ``None``.

    Args:
        text (str): Input text to analyze

    Returns:
        dict: Dictionary containing text statistics. The exercise asks for
            the number of words, the number of sentences (assumed to end
            with '.', '!' or '?'), the average word length, the most
            frequent word and the number of unique words.
    """
    # TODO: Implement this function
    # Hint: Use string methods, regular expressions, and Counter
    
    # Your code here
    pass

# Test your function with this sample text
# (written line by line so the leading newline and the trailing space after
# "intelligence." are visible instead of hidden inside a triple-quoted block)
sample_text = (
    "\n"
    "Natural Language Processing is a fascinating field of artificial intelligence. \n"
    "It combines computational linguistics with machine learning and deep learning.\n"
    "NLP enables computers to understand, interpret, and generate human language in a valuable way!\n"
)

# Uncomment and run when you've implemented the function
# result = analyze_text(sample_text)
# print("Text Analysis Results:")
# for key, value in result.items():
#     print(f"{key}: {value}")

# Solution (run this cell to see the solution)
def analyze_text_solution(text):
    """Reference solution: compute basic statistics for ``text``.

    Words are runs of word characters in the lower-cased text, so punctuation
    is dropped and a contraction such as "it's" yields "it" and "s". Sentences
    are counted as runs of '.', '!' or '?', which means "?!" or "..." counts
    once.

    Args:
        text (str): Input text to analyze

    Returns:
        dict: ``word_count``, ``sentence_count``, ``average_word_length``
            (rounded to 2 decimals), ``most_frequent_word`` (formatted as
            ``"<word> (<n> times)"``) and ``unique_words``. Text without any
            words yields zero counts and ``" (0 times)"``.
    """
    import re
    from collections import Counter

    tokens = re.findall(r'\b\w+\b', text.lower())
    sentence_endings = re.findall(r'[.!?]+', text)

    total_words = len(tokens)
    if total_words:
        mean_length = sum(map(len, tokens)) / total_words
        # Ties go to the word that appears first in the text
        top_word, top_count = Counter(tokens).most_common(1)[0]
    else:
        # No words at all: avoid dividing by zero and indexing an empty list
        mean_length = 0
        top_word, top_count = "", 0

    return {
        "word_count": total_words,
        "sentence_count": len(sentence_endings),
        "average_word_length": round(mean_length, 2),
        "most_frequent_word": f"{top_word} ({top_count} times)",
        "unique_words": len(set(tokens)),
    }

# Run the reference solution on the sample paragraph
result = analyze_text_solution(sample_text)
print("📊 Text Analysis Results:")
print("=" * 30)
for stat_name, stat_value in result.items():
    # Turn a key such as "word_count" into the label "Word Count"
    label = stat_name.replace('_', ' ').title()
    print(f"{label}: {stat_value}")

# Challenge: Try your function on the movie reviews dataset!
print("\n🎬 Analyzing movie reviews...")
first_reviews = reviews_df['review'].head(3)
for i, review in enumerate(first_reviews, 1):
    print(f"\nReview {i}:")
    stats = analyze_text_solution(review)
    print(f"Words: {stats['word_count']}, Unique: {stats['unique_words']}, Avg length: {stats['average_word_length']}")