In [None]:
# Import spaCy and load the language library
# (spacy.load needs the 'en_core_web_sm' model package to be installed already)
import spacy
nlp = spacy.load('en_core_web_sm')

# Create a Doc object by passing the raw text through the loaded pipeline
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

# Print each token separately: its text, its coarse part-of-speech tag and
# its syntactic dependency label
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN dobj
startup VERB advcl
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [None]:
# Inspect the loaded pipeline: a list of (name, component object) tuples
nlp.pipeline


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7e0d6e29ae10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7e0d6e29ba10>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7e0e4c7cb3e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7e0d6dfee050>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7e0d6e02d3d0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7e0e4c746c70>)]

In [None]:
# Just the component names, without the component objects
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# A second Doc: in the printed result the "n't" of the contraction and the run
# of extra spaces each appear as their own token
doc2 = nlp(u"Tesla isn't   looking into startups anymore.")

# Per-token dump: text, part-of-speech tag, dependency label
for token in doc2:
    print(token.text, token.pos_, token.dep_)


Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE dep
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [None]:
# Displaying the Doc shows its original text, extra whitespace included
doc2

Tesla isn't   looking into startups anymore.

In [None]:
# Integer indexing into a Doc returns a single token (here the first one)
doc2[0]

Tesla

In [None]:
# Confirm that calling nlp() on text produced a spaCy Doc object
type(doc2)

spacy.tokens.doc.Doc

In [None]:
# Part-of-speech tag of the first token, as a string
doc2[0].pos_

'PROPN'

In [None]:
# Dependency label of the first token, as a string
doc2[0].dep_

'nsubj'

In [None]:
# spacy.explain turns a tag abbreviation into a human-readable description
spacy.explain('PROPN')

'proper noun'

In [None]:
# The same lookup works for dependency labels
spacy.explain('nsubj')

'nominal subject'

In [None]:
# Lemmas (the base form of the word):
# index 4 is 'looking' because the whitespace run occupies token index 3
print(doc2[4].text)
print(doc2[4].lemma_)

looking
look


In [None]:
# Simple Parts-of-Speech & Detailed Tags:
# pos_ holds the simple tag; tag_ holds the detailed tag, which spacy.explain
# expands into a readable description
print(doc2[4].pos_)
print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))

VERB
VBG / verb, gerund or present participle


In [None]:
# Word Shapes:
# NOTE: the second line reads from `doc` (the first example sentence), not
# `doc2`; token 5 of that sentence is 'U.S.'
print(doc2[0].text+': '+doc2[0].shape_)
print(doc[5].text+' : '+doc[5].shape_)

Tesla: Xxxxx
U.S. : X.X.


In [None]:
# Boolean Values:
# is_alpha and is_stop are boolean attributes of a token; for 'Tesla' they
# evaluate to True and False respectively
print(doc2[0].is_alpha)
print(doc2[0].is_stop)

True
False


In [None]:
# Build a longer Doc; the trailing backslashes continue the string literal
# across source lines without inserting newlines into the text.
# NOTE(review): 'commmonly' is misspelled in the sample text; correcting it
# should not shift the slice below, since it is a single word either way.
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')
# Slicing a Doc yields a Span; tokens 16..29 are the quoted sentence together
# with its opening and closing quotation marks
life_quote = doc3[16:30]
print(life_quote)
type(life_quote)

"Life is what happens to us while we are making other plans"


spacy.tokens.span.Span

In [None]:
# Sentence segmentation: iterating doc4.sents yields one sentence at a time
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [None]:
# Token 6 is the 'This' that opens the second sentence, so the flag is True
doc4[6].is_sent_start

True

NLP

In [None]:
# Install required packages (run this cell if packages are not already installed)
import subprocess
import sys

def install_package(package):
    """Install a package using pip in the interpreter running this notebook.

    Args:
        package (str): Requirement handed to ``pip install``, for example
            ``"nltk==3.8.1"``.

    Returns:
        bool: True when pip exits successfully, False when pip exits with a
            non-zero status. A status line is printed in both cases.
    """
    # sys.executable targets the same Python that is executing this code, so
    # the package lands in the environment the kernel actually imports from.
    command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError:
        print(f"✗ Failed to install {package}")
        return False
    print(f"✓ Successfully installed {package}")
    return True

# Core NLP packages (every requirement is pinned to an exact version)
packages = [
    "nltk==3.8.1",
    "spacy==3.7.2",
    "scikit-learn==1.3.2",
    "transformers==4.35.2",
    "datasets==2.14.6",
    "torch==2.1.1",
    "pandas==2.0.3",
    "matplotlib==3.7.2",
    "seaborn==0.12.2",
    "wordcloud==1.9.2",
    "textblob==0.17.1"
]

# Uncomment the next lines to install packages
# for package in packages:
#     install_package(package)

# These messages print unconditionally: while the loop above stays commented
# out, this cell installs nothing.
print("Package installation section complete!")
print("Note: Uncomment the installation loop above if you need to install packages.")


Package installation section complete!
Note: Uncomment the installation loop above if you need to install packages.


In [None]:
# Verify imports and download NLTK data
import nltk
import spacy
import sklearn
import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
warnings.filterwarnings('ignore')

print("✓ All core packages imported successfully!")

# Download required NLTK data
nltk_downloads = [
    'punkt',
    'stopwords',
    'vader_lexicon',
    'wordnet',
    'averaged_perceptron_tagger',
    'omw-1.4'
]

print("\nDownloading NLTK data...")
for data in nltk_downloads:
    # nltk.download() reports a failed download by returning False rather
    # than raising, so the return value decides which status line is shown.
    # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
    # working while still treating unexpected errors as a failed download.
    try:
        downloaded = nltk.download(data, quiet=True)
    except Exception:
        downloaded = False

    if downloaded:
        print(f"✓ Downloaded {data}")
    else:
        print(f"✗ Failed to download {data}")

print("\n🎉 Environment setup complete!")


✓ All core packages imported successfully!

Downloading NLTK data...
✓ Downloaded punkt
✓ Downloaded stopwords
✓ Downloaded vader_lexicon
✓ Downloaded wordnet
✓ Downloaded averaged_perceptron_tagger
✓ Downloaded omw-1.4

🎉 Environment setup complete!


In [None]:
# Let's start with a simple example to demonstrate basic NLP concepts

# Sample text data
sample_texts = [
    "I love this product! It's amazing and works perfectly.",
    "This is the worst purchase I've ever made. Terrible quality!",
    "The product is okay, nothing special but gets the job done.",
    "Absolutely fantastic! Would recommend to everyone.",
    "Not bad, could be better but decent for the price."
]

print("Sample Texts:")
print("=" * 50)
for i, text in enumerate(sample_texts, 1):
    print(f"{i}. {text}")

# Basic text analysis (the character total is computed once and reused)
total_chars = sum(len(text) for text in sample_texts)
print("\nBasic Statistics:")
print(f"Number of texts: {len(sample_texts)}")
print(f"Total characters: {total_chars}")
print(f"Average text length: {total_chars / len(sample_texts):.1f} characters")

# Word frequency analysis
all_words = []
for text in sample_texts:
    # Simple tokenization: lower-case the text and keep runs of word
    # characters. Punctuation is dropped, and an apostrophe splits a word,
    # so "It's" contributes "it" and "s".
    words = re.findall(r'\b\w+\b', text.lower())
    all_words.extend(words)

word_freq = Counter(all_words)
print("\nTop 10 Most Common Words:")
for word, count in word_freq.most_common(10):
    print(f"{word}: {count}")

# Basic sentiment indicators
positive_words = ['love', 'amazing', 'perfectly', 'fantastic', 'recommend']
negative_words = ['worst', 'terrible', 'bad']


def count_sentiment_words(text, lexicon):
    """Count how many lexicon entries occur in ``text`` as whole words.

    Matching is done against the text's lower-cased tokens rather than with a
    substring test, so 'bad' does not fire on "badge" and 'love' does not
    fire on "glove".

    Args:
        text (str): Text to inspect.
        lexicon (list[str]): Lower-case single words to look for.

    Returns:
        int: Number of lexicon entries present in the text (each entry is
            counted at most once).
    """
    tokens = set(re.findall(r'\b\w+\b', text.lower()))
    return sum(1 for entry in lexicon if entry in tokens)


print("\nBasic Sentiment Analysis:")
for i, text in enumerate(sample_texts, 1):
    pos_count = count_sentiment_words(text, positive_words)
    neg_count = count_sentiment_words(text, negative_words)

    # Keyword counting only: negation is not handled, so "Not bad" still
    # scores as a negative hit.
    if pos_count > neg_count:
        sentiment = "Positive"
    elif neg_count > pos_count:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    print(f"Text {i}: {sentiment} (pos: {pos_count}, neg: {neg_count})")


Sample Texts:
1. I love this product! It's amazing and works perfectly.
2. This is the worst purchase I've ever made. Terrible quality!
3. The product is okay, nothing special but gets the job done.
4. Absolutely fantastic! Would recommend to everyone.
5. Not bad, could be better but decent for the price.

Basic Statistics:
Number of texts: 5
Total characters: 273
Average text length: 54.6 characters

Top 10 Most Common Words:
the: 4
i: 2
this: 2
product: 2
is: 2
but: 2
love: 1
it: 1
s: 1
amazing: 1

Basic Sentiment Analysis:
Text 1: Positive (pos: 3, neg: 0)
Text 2: Negative (pos: 0, neg: 2)
Text 3: Neutral (pos: 0, neg: 0)
Text 4: Positive (pos: 2, neg: 0)
Text 5: Negative (pos: 0, neg: 1)


In [None]:
# Create sample datasets for the course
import os

# Make sure the output folder exists (no error if it is already there)
os.makedirs('data', exist_ok=True)

# ---------------------------------------------------------------------------
# 1. Sample SMS Spam Dataset: (label, message) pairs, alternating ham / spam
# ---------------------------------------------------------------------------
sms_data = [
    ("ham", "Hey, are we still on for lunch today?"),
    ("spam", "URGENT! You've won $1000! Click here now!"),
    ("ham", "Can you pick up milk on your way home?"),
    ("spam", "FREE iPhone! Limited time offer! Call now!"),
    ("ham", "Meeting moved to 3pm tomorrow"),
    ("spam", "Congratulations! You've been selected for a special offer!"),
    ("ham", "Thanks for the birthday wishes!"),
    ("spam", "SALE ALERT: 90% off everything! Don't miss out!"),
    ("ham", "Running late, be there in 10 minutes"),
    ("spam", "You owe $500 in taxes. Pay immediately or face legal action!"),
]

sms_df = pd.DataFrame(sms_data, columns=['label', 'message'])
sms_df.to_csv('data/sms_spam_sample.csv', index=False)

print("SMS Spam Dataset:", sms_df, sep="\n")
print(f"\nDataset shape: {sms_df.shape}")
print("Label distribution:", sms_df['label'].value_counts(), sep="\n")

# ---------------------------------------------------------------------------
# 2. Sample Movie Reviews Dataset: (sentiment, review) pairs
# ---------------------------------------------------------------------------
movie_reviews = [
    ("positive", "This movie was absolutely fantastic! Great acting and storyline."),
    ("negative", "Boring and predictable. Waste of time and money."),
    ("positive", "Brilliant cinematography and outstanding performances."),
    ("negative", "Poor script and terrible direction. Very disappointed."),
    ("positive", "Highly recommend! One of the best films this year."),
    ("negative", "Confusing plot and weak character development."),
    ("positive", "Amazing visual effects and compelling story."),
    ("negative", "Overrated and underwhelming. Expected much more."),
    ("positive", "Perfect blend of comedy and drama. Loved every minute!"),
    ("negative", "Slow paced and lacks substance. Not worth watching."),
]

reviews_df = pd.DataFrame(movie_reviews, columns=['sentiment', 'review'])
reviews_df.to_csv('data/movie_reviews_sample.csv', index=False)

print("\nMovie Reviews Dataset:", reviews_df, sep="\n")
print(f"\nDataset shape: {reviews_df.shape}")
print("Sentiment distribution:", reviews_df['sentiment'].value_counts(), sep="\n")

# ---------------------------------------------------------------------------
# 3. Sample News Headlines Dataset: unlabeled text, one headline per row
# ---------------------------------------------------------------------------
news_headlines = [
    "Apple announces new iPhone with revolutionary camera technology",
    "Stock market reaches all-time high amid economic recovery",
    "Scientists discover new species in Amazon rainforest",
    "Local restaurant wins prestigious culinary award",
    "Tech company Microsoft invests in renewable energy projects",
    "Weather forecast predicts heavy rainfall this weekend",
    "University researchers develop breakthrough medical treatment",
    "Amazon expands delivery services to rural areas",
    "New study reveals benefits of regular exercise",
    "Government announces new environmental protection policies",
]

news_df = pd.DataFrame(news_headlines, columns=['headline'])
news_df.to_csv('data/news_headlines_sample.csv', index=False)

print("\nNews Headlines Dataset:", news_df, sep="\n")
print(f"\nDataset shape: {news_df.shape}")

print("\n✅ Sample datasets created and saved to 'data/' directory!")


SMS Spam Dataset:
  label                                            message
0   ham              Hey, are we still on for lunch today?
1  spam          URGENT! You've won $1000! Click here now!
2   ham             Can you pick up milk on your way home?
3  spam         FREE iPhone! Limited time offer! Call now!
4   ham                      Meeting moved to 3pm tomorrow
5  spam  Congratulations! You've been selected for a sp...
6   ham                    Thanks for the birthday wishes!
7  spam    SALE ALERT: 90% off everything! Don't miss out!
8   ham               Running late, be there in 10 minutes
9  spam  You owe $500 in taxes. Pay immediately or face...

Dataset shape: (10, 2)
Label distribution:
label
ham     5
spam    5
Name: count, dtype: int64

Movie Reviews Dataset:
  sentiment                                             review
0  positive  This movie was absolutely fantastic! Great act...
1  negative   Boring and predictable. Waste of time and money.
2  positive  Brilliant c

In [None]:
# Exercise 1: Text Statistics and Basic Analysis
#
# Task: Write a function that takes a text string and returns:
# 1. Number of words
# 2. Number of sentences (assume sentences end with '.', '!', or '?')
# 3. Average word length
# 4. Most frequent word
# 5. Number of unique words

def analyze_text(text):
    """
    Analyze basic statistics of a text string.

    Exercise stub: the body is intentionally left unimplemented, so calling
    it as-is returns None.

    Args:
        text (str): Input text to analyze

    Returns:
        dict: Dictionary containing text statistics (per the exercise task:
            word count, sentence count, average word length, most frequent
            word and number of unique words)
    """
    # TODO: Implement this function
    # Hint: Use string methods, regular expressions, and Counter

    # Your code here
    pass

# Test your function with this sample text
# (triple-quoted, so the string starts and ends with a newline and holds
# three sentences on three separate lines)
sample_text = """
Natural Language Processing is a fascinating field of artificial intelligence.
It combines computational linguistics with machine learning and deep learning.
NLP enables computers to understand, interpret, and generate human language in a valuable way!
"""

# Uncomment and run when you've implemented the function
# result = analyze_text(sample_text)
# print("Text Analysis Results:")
# for key, value in result.items():
#     print(f"{key}: {value}")

# Solution (run this cell to see the solution)
def analyze_text_solution(text):
    """Complete solution for text analysis function.

    Words are lower-cased runs of word characters; an apostrophe between word
    characters stays inside the word, so "don't" is one word rather than
    "don" plus "t". Sentences are the pieces left after splitting on runs of
    '.', '!' or '?' that contain at least one word character, so a final
    sentence without closing punctuation is still counted.

    Args:
        text (str): Input text to analyze.

    Returns:
        dict: Text statistics with the keys
            "word_count" (int),
            "sentence_count" (int),
            "average_word_length" (rounded to 2 decimals; 0 when there are
            no words),
            "most_frequent_word" (str formatted as "<word> (<n> times)";
            ties go to the word seen first),
            "unique_words" (int).
    """
    import re
    from collections import Counter

    # Tokenize: keep contractions such as "it's" together instead of
    # breaking them at the apostrophe (straight or typographic).
    words = re.findall(r"\w+(?:['’]\w+)*", text.lower())

    # Count sentences as non-empty segments between terminator runs, which
    # also credits a trailing sentence that has no '.', '!' or '?'.
    segments = re.split(r'[.!?]+', text)
    num_sentences = sum(1 for segment in segments if re.search(r'\w', segment))

    # Calculate statistics
    num_words = len(words)
    avg_word_length = sum(len(word) for word in words) / num_words if num_words > 0 else 0

    # Find most frequent word (placeholder pair for empty input)
    word_freq = Counter(words)
    most_frequent = word_freq.most_common(1)[0] if words else ("", 0)

    # Count unique words
    unique_words = len(set(words))

    return {
        "word_count": num_words,
        "sentence_count": num_sentences,
        "average_word_length": round(avg_word_length, 2),
        "most_frequent_word": f"{most_frequent[0]} ({most_frequent[1]} times)",
        "unique_words": unique_words
    }

# Run the reference solution on the sample paragraph and print every
# statistic with a title-cased label ("word_count" -> "Word Count")
result = analyze_text_solution(sample_text)
print("📊 Text Analysis Results:")
print("=" * 30)
for key, value in result.items():
    label = key.replace('_', ' ').title()
    print(f"{label}: {value}")

# Challenge: Try your function on the movie reviews dataset!
# Only the first three reviews are analyzed; numbering starts at 1.
print("\n🎬 Analyzing movie reviews...")
for i, review in enumerate(reviews_df['review'].head(3), 1):
    print(f"\nReview {i}:")
    stats = analyze_text_solution(review)
    print(
        f"Words: {stats['word_count']}, "
        f"Unique: {stats['unique_words']}, "
        f"Avg length: {stats['average_word_length']}"
    )


📊 Text Analysis Results:
Word Count: 34
Sentence Count: 3
Average Word Length: 6.29
Most Frequent Word: language (2 times)
Unique Words: 30

🎬 Analyzing movie reviews...

Review 1:
Words: 9, Unique: 9, Avg length: 6.0

Review 2:
Words: 8, Unique: 7, Avg length: 4.88

Review 3:
Words: 5, Unique: 5, Avg length: 9.8
