In [None]:
# NER Modeling with spaCy and NLTK

## Step 1: Setup and Installation
!pip install nltk spacy matplotlib pandas
!python3 -m spacy download en_core_web_sm


In [2]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd


In [14]:
# Download the NLTK data packages used later in the notebook and load the
# spaCy English pipeline that the spaCy-based steps share as `nlp`.
for resource in (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource)

nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
## Step 2: Named Entity Recognition with spaCy
from spacy import displacy

sample_text = "Barack Obama was born in Hawaii. He was the 44th President of the United States. Apple Inc. is a technology company headquartered in Cupertino."

# Run the loaded spaCy pipeline over the sample text.
print("\nProcessing Text with spaCy:")
doc = nlp(sample_text)

# Print every detected entity span followed by its label.
print("\nExtracted Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Render the same entities highlighted inline in the notebook output.
displacy.render(doc, style="ent", jupyter=True)



Processing Text with spaCy:

Extracted Named Entities:
Barack Obama (PERSON)
Hawaii (GPE)
44th (ORDINAL)
the United States (GPE)
Apple Inc. (ORG)
Cupertino (GPE)


In [15]:
## Step 3: NER with NLTK
# NLTK provides a pre-trained named-entity chunker (ne_chunk) that operates on POS-tagged tokens
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

# Split the sample into tokens and attach a part-of-speech tag to each one.
nltk_tokens = word_tokenize(sample_text)
pos_tags = pos_tag(nltk_tokens)

# Chunk the tagged tokens and report every labelled chunk.
print("\nNamed Entity Chunking with NLTK:")
ne_tree = ne_chunk(pos_tags)
for subtree in ne_tree:
    # Children that are not Tree nodes carry no entity label; skip them.
    if not isinstance(subtree, Tree):
        continue
    entity_name = " ".join(token for token, _ in subtree.leaves())
    entity_type = subtree.label()
    print(f"{entity_name} ({entity_type})")


Named Entity Chunking with NLTK:
Barack (PERSON)
Obama (PERSON)
Hawaii (GPE)
United States (GPE)
Apple Inc. (PERSON)
Cupertino (GPE)


In [None]:
## Step 4: Custom NER Training with spaCy
# Task 1: Prepare training data and train a custom NER model
# Enhanced Training Function for Custom NER Model
def train_custom_ner_model(train_data, n_iter=30, drop=0.2, batch_size=2, seed=0):
    """Train a blank English spaCy pipeline containing only an NER component.

    Args:
        train_data: Sequence of ``(text, {"entities": [(start, end, label), ...]})``
            pairs, where ``start``/``end`` are character offsets into ``text``.
        n_iter: Number of passes over the training examples.
        drop: Dropout rate passed to each update step.
        batch_size: Number of examples per update step.
        seed: Seed used for weight initialisation and for shuffling, so repeated
            runs on the same data behave the same way.

    Returns:
        The trained ``spacy.Language`` pipeline.

    Raises:
        ValueError: If ``train_data`` is empty (there is nothing to initialise
            or train the NER component with).
    """
    import random

    from spacy.training.example import Example
    from spacy.util import fix_random_seed, minibatch

    if not train_data:
        raise ValueError("train_data must contain at least one annotated example")

    fix_random_seed(seed)
    shuffler = random.Random(seed)  # private RNG: leaves the global random state alone

    ner_model = spacy.blank("en")  # Create a blank model
    ner = ner_model.add_pipe("ner", last=True)  # Add NER pipe

    # Register every label that occurs in the annotations before initialising,
    # so the component's output layer is sized for all of them.
    for _text, annotations in train_data:
        for _start, _end, label in annotations.get("entities", []):
            ner.add_label(label)

    # Pair each tokenised text with its gold annotations.
    examples = [
        Example.from_dict(ner_model.make_doc(text), annotations)
        for text, annotations in train_data
    ]

    # initialize() sets up the model weights and returns the optimizer to train with.
    optimizer = ner_model.initialize(lambda: examples)

    for _ in range(n_iter):
        shuffler.shuffle(examples)  # new example order on every pass
        for batch in minibatch(examples, size=batch_size):
            ner_model.update(batch, sgd=optimizer, drop=drop)

    return ner_model

# Annotated training sentences. Each entity is (start, end, label), where
# text[start:end] must be exactly the entity's surface text (end is exclusive).
# Offsets that do not line up with the text cannot be matched to tokens.
TRAIN_DATA = [
    # "Google", "Mountain View"
    ("Google is a tech company in Mountain View.", {"entities": [(0, 6, "ORG"), (28, 41, "GPE")]}),
    # "Elon Musk", "SpaceX"
    ("Elon Musk is the CEO of SpaceX.", {"entities": [(0, 9, "PERSON"), (24, 30, "ORG")]}),
    # "Eiffel Tower", "Paris"
    ("The Eiffel Tower is in Paris.", {"entities": [(4, 16, "LOC"), (23, 28, "GPE")]}),
    # "Tesla", "Palo Alto"
    ("Tesla's headquarters is in Palo Alto.", {"entities": [(0, 5, "ORG"), (27, 36, "GPE")]}),
    # "Amazon", "Seattle"
    ("Amazon is hiring engineers in Seattle.", {"entities": [(0, 6, "ORG"), (30, 37, "GPE")]}),
]

# Fit the custom NER pipeline on the annotated sentences.
print("\nTraining Custom NER Model:")
custom_ner_model = train_custom_ner_model(TRAIN_DATA)

# Write the pipeline to disk, then read it back so the later test cell
# exercises the saved copy rather than the in-memory object.
model_path = "custom_ner_model"
custom_ner_model.to_disk(model_path)
print("Custom NER model saved.")

loaded_model = spacy.load(model_path)

def test_custom_ner_model(model, test_texts):
    results = {}
    for text in test_texts:
        doc = model(text)
        results[text] = [(ent.text, ent.label_) for ent in doc.ents]
    return results

# Sentences used to check the reloaded model.
test_sentences = [
    "Tesla is building new factories in Texas.",
    "Elon Musk is visiting Berlin.",
    "Amazon's revenue grew in Seattle.",
    "The Eiffel Tower is a popular tourist spot.",
]

print("\nTesting Custom NER Model:")
test_results = test_custom_ner_model(loaded_model, test_sentences)

# Report each sentence followed by the entities found in it.
for sentence in test_results:
    entities = test_results[sentence]
    print(f"Sentence: {sentence}")
    print(f"Extracted Entities: {entities}")


In [None]:
## Step 5: Text Preprocessing for Sentiment Analysis
# Task 2: Write a function that preprocesses text by lemmatizing it and removing stopwords
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words and non-alphabetic tokens.

    Uses the notebook-level ``nlp`` pipeline for tokenisation and lemmas, and
    the notebook-level ``stop_words`` set for filtering.

    Args:
        text: Raw input string.

    Returns:
        The kept lemmas, lower-cased and joined by single spaces. The result is
        an empty string when every token is filtered out.
    """
    doc = nlp(text)
    tokens = []
    for token in doc:
        # Punctuation, numbers and whitespace tokens are not alphabetic.
        if not token.is_alpha:
            continue
        lemma = token.lemma_.lower()
        # Check both the surface form and the lemma, since either may be a stop word.
        # NOTE(review): if the stop list contains negations such as "not", they are
        # removed here as well, which can flip the apparent polarity of a sentence.
        if token.text.lower() in stop_words or lemma in stop_words:
            continue
        tokens.append(lemma)
    return " ".join(tokens)

# Example sentences to run through the preprocessing step.
text_data = [
    "I love programming in Python!",
    "The weather today is terrible.",
    "I had an amazing time at the concert last night!",
    "The movie was not good at all.",
]

# One preprocessed string per input sentence, in the same order.
preprocessed_texts = list(map(preprocess_text, text_data))

print("\nPreprocessed Texts:")
print(preprocessed_texts)

In [None]:
## Step 6: Word Frequency Analysis
# Task 3: Analyze word frequency and plot results
def analyze_word_frequency(texts):
    """Count how often each whitespace-separated word occurs across ``texts``.

    Args:
        texts: Iterable of strings (for example the preprocessed sentences).

    Returns:
        A ``pandas.Series`` indexed by word with integer counts, sorted from most
        to least frequent. Words with equal counts keep the order in which they
        were first seen. The Series is empty when there are no words.
    """
    from collections import Counter

    counts = Counter(word for text in texts for word in text.split())
    ranked = counts.most_common()  # descending by count, ties in first-seen order
    word_freq = pd.Series(
        [count for _, count in ranked],
        index=[word for word, _ in ranked],
        dtype="int64",
    )
    return word_freq

word_freq = analyze_word_frequency(preprocessed_texts)

print("\nWord Frequency Analysis:")
print(word_freq.head())

# Bar chart of the ten most frequent words, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
word_freq.head(10).plot(kind="bar", color="skyblue", ax=ax)
ax.set_title("Top 10 Frequent Words")
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
## Step 7: Preparing for Twitter Sentiment Analysis
# Task 4: Write a function that cleans tweets by removing mentions, links, and special characters, then lowercasing the text and stripping surrounding whitespace
import re

def clean_tweet(tweet):
    """Normalise a tweet for simple word-level analysis.

    Removes links, @mentions and special characters (punctuation, '#', emoji),
    collapses whitespace, lower-cases the text and strips the ends. The word
    after a '#' is kept; only the symbol is removed.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned, lower-cased tweet. The result may be an empty string.
    """
    # Links go first so that '@' or '#' inside a URL is not treated as a mention or tag.
    tweet = re.sub(r"https?://\S+|www\.\S+", " ", tweet)
    tweet = re.sub(r"@\w+", " ", tweet)
    # Drop apostrophes without inserting a space so "don't" becomes "dont", not "don t".
    tweet = re.sub(r"['’]", "", tweet)
    # Every remaining non-word, non-space character (and '_') becomes a space.
    tweet = re.sub(r"[^\w\s]|_", " ", tweet)
    tweet = re.sub(r"\s+", " ", tweet)
    return tweet.lower().strip()

# Example tweets containing a mention, a link, hashtags and punctuation.
sample_tweets = [
    "@user I love the new design! Check it out: https://example.com",
    "This is the worst product ever. Total waste of money! #fail",
    "Great job by the team on the latest release! #innovation",
]

# One cleaned string per tweet, in the same order.
cleaned_tweets = list(map(clean_tweet, sample_tweets))

print("\nCleaned Tweets:")
print(cleaned_tweets)


In [None]:
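# Step 8: Extract and analyze bigrams (pairs of words) and trigrams (triplets of words) from the cleaned tweets to find common patterns or phrases.
# NOTE: the code in this cell only draws a word cloud of the cleaned tweets; the bigram/trigram extraction described above is not implemented here yet.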
from wordcloud import WordCloud

def generate_word_cloud(texts, title):
    """Draw a word cloud built from all of ``texts`` under the given ``title``.

    Args:
        texts: Iterable of strings; they are joined with spaces into one corpus.
        title: Text shown above the image.
    """
    corpus = " ".join(texts)
    cloud = WordCloud(width=800, height=400, background_color="white").generate(corpus)

    # Show the rendered cloud as an image with the axes hidden.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.set_title(title)
    ax.axis("off")
    plt.show()


generate_word_cloud(cleaned_tweets, "Word Cloud for Cleaned Tweets")


In [None]:
## Step 9: Simple Lexicon-Based Sentiment Scoring
# Task 6: Use a basic scoring function based on the presence of positive and negative words and print whether cleaned tweets are positive or negative
# Small hand-picked sentiment lexicons used by simple_sentiment_score.
positive_words = {"love", "great", "amazing", "awesome", "good"}
negative_words = {"worst", "bad", "terrible", "fail", "waste"}

def simple_sentiment_score(tweet):
    """Score a tweet by counting lexicon hits.

    Each occurrence of a word from ``positive_words`` adds 1 and each occurrence
    of a word from ``negative_words`` subtracts 1. Matching is case-insensitive
    and ignores punctuation, so it also works on tweets that were not cleaned.

    Args:
        tweet: Tweet text.

    Returns:
        Integer score: above 0 means positive, below 0 means negative, and 0
        means neutral or no lexicon words found.
    """
    # Turn every non-alphanumeric character into a space so "#fail" and "good!"
    # still match their lexicon entries.
    normalized = "".join(ch if ch.isalnum() else " " for ch in tweet.lower())
    score = 0
    for word in normalized.split():
        if word in positive_words:
            score += 1
        elif word in negative_words:
            score -= 1
    return score

# Score every cleaned tweet, keeping the results in tweet order.
sentiment_scores = list(map(simple_sentiment_score, cleaned_tweets))

print("\nSentiment Scores:")
print(sentiment_scores)
