**Tokenization, Cleaning, Stemming, and Lemmatization**

In [None]:
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK data quietly
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Initialize tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, fallback_split=True):
    """
    Advanced text preprocessing function.
    Includes tokenization, lowercasing, punctuation removal, stopword removal, stemming, and lemmatization.
    """
    # Tokenization
    try:
        tokens = word_tokenize(text)
    except LookupError:
        print("Punkt tokenizer not found, using simple split() instead.")
        tokens = text.split() if fallback_split else []

    # Lowercasing
    tokens = [word.lower() for word in tokens]

    # Punctuation cleaning
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # Keep only alphabetic and non-empty words
    words = [word for word in stripped if word.isalpha()]

    # Stopword removal
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming
    stemmed = [stemmer.stem(word) for word in words]

    # Lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in words]

    # Results
    return {
        'original_tokens': tokens,
        'cleaned_words': words,
        'stemmed': stemmed,
        'lemmatized': lemmatized
    }

# Sample text
text = """NLTK (Natural Language Toolkit) is developed by the University of Pennsylvania and MIT researchers
         including Steven Bird and Edward Loper. It's widely used in New York, London, and Tokyo for
         processing English, Spanish, and Japanese text. Apple and Google have also contributed to its development."""

# Process the text
results = preprocess_text(text)

# Print results
print("✅ Original Tokens:", results['original_tokens'])
print("✅ Cleaned Words:", results['cleaned_words'])
print("✅ Stemmed Words:", results['stemmed'])
print("✅ Lemmatized Words:", results['lemmatized'])


Punkt tokenizer not found, using simple split() instead.
✅ Original Tokens: ['nltk', '(natural', 'language', 'toolkit)', 'is', 'developed', 'by', 'the', 'university', 'of', 'pennsylvania', 'and', 'mit', 'researchers', 'including', 'steven', 'bird', 'and', 'edward', 'loper.', "it's", 'widely', 'used', 'in', 'new', 'york,', 'london,', 'and', 'tokyo', 'for', 'processing', 'english,', 'spanish,', 'and', 'japanese', 'text.', 'apple', 'and', 'google', 'have', 'also', 'contributed', 'to', 'its', 'development.']
✅ Cleaned Words: ['nltk', 'natural', 'language', 'toolkit', 'developed', 'university', 'pennsylvania', 'mit', 'researchers', 'including', 'steven', 'bird', 'edward', 'loper', 'widely', 'used', 'new', 'york', 'london', 'tokyo', 'processing', 'english', 'spanish', 'japanese', 'text', 'apple', 'google', 'also', 'contributed', 'development']
✅ Stemmed Words: ['nltk', 'natur', 'languag', 'toolkit', 'develop', 'univers', 'pennsylvania', 'mit', 'research', 'includ', 'steven', 'bird', 'edward'

**POS Tagging (Part-of-Speech Tagging)**

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk import pos_tag

# POS Tagging
pos_tags = pos_tag(results['original_tokens'])

print("✅ POS Tagged Tokens:")
for word, tag in pos_tags:
    print(f"{word} → {tag}")


✅ POS Tagged Tokens:
nltk → JJ
(natural → JJ
language → NN
toolkit) → NN
is → VBZ
developed → VBN
by → IN
the → DT
university → NN
of → IN
pennsylvania → NN
and → CC
mit → NN
researchers → NNS
including → VBG
steven → JJ
bird → NN
and → CC
edward → NN
loper. → NN
it's → NN
widely → RB
used → VBN
in → IN
new → JJ
york, → NN
london, → NN
and → CC
tokyo → NN
for → IN
processing → VBG
english, → JJ
spanish, → NN
and → CC
japanese → JJ
text. → NN
apple → NN
and → CC
google → NN
have → VBP
also → RB
contributed → VBN
to → TO
its → PRP$
development. → NN


**POS Tag Filtering**

**Filtering Only Nouns (NN), Verbs (VB), and Adjectives (JJ)**

In [None]:
# Allowed POS tag prefixes
allowed_tags_prefixes = ('NN', 'VB', 'JJ')

# Filtering
filtered_pos = [(word, tag) for word, tag in pos_tags if tag.startswith(allowed_tags_prefixes)]

# Extract only words
filtered_words = [word for word, tag in filtered_pos]

print("🔍 Filtered (Noun/Verb/Adj) words:")
print(filtered_words)


🔍 Filtered (Noun/Verb/Adj) words:
['nltk', '(natural', 'language', 'toolkit)', 'is', 'developed', 'university', 'pennsylvania', 'mit', 'researchers', 'including', 'steven', 'bird', 'edward', 'loper.', "it's", 'used', 'new', 'york,', 'london,', 'tokyo', 'processing', 'english,', 'spanish,', 'japanese', 'text.', 'apple', 'google', 'have', 'contributed', 'development.']





# **NER:Named Entity Recognition**

Automatically identifies and labels named entities in a text.
Person names — PERSON
ORG — Organizations
GPE — Geopolitical entities (countries, cities)
DATE — Dates
TIME — Times
MONEY — Monetary amounts
PRODUCT — Product names
LAW — Laws/regulations
EVENT — Events

The spaCy library was used.

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)

print("✅ spaCy Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} → {ent.label_}")


✅ spaCy Named Entities:
the University of Pennsylvania → ORG
MIT → ORG
Steven Bird → PERSON
Edward Loper → PERSON
New York → GPE
London → GPE
Tokyo → GPE
English → LANGUAGE
Spanish → NORP
Japanese → NORP
Apple → ORG
Google → ORG




**Chunking (Grammatical Phrases)**

Let's extract the sentence structure: subject, verb phrase, noun phrase, etc.

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

print("✅ Noun Phrases (spaCy):")
for chunk in doc.noun_chunks:
    print(f"NP → {chunk.text}")


✅ Noun Phrases (spaCy):
NP → NLTK
NP → (Natural Language Toolkit
NP → Pennsylvania
NP → MIT
NP → Steven Bird
NP → Edward Loper
NP → It
NP → New York
NP → London
NP → Tokyo
NP → English
NP → Spanish
NP → Japanese text
NP → Apple
NP → Google
NP → its development


# **3. Text Mining / Meaning Extraction Section.**
Here, we extract statistical information from texts.**


**1- TF-IDF stands for "Term Frequency – Inverse Document Frequency."**

This method is used to quantitatively measure how important a word is within a document.
It is especially common in text mining and information extraction.

Why is it used?

To find keywords

To create vectors before text classification

To rank in search engines

Feature extraction in machine learning models


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join cleaned words
cleaned_text = " ".join(results['cleaned_words'])

# Create single-document list (TF-IDF expects this structure)
documents = [cleaned_text]

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]

# Print top 5 words with highest scores
top_scores = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)[:5]

print("🔷 Top 5 Important Words (TF-IDF):")
for word, score in top_scores:
    print(f"{word} → {score:.4f}")


🔷 Top 5 Important Words (TF-IDF):
also → 0.1826
apple → 0.1826
bird → 0.1826
contributed → 0.1826
developed → 0.1826


**2-) N-gram Analysis (Bigram / Trigram)**

We will find the most frequent 2-word (bigram) or 3-word (trigram) word groups.

In [None]:
from nltk.util import ngrams
from collections import Counter

# Get preprocessed words
clean_words = results['cleaned_words']  # tokenized and cleaned words

# Bigram (2-word groups)
bigrams = list(ngrams(clean_words, 2))
bigram_counts = Counter(bigrams)

# Trigram (3-word groups)
trigrams = list(ngrams(clean_words, 3))
trigram_counts = Counter(trigrams)

# Print top 5 most frequent bigrams and trigrams
print("🔹 Top 5 Most Frequent Bigrams:")
for bg, count in bigram_counts.most_common(5):
    print(f"{bg} → {count} times")

print("\n🔹 Top 5 Most Frequent Trigrams:")
for tg, count in trigram_counts.most_common(5):
    print(f"{tg} → {count} times")


🔹 Top 5 Most Frequent Bigrams:
('nltk', 'natural') → 1 times
('natural', 'language') → 1 times
('language', 'toolkit') → 1 times
('toolkit', 'developed') → 1 times
('developed', 'university') → 1 times

🔹 Top 5 Most Frequent Trigrams:
('nltk', 'natural', 'language') → 1 times
('natural', 'language', 'toolkit') → 1 times
('language', 'toolkit', 'developed') → 1 times
('toolkit', 'developed', 'university') → 1 times
('developed', 'university', 'pennsylvania') → 1 times


**3-Word Frequency:**

Topic Detection: The most frequent words indicate the main theme of the text.

✅ Preliminary Analysis: Useful for getting an initial understanding of the dataset.

✅ Feature Extraction: Frequently occurring words can be used as features in machine learning models.

✅ Identifying Unnecessary Words: Helps filter out overly repeated but meaningless words.

In [None]:
from collections import Counter

# Cleaned words
clean_words = results['cleaned_words']

# Calculate word frequency
word_counts = Counter(clean_words)

# Get top 10 most frequent words
most_common_words = word_counts.most_common(10)

print("🔸 Top 10 Most Used Words:")
for word, count in most_common_words:
    print(f"{word} → {count} times")


🔸 Top 10 Most Used Words:
nltk → 1 times
natural → 1 times
language → 1 times
toolkit → 1 times
developed → 1 times
university → 1 times
pennsylvania → 1 times
mit → 1 times
researchers → 1 times
including → 1 times


**4-Concordance:**

Shows the occurrences of a word within the text, along with a few words before and after it.
This way, you can understand:

-In what sense is the word used?

-In which contexts does it appear?

-What are the differences in discourse style, tone, and usage?



In [None]:
from nltk.text import Text

# 1. Use the previously tokenized original words
original_tokens = results['original_tokens']

# 2. Convert to nltk Text object
nltk_text = Text(original_tokens)

# 3. Specify the target word
target_word = 'and'  # change this to any word you want

# 4. Print concordance output
print(f"🔍 Occurrences of the word '{target_word}' in context:")
nltk_text.concordance(target_word, width=80, lines=5)


🔍 Occurrences of the word 'and' in context:
Displaying 5 of 5 matches:
ped by the university of pennsylvania and mit researchers including steven bird
mit researchers including steven bird and edward loper. it's widely used in new
it's widely used in new york, london, and tokyo for processing english, spanish
okyo for processing english, spanish, and japanese text. apple and google have 
sh, spanish, and japanese text. apple and google have also contributed to its d


# **4. Classification and NLP Models**

**1-Sentiment Analysis**

I used NLTK’s built-in sentiment analysis tool called VADER (Valence Aware Dictionary and sEntiment Reasoner).

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download required data (once)
nltk.download('vader_lexicon')

# Example: join cleaned text
clean_text = " ".join(results['cleaned_words'])

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Get sentiment scores
sentiment_scores = sia.polarity_scores(clean_text)

print("Sentiment Scores:", sentiment_scores)


Sentiment Scores: {'neg': 0.0, 'neu': 0.921, 'pos': 0.079, 'compound': 0.3612}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**2-Text Classification**

We created the training data ourselves..

TF-IDF + Naive Bayes + Classification Model with simple split()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# 🔧 Simple split() preprocessing
def preprocess_text(text):
    tokens = text.lower().split()
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    tokens = [lemmatizer.lemmatize(stemmer.stem(t)) for t in tokens]
    return " ".join(tokens)

# 📝 Training data
texts = [
    "NLTK (Natural Language Toolkit) is developed by the University of Pennsylvania and MIT researchers.",
    "I love this product, it is amazing and works perfectly.",
    "This is the worst experience I've ever had.",
    "Apple released a new iPhone model.",
    "The football match was thrilling and exciting.",
    "Google is investing in AI technology.",
    "I hate waiting in long queues."
]

labels = [
    "tech",       # technology
    "positive",   # positive sentiment
    "negative",   # negative sentiment
    "tech",
    "sports",
    "tech",
    "negative"
]

# ✅ Clean texts
clean_texts = [preprocess_text(text) for text in texts]

# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# 🔍 Test text
test_text = """NLTK (Natural Language Toolkit) is developed by the University of Pennsylvania and MIT researchers
including Steven Bird and Edward Loper. It's widely used in New York, London, and Tokyo for
processing English, Spanish, and Japanese text. Apple and Google have also contributed to its development."""

test_clean = preprocess_text(test_text)

print(f"\n📄 Test Text:\n{test_text}\n")

# Train and predict with each model
for name, clf in models.items():
    pipeline = make_pipeline(TfidfVectorizer(), clf)
    pipeline.fit(clean_texts, labels)
    prediction = pipeline.predict([test_clean])
    print(f"🔍 {name} Prediction: **{prediction[0]}**")



📄 Test Text:
NLTK (Natural Language Toolkit) is developed by the University of Pennsylvania and MIT researchers
including Steven Bird and Edward Loper. It's widely used in New York, London, and Tokyo for
processing English, Spanish, and Japanese text. Apple and Google have also contributed to its development.

🔍 Naive Bayes Prediction: **tech**
🔍 Decision Tree Prediction: **tech**
🔍 SVM Prediction: **tech**


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🔍 Random Forest Prediction: **tech**
🔍 Logistic Regression Prediction: **tech**


# **5. Language Modeling and Lexicons**

**1-WordNet Integration: Synonyms, Antonyms, Hierarchical Relationships**

In [None]:
from nltk.corpus import wordnet as wn
import nltk

nltk.download('omw-1.4')  # Required for WordNet
nltk.download('wordnet')

# Example words (from your text)
words_to_check = ['develop', 'technology', 'research', 'language']

def explore_wordnet(word):
    print(f"\n🔍 Word: {word}")

    synsets = wn.synsets(word)
    if not synsets:
        print("❗ Word not found in WordNet.")
        return

    # 🔹 1. Synonyms
    synonyms = set()
    for syn in synsets:
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    print(f"📘 Synonyms: {', '.join(synonyms)}")

    # 🔸 2. Antonyms
    antonyms = set()
    for syn in synsets:
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    print(f"📕 Antonyms: {', '.join(antonyms) if antonyms else 'Not found.'}")

    # 🔼 3. Hypernyms
    hypernyms = synsets[0].hypernyms()
    if hypernyms:
        print("🔼 Hypernyms:", ', '.join([h.name().split('.')[0] for h in hypernyms]))
    else:
        print("🔼 Hypernyms: Not found.")

    # 🔽 4. Hyponyms
    hyponyms = synsets[0].hyponyms()
    if hyponyms:
        print("🔽 Hyponyms:", ', '.join([h.name().split('.')[0] for h in hyponyms[:5]]))  # first 5
    else:
        print("🔽 Hyponyms: Not found.")

# Run for all words
for word in words_to_check:
    explore_wordnet(word)


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



🔍 Word: develop
📘 Synonyms: arise, modernise, get, produce, recrudesce, spring_up, grow, originate, train, modernize, prepare, make_grow, educate, rise, germinate, build_up, develop, uprise, explicate, formulate, break, evolve, acquire
📕 Antonyms: Not found.
🔼 Hypernyms: create
🔽 Hyponyms: build

🔍 Word: technology
📘 Synonyms: engineering_science, applied_science, technology, engineering
📕 Antonyms: Not found.
🔼 Hypernyms: profession, application
🔽 Hyponyms: computer_technology, high_technology, aeronautical_engineering, communications_technology, automotive_technology

🔍 Word: research
📘 Synonyms: enquiry, research, search, inquiry, explore
📕 Antonyms: Not found.
🔼 Hypernyms: investigation
🔽 Hyponyms: operations_research, scientific_research, field_work, microscopy, marketing_research

🔍 Word: language
📘 Synonyms: terminology, lyric, linguistic_process, speech_communication, spoken_language, voice_communication, words, nomenclature, speech, oral_communication, language, spoken_commun

**2-ChatGPT:
GRAMMATICAL ANALYSIS — Detecting Structural Patterns**

In [None]:
# We use the prepared filtered_pos list for pattern detection.
# Example target: NN + VB + NN (noun + verb + noun)

print("[NN + VB + NN] Structures:")
for i in range(len(filtered_pos) - 2):
    tag1, tag2, tag3 = filtered_pos[i][1], filtered_pos[i+1][1], filtered_pos[i+2][1]
    if tag1.startswith('NN') and tag2.startswith('VB') and tag3.startswith('NN'):
        w1, w2, w3 = filtered_pos[i][0], filtered_pos[i+1][0], filtered_pos[i+2][0]
        print(f"→ {w1} ({tag1}) → {w2} ({tag2}) → {w3} ({tag3})")

print("\n[JJ + NN] Adjective + Noun Structures:")
for i in range(len(filtered_pos) - 1):
    tag1, tag2 = filtered_pos[i][1], filtered_pos[i+1][1]
    if tag1.startswith('JJ') and tag2.startswith('NN'):
        w1, w2 = filtered_pos[i][0], filtered_pos[i+1][0]
        print(f"→ {w1} ({tag1}) → {w2} ({tag2})")

print("\n[VB + NN] Verb + Noun Structures:")
for i in range(len(filtered_pos) - 1):
    tag1, tag2 = filtered_pos[i][1], filtered_pos[i+1][1]
    if tag1.startswith('VB') and tag2.startswith('NN'):
        w1, w2 = filtered_pos[i][0], filtered_pos[i+1][0]
        print(f"→ {w1} ({tag1}) → {w2} ({tag2})")


[NN + VB + NN] Structures:

[JJ + NN] Adjective + Noun Structures:
→ (natural (JJ) → language (NN)
→ steven (JJ) → bird (NN)
→ new (JJ) → york, (NN)
→ english, (JJ) → spanish, (NN)
→ japanese (JJ) → text. (NN)

[VB + NN] Verb + Noun Structures:
→ developed (VBN) → university (NN)
→ contributed (VBN) → development. (NN)


**Pattern-based Text Generation — Generating Sentences with the Pattern JJ + NN + VB + NN**

In [None]:
import random



# 1️⃣ Filter words by POS tags
adjectives = [word for word, tag in pos_tags if tag.startswith("JJ")]  # Adjectives (e.g., beautiful, large)
nouns = [word for word, tag in pos_tags if tag.startswith("NN")]       # Nouns (e.g., book, computer)
verbs = [word for word, tag in pos_tags if tag.startswith("VB")]       # Verbs (e.g., run, develop)

# 2️⃣ Simple fallback to avoid empty lists
if not adjectives: adjectives = ["great"]
if not nouns: nouns = ["thing"]
if not verbs: verbs = ["does"]

# 3️⃣ Sentence generation: create a set number of examples
generated_sentences = []
for _ in range(5):  # Generate 5 example sentences
    adj = random.choice(adjectives)
    noun1 = random.choice(nouns)
    verb = random.choice(verbs)
    noun2 = random.choice(nouns)

    # Simple sentence pattern: The [adj] [noun1] [verb]s the [noun2].
    sentence = f"The {adj} {noun1} {verb}s the {noun2}."
    generated_sentences.append(sentence)

# 4️⃣ Print results
print("\n📘 Generated Pattern-based Sentences:")
for s in generated_sentences:
    print("→", s)



📘 Generated Pattern-based Sentences:
→ The steven london, useds the bird.
→ The english, toolkit) processings the loper..
→ The steven development. haves the tokyo.
→ The japanese google developeds the google.
→ The japanese google iss the pennsylvania.
