In [None]:
import nltk

First, you need to download the Brown Corpus data if you haven't already. This command will open an NLTK downloader GUI, where you can select 'brown' under 'Corpora' and click 'Download'. Alternatively, you can download it directly.

In [None]:
try:
    nltk.data.find('corpora/brown')
except LookupError:
    print("Brown Corpus not found. Downloading...")
    nltk.download('brown')
    print("Brown Corpus downloaded successfully.")
else:
    print("Brown Corpus is already available.")

Brown Corpus not found. Downloading...
Brown Corpus downloaded successfully.


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Once downloaded, you can load the corpus and start exploring it. Here are some basic ways to access its contents:

In [None]:
from nltk.corpus import brown

all_words = brown.words()
print(f"Total words in Brown Corpus: {len(all_words)}")
print(f"First 20 words: {all_words[:20]}\n")

all_sents = brown.sents()
print(f"Total sentences in Brown Corpus: {len(all_sents)}")
print(f"First sentence: {' '.join(all_sents[0])}\n")

print(f"Brown Corpus categories: {brown.categories()}\n")

news_words = brown.words(categories='news')
print(f"First 20 words from 'news' category: {news_words[:20]}\n")

humor_sents = brown.sents(categories='humor')
print(f"First sentence from 'humor' category: {' '.join(humor_sents[0])}\n")

news_tagged_words = brown.tagged_words(categories='news')
print(f"First 10 tagged words from 'news' category: {news_tagged_words[:10]}")

Total words in Brown Corpus: 1161192
First 20 words: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']

Total sentences in Brown Corpus: 57340
First sentence: The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

Brown Corpus categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']

First 20 words from 'news' category: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']

First sentence from 'humor' category: It was among these that Hinkle identified a photograph of Barco ! !

Fir

In [None]:
# Step 2: Explore categories in the Brown Corpus
categories = brown.categories()
print("Categories in Brown Corpus:\n", categories)
# Step 3: Examine sample words from different categories
print("\nSample words from different categories:\n")
for category in categories[:5]:   # showing first 5 categories
    words = brown.words(categories=category)
    print(f"Category: {category}")
    print("Sample words:", words[:10])
    print("-" * 50)
print(len(brown.categories()))

vocab=set(brown.words())
print(len(vocab))

# Step 4: Count number of words in each category
print("\nWord count in each category:\n")
for category in categories:
    word_count = len(brown.words(categories=category))
    print(f"{category} : {word_count}")# Step 5: Select a specific category
selected_category = "news"
print(f"\nSelected Category: {selected_category}")# Step 6: Create vocabulary (unique words) for the selected category
vocabulary = set(brown.words(categories=selected_category))

print(f"Vocabulary size of '{selected_category}' category:", len(vocabulary))

# Display sample vocabulary words
print("\nSample vocabulary words:")
print(list(vocabulary)[:20])

Categories in Brown Corpus:
 ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']

Sample words from different categories:

Category: adventure
Sample words: ['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.']
--------------------------------------------------
Category: belles_lettres
Sample words: ['Northern', 'liberals', 'are', 'the', 'chief', 'supporters', 'of', 'civil', 'rights', 'and']
--------------------------------------------------
Category: editorial
Sample words: ['Assembly', 'session', 'brought', 'much', 'good', 'The', 'General', 'Assembly', ',', 'which']
--------------------------------------------------
Category: fiction
Sample words: ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.', 'His']
--------------------------------------------------
Category: government
Sample words: ['The', 'Offic

In [1]:
# Program 1: Basic Text Processing and Vocabulary Generation

import nltk
import json
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download required resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Added to fix the LookupError


# Step 1: Create Corpus
# -----------------------------
text = input("Enter a text to create corpus:\n")

# Convert text to lowercase
text = text.lower()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Enter a text to create corpus:
Hello world 


In [5]:
# Step 2: Tokenization
# -----------------------------
tokens = word_tokenize(text)

# Step 3: Remove punctuation
# -----------------------------
tokens = [word for word in tokens if word not in string.punctuation]



### Step 4: Stopword Removal

This step removes common words (stopwords) that generally do not carry significant meaning from the tokenized list. NLTK's English stopwords list is used for this purpose.

In [10]:
from nltk.corpus import stopwords
import json

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

print(f"Original tokens after punctuation removal: {tokens}")
print(f"Tokens after stopword removal: {filtered_tokens}")

# Step 5: Generate Vocabulary
vocabulary = set(filtered_tokens)

print(f"Vocabulary size: {len(vocabulary)}")
print(f"Sample vocabulary words: {list(vocabulary)[:10]}")

# Step 6: Save Vocabulary to JSON
# -----------------------------
with open("vocabulary.json", "w") as file:
    json.dump(list(vocabulary), file) # Convert set to list for JSON serialization

print("\nVocabulary saved to vocabulary.json")

# Step 7: Load Vocabulary from JSON
# -----------------------------
with open("vocabulary.json", "r") as file:
    loaded_vocabulary = json.load(file)

print("\nLoaded Vocabulary from JSON:")
print(loaded_vocabulary)

Original tokens after punctuation removal: ['hello', 'world']
Tokens after stopword removal: ['hello', 'world']
Vocabulary size: 2
Sample vocabulary words: ['hello', 'world']

Vocabulary saved to vocabulary.json

Loaded Vocabulary from JSON:
['hello', 'world']


In [17]:
# Program 2: Exploring Tokenization Techniques using NLTK

import nltk
import re
from nltk.tokenize import (
    sent_tokenize,
    word_tokenize,
    WhitespaceTokenizer,
    RegexpTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer
)

# Download required resources
nltk.download('punkt')
# Sample Text
text = "Hello! NLP is amazing. Let's explore tokenization techniques."

tweet_text = "Learning NLP ðŸ˜„ #AI #NLP @OpenAI https://openai.com"

# 1. Sentence Tokenization
# -----------------------------
print("\nSentence Tokenization:")
sentences = sent_tokenize(text)
print(sentences)


# 3. Whitespace Tokenization
# -----------------------------
print("\nWhitespace Tokenization:")
whitespace_tokenizer = WhitespaceTokenizer()
print(whitespace_tokenizer.tokenize(text))

# 4. Regex Tokenization
# -----------------------------
print("\nRegex Tokenization (Words Only):")
regex_tokenizer = RegexpTokenizer(r'\w+')
print(regex_tokenizer.tokenize(text))




Sentence Tokenization:
['Hello!', 'NLP is amazing.', "Let's explore tokenization techniques."]

Whitespace Tokenization:
['Hello!', 'NLP', 'is', 'amazing.', "Let's", 'explore', 'tokenization', 'techniques.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# 5. Treebank Tokenizer
# -----------------------------
print("\nTreebank Tokenization:")
treebank_tokenizer = TreebankWordTokenizer()
print(treebank_tokenizer.tokenize(text))


Treebank Tokenization:
['Hello', '!', 'NLP', 'is', 'amazing.', 'Let', "'s", 'explore', 'tokenization', 'techniques', '.']


In [20]:
# 6. Tweet Tokenizer
# -----------------------------
print("\nTweet Tokenization:")
tweet_tokenizer = TweetTokenizer()
print(tweet_tokenizer.tokenize(tweet_text))


Tweet Tokenization:
['Learning', 'NLP', 'ðŸ˜„', '#AI', '#NLP', '@OpenAI', 'https://openai.com']
