In [1]:
pip install nltk



In [2]:
pip install textblob



In [13]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')  # For lemmatization with WordNet
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [4]:
# Create the file
with open('file.txt', 'w') as f:
    f.write("Thiss is a samplle textt corpus for NLP pre processing. It containss spellling errorrs and runn-onn sentencess. The weather is nicce todayy, isnt it? We went to the park but it rainned heavilly. Machinne learning is funn but challengging.")

# Verify (run in next cell)
with open('file.txt', 'r') as f:
    print(f.read())

Thiss is a samplle textt corpus for NLP pre processing. It containss spellling errorrs and runn-onn sentencess. The weather is nicce todayy, isnt it? We went to the park but it rainned heavilly. Machinne learning is funn but challengging.


In [5]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from textblob import TextBlob  # For spelling correction
import re  # For basic cleaning if needed

# Initialize stemmer and lemmatizer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [6]:
# Load the text corpus
with open('file.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read().strip()

print("Raw text corpus:")
print(raw_text)
print("\n" + "="*50)

Raw text corpus:
Thiss is a samplle textt corpus for NLP pre processing. It containss spellling errorrs and runn-onn sentencess. The weather is nicce todayy, isnt it? We went to the park but it rainned heavilly. Machinne learning is funn but challengging.



In [12]:
# Tokenize the raw text
tokens = word_tokenize(raw_text.lower())  # Convert to lowercase for consistency

print("First 30 tokens:")
print(tokens[:30])
print("\n" + "="*50)

First 30 tokens:
['thiss', 'is', 'a', 'samplle', 'textt', 'corpus', 'for', 'nlp', 'pre', 'processing', '.', 'it', 'containss', 'spellling', 'errorrs', 'and', 'runn-onn', 'sentencess', '.', 'the', 'weather', 'is', 'nicce', 'todayy', ',', 'isnt', 'it', '?', 'we', 'went']



In [10]:
# Correct spelling for each token
corrected_tokens = []
for token in tokens:
    blob = TextBlob(token)
    corrected_token = str(blob.correct())
    corrected_tokens.append(corrected_token)

# Print first 10 corrected tokens
print("First 10 corrected tokens:")
print(corrected_tokens[:10])

# Reconstruct corrected text corpus (join tokens with spaces, add basic punctuation back if needed)
corrected_text = ' '.join(corrected_tokens)
print("\nCorrected text corpus:")
print(corrected_text)
print("\n" + "="*50)

First 10 corrected tokens:
['this', 'is', 'a', 'sample', 'text', 'corpus', 'for', 'nap', 'pre', 'processing']

Corrected text corpus:
this is a sample text corpus for nap pre processing . it contains spelling errors and run-on sentences . the weather is nice today , isn it ? we went to the park but it raised heavily . machine learning is funny but challenging .



In [14]:
# Apply POS tags to corrected tokens
pos_tags = pos_tag(corrected_tokens)

print("POS tags for corrected tokens:")
for token, pos in pos_tags:
    print(f"{token}: {pos}")
print("\n" + "="*50)

POS tags for corrected tokens:
this: DT
is: VBZ
a: DT
sample: JJ
text: NN
corpus: NN
for: IN
nap: JJ
pre: NN
processing: NN
.: .
it: PRP
contains: VBZ
spelling: VBG
errors: NNS
and: CC
run-on: JJ
sentences: NNS
.: .
the: DT
weather: NN
is: VBZ
nice: JJ
today: NN
,: ,
isn: VB
it: PRP
?: .
we: PRP
went: VBD
to: TO
the: DT
park: NN
but: CC
it: PRP
raised: VBD
heavily: RB
.: .
machine: NN
learning: NN
is: VBZ
funny: JJ
but: CC
challenging: VBG
.: .



In [15]:
# Remove stop words
filtered_tokens = [token for token in corrected_tokens if token not in stop_words and len(token) > 2]

print("First 20 tokens after removing stop words:")
print(filtered_tokens[:20])
print("\n" + "="*50)

First 20 tokens after removing stop words:
['sample', 'text', 'corpus', 'nap', 'pre', 'processing', 'contains', 'spelling', 'errors', 'run-on', 'sentences', 'weather', 'nice', 'today', 'went', 'park', 'raised', 'heavily', 'machine', 'learning']



In [16]:
# Stemming
stemmed_tokens = [ps.stem(token) for token in corrected_tokens]

# Lemmatization (use POS for better accuracy, but simple version here)
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in corrected_tokens]

print("First 20 stemmed tokens:")
print(stemmed_tokens[:20])

print("\nFirst 20 lemmatized tokens:")
print(lemmatized_tokens[:20])
print("\n" + "="*50)

First 20 stemmed tokens:
['thi', 'is', 'a', 'sampl', 'text', 'corpu', 'for', 'nap', 'pre', 'process', '.', 'it', 'contain', 'spell', 'error', 'and', 'run-on', 'sentenc', '.', 'the']

First 20 lemmatized tokens:
['this', 'is', 'a', 'sample', 'text', 'corpus', 'for', 'nap', 'pre', 'processing', '.', 'it', 'contains', 'spelling', 'error', 'and', 'run-on', 'sentence', '.', 'the']



In [17]:
# Detect sentence boundaries on raw (or corrected) text
sentences = sent_tokenize(raw_text)
num_sentences = len(sentences)

print(f"Total number of sentences: {num_sentences}")
print("\nSample sentences:")
for i, sent in enumerate(sentences[:3]):  # Print first 3
    print(f"{i+1}: {sent}")
print("\n" + "="*50)

Total number of sentences: 5

Sample sentences:
1: Thiss is a samplle textt corpus for NLP pre processing.
2: It containss spellling errorrs and runn-onn sentencess.
3: The weather is nicce todayy, isnt it?

