In [11]:
#22. Write a program to read a text file with at least 30 sentences and 200 words
#and perform the following tasks in the given sequence.
#a. Text cleaning by removing punctuation/special characters, numbers
#and extra white spaces. Use regular expression for the same.
#b. Convert text to lowercase
#c. Tokenization
#d. Remove stop words
#e. Correct misspelled words 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Download required NLTK data
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load and read text file
with open("D:/College/AIL/sample_text.txt", "r", encoding='utf-8') as f:
    text = f.read()

# Step 1: Text Cleaning (remove punctuation, special characters, numbers, and extra white spaces)
text = re.sub(r'[^a-zA-Z\s]', '', text)            # Remove punctuation & special chars
text = re.sub(r'\s+', ' ', text).strip()           # Remove extra spaces

# Step 2: Convert text to lowercase
text = text.lower()

# Step 3: Tokenization
tokens = word_tokenize(text)

# Step 4: Remove stop words
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]

# Step 5: Correct misspelled words using TextBlob
corrected_tokens = [str(TextBlob(word).correct()) for word in tokens]

# Display results
print("\nCleaned Tokens:\n", tokens[:20])
print("\nCorrected Tokens:\n", corrected_tokens[:20])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Cleaned Tokens:
 ['natural', 'language', 'processing', 'branch', 'artificial', 'intelligence', 'helps', 'computers', 'understund', 'human', 'language', 'nlp', 'widelyy', 'used', 'applications', 'chatbots', 'sentiment', 'analysis', 'search', 'engines']

Corrected Tokens:
 ['natural', 'language', 'processing', 'branch', 'artificial', 'intelligence', 'helps', 'computers', 'understand', 'human', 'language', 'nap', 'widely', 'used', 'applications', 'whatnots', 'sentiment', 'analysis', 'search', 'engines']


In [12]:
#23. Write a program to read a text file with at least 30 sentences and 200 words
#and perform the following tasks in the given sequence.
#a. Text cleaning by removing punctuation/special characters, numbers
#and extra white spaces. Use regular expression for the same.
#b. Convert text to lowercase
#c. Stemming and Lemmatization
#d. Create a list of 3 consecutive words after lemmatization
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load and read text file
with open("D:/College/AIL/sample_text.txt", "r", encoding='utf-8') as f:
    text = f.read()

# Step 1: Text Cleaning (remove punctuation, special characters, numbers, and extra white spaces)
text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation & special characters
text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

# Step 2: Convert text to lowercase
text = text.lower()

# Step 3: Tokenization
tokens = word_tokenize(text)

# Step 4: Stemming (using PorterStemmer)
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in tokens]

# Step 5: Lemmatization (using WordNetLemmatizer)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

# Step 6: Create a list of 3 consecutive words after lemmatization
consecutive_words = [lemmatized_tokens[i:i+3] for i in range(len(lemmatized_tokens)-2)]

# Display results
print("\nLemmatized Tokens:\n", lemmatized_tokens[:20])
print("\nConsecutive Words (3 consecutive words):\n", consecutive_words[:10])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...



Lemmatized Tokens:
 ['natural', 'language', 'processing', 'is', 'a', 'branch', 'of', 'artificial', 'intelligence', 'it', 'help', 'computer', 'understund', 'human', 'language', 'nlp', 'is', 'widelyy', 'used', 'in']

Consecutive Words (3 consecutive words):
 [['natural', 'language', 'processing'], ['language', 'processing', 'is'], ['processing', 'is', 'a'], ['is', 'a', 'branch'], ['a', 'branch', 'of'], ['branch', 'of', 'artificial'], ['of', 'artificial', 'intelligence'], ['artificial', 'intelligence', 'it'], ['intelligence', 'it', 'help'], ['it', 'help', 'computer']]
