## `NLTK`

**Sentence Casing**

In [1]:
import nltk

In [2]:
# storing a string in a variable
string = 'The quick brown fox jumps over the lazy dog.'

In [3]:
print("String:", string, "\nLower:", string.lower(), "\nUpper:", string.upper())

String: The quick brown fox jumps over the lazy dog. 
Lower: the quick brown fox jumps over the lazy dog. 
Upper: THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG.


**Importing a file and processing it**

In [4]:
# Reading the content of the file
import os

# The corpus location is machine-specific. Set the NLTK_EXAMPLE_PATH
# environment variable to point at your own copy of the file; when it is
# unset, the original hard-coded location is used.
corpus_path = os.environ.get("NLTK_EXAMPLE_PATH", "D:/Z/Downloads/nltk_example.txt")

with open(corpus_path, "r") as file:
    content = file.read()

content[:500]  # Displaying the first 500 characters of the content

FileNotFoundError: [Errno 2] No such file or directory: 'D:/Z/Downloads/nltk_example.txt'

**Split each word - 1**

In [None]:
# Using basic Python string method to tokenize the text
basic_tokens = content.split()

basic_tokens[:20]  # Displaying the first 20 tokens

**Frequency of a given word**

In [None]:
from collections import Counter

In [None]:
# Calculating word frequencies
word_frequencies = Counter(basic_tokens)

def getWordFrequency(word, frequencies=None):
    """Return how many times ``word`` occurs in a frequency table.

    Args:
        word: Token to look up. The lookup is an exact, case-sensitive match.
        frequencies: Optional mapping of token -> count. When omitted, the
            notebook-level ``word_frequencies`` counter is used, which keeps
            existing one-argument calls working.

    Returns:
        The count stored for ``word``, or 0 when the token is absent.
    """
    table = word_frequencies if frequencies is None else frequencies
    # .get() gives "missing means 0" for plain dicts as well as for Counter.
    return table.get(word, 0)

In [None]:
# Testing the function with the word "Technological"
getWordFrequency("Technological")

**Identify numbers in the corpus**

In [None]:
import re

In [None]:
def identifyNumbers(text):
    """Collect every standalone run of digits found in ``text``.

    A run is matched only when it has a word boundary on both sides, so
    digits glued to letters (for example ``b3``) are skipped, and a value
    such as ``3.14`` is reported as two separate runs.

    Args:
        text: String to scan.

    Returns:
        A list of the matched digit strings, in order of appearance.
    """
    digit_run = re.compile(r'\b\d+\b')
    return digit_run.findall(text)

In [None]:
numbers_in_content = identifyNumbers(content)
numbers_in_content

**Remove the numbers from the corpus**

In [None]:
def removeNumbers(text):
    """Delete every standalone run of digits from ``text``.

    Only digit runs delimited by word boundaries are removed; digits that
    are part of a longer word (for example ``x1``) are kept. Whitespace
    around a removed number is left as it was.

    Args:
        text: String to clean.

    Returns:
        The text with the matched digit runs replaced by the empty string.
    """
    digit_run = re.compile(r'\b\d+\b')
    return digit_run.sub('', text)

In [None]:
cleaned_content = removeNumbers(content)

# Displaying the first 500 characters of the cleaned content
cleaned_content[:500]

**Remove short words (fewer than four characters)**

In [None]:
def removeShortWords(text, min_length=4):
    """Drop whitespace-separated words shorter than ``min_length``.

    Args:
        text: String to filter. It is split on any whitespace.
        min_length: Minimum number of characters a word needs to be kept.
            Punctuation attached to a word counts towards its length.

    Returns:
        The surviving words joined by single spaces.
    """
    kept = []
    for token in text.split():
        if len(token) < min_length:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
cleaned_content_without_short_words = removeShortWords(cleaned_content)

# Displaying the first 500 characters of the content without short words
cleaned_content_without_short_words[:500]

**Stopwords**

In [None]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# NLTK's English stop-word list is all lowercase, so each token is lowercased
# for the membership test only; otherwise capitalised stop words such as
# "The" would slip through. The tokens kept in `result` retain their casing.
stop_words = set(stopwords.words('english'))
tokens_ = word_tokenize(string)
result = [token for token in tokens_ if token.lower() not in stop_words]

In [None]:
result

**Stemming**

In [None]:
import nltk
from nltk.stem import PorterStemmer
# nltk.download('punkt')

In [None]:
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence first, then pass every token through the Porter stemmer.
words = nltk.word_tokenize(text)
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))

In [None]:
stemmed_words

**Lemmatization**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')

In [None]:
text = "The quick brown fox jumped over the lazy dogs."

# Tokenize the sentence, then look up the lemma of every token.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so it
# falls back to its default (noun, per the NLTK docs) - verb forms such as
# "jumped" are therefore expected to come back unchanged. Confirm if verbs
# should be reduced as well.
words = nltk.word_tokenize(text)
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, words))

In [None]:
lemmatized_words

**Part of Speech**

In [None]:
from nltk import pos_tag
#nltk.download('averaged_perceptron_tagger')

In [None]:
def posTagging(tokens):
    """Tag each token with its part of speech via NLTK's ``pos_tag``.

    Args:
        tokens: Sequence of token strings to tag.

    Returns:
        The result of ``pos_tag(tokens)``, passed through unchanged.
    """
    return pos_tag(tokens)

In [None]:
# Tag the whole corpus but display only a preview, like the other cells do,
# instead of dumping every tagged token into the notebook output.
posTagging(basic_tokens)[:20]  # Displaying the first 20 tagged tokens

## `Regular Expression`

**(.) Operator**

In [None]:
# importing the necessary libraries
import re

In [None]:
# storing a string in a variable
text = 'cat, bat, rat, hat, hello'

# . --> matches any single character except a newline
pattern = r".at"
matches = re.findall(pattern, text)

In [None]:
print(matches)

**(*) Operator**

In [None]:
# storing a string in a variable
text = 'abc abbc aabbc aabbbbc asdf sdfg'

# * --> matches 0 or more occurrences of the preceding character
pattern = r"ab*c"
matches = re.findall(pattern, text)

In [None]:
print(matches)

**(?) Operator**

In [None]:
# storing a string in a variable
text = "color or colour"

# ? --> matches 0 or 1 occurrence of the preceding character
pattern = r"colou?r"
matches = re.findall(pattern, text)

In [None]:
print(matches)

**(|) Operator**

In [None]:
# storing a string in a variable
text = 'cat or dog'

# | --> logical OR (matches either alternative)
pattern = r"cat|dog"
matches = re.findall(pattern, text)

In [None]:
print(matches)

**([ ]) Operator**

In [None]:
# storing a string in a variable
text = 'the car is red, the cat is black and the rat ran away'

# [ ] --> character class
pattern = r'[cr]at'
matches = re.findall(pattern, text)

In [None]:
print(matches)

**( () ) Operator**

In [None]:
# storing a string in a variable
text = 'The price is $10,000. Discount is $2000.'

# () --> capturing group; findall returns only the text captured inside it
pattern = r'\$([0-9,]+)'
matches = re.findall(pattern, text)

In [None]:
print(matches)

**( {} ) Operator**

In [None]:
# storing a string in a variable
text = 'aaaa aa a aa aaaaaa'

# {m,n} --> matches between m and n repetitions of the preceding character
pattern = r'a{2,3}'
matches = re.findall(pattern, text)

In [None]:
print(matches)

**(^) Operator**

In [None]:
# storing a string in a variable
text = 'Hello, World!'

# ^ --> anchors the match to the start of the string
pattern = r'^Hello'
matches = re.search(pattern, text)

In [None]:
if matches:
    print("Match Found:", matches.group())
else:
    print("Match not Found!")

**Numbers Removing**

In [None]:
# storing a string in a variable
input_str = 'Box contains 3 red and 5 white balls, while box B contains 4 red and 2 blue balls'

# substitution function
result = re.sub(r'\d+', '@', input_str)

In [None]:
print(result)

**Punctuation Removal**

In [None]:
def removePunctuation(text):
    """Strip every ASCII punctuation character from ``text``.

    Args:
        text: String to clean.

    Returns:
        A copy of ``text`` with all characters listed in
        ``string.punctuation`` deleted; every other character is kept.
    """
    # Imported inside the function because this notebook also uses `string`
    # as the name of a sample sentence, so a notebook-level `import string`
    # would clash with it.
    from string import punctuation

    # The third argument of str.maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', punctuation)
    return text.translate(deletion_table)

In [None]:
input_str = "Hey, did you know tomorrow is a holiday? Its great a news right!"
removePunctuation(input_str)

**Removing Whitespace between Sentences**

In [None]:
def removeSpace(text=""):
    """Collapse redundant whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines) - including the extra
    gaps left between sentences - is reduced to a single space, and leading
    and trailing whitespace is dropped.

    Args:
        text: String to normalise. Defaults to the empty string so that the
            original zero-argument call form remains valid.

    Returns:
        The whitespace-normalised text.
    """
    # str.split() with no separator splits on any whitespace run and discards
    # empty pieces, so re-joining with one space normalises the spacing.
    return ' '.join(text.split())

**Split each word - 2**

In [None]:
# Tokenizing with a regular expression: every run of word characters becomes
# one token, so punctuation is not carried along with the words.
tokens_ = re.findall(r'\b\w+\b', content)

# Show only a preview rather than printing every token in the corpus.
print(tokens_[:20])  # Displaying the first 20 tokens

## `TextBlob`

**Part of Speech**

In [None]:
from textblob import TextBlob

In [None]:
text = "The quick brown fox jumps over the lazy dog."
result = TextBlob(text)

In [None]:
result.tags

<hr>