In [1]:
import nltk
import string
import re

### Text to Uppercase

In [3]:
def text_uppercase(text):
    return text.upper()

input_str = "Hey, did you know that the summer break is coming? Amazing right!! It's only 5 days more !!"
text_uppercase(input_str)

"HEY, DID YOU KNOW THAT THE SUMMER BREAK IS COMING? AMAZING RIGHT!! IT'S ONLY 5 DAYS MORE !!"

### Text to Lowercase

In [5]:
def text_lowercase(text):
    return text.lower()

input_str = "HEY, DID YOU KNOW THAT THE SUMMER BREAK IS COMING? AMAZING RIGHT!! IT'S ONLY 5 DAYS MORE !!"
text_lowercase(input_str)

"hey, did you know that the summer break is coming? amazing right!! it's only 5 days more !!"

### Remove Numbers

In [3]:
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

input_str = "There are 3 balls in this bag, and 12 in the other."
remove_numbers(input_str)

'There are  balls in this bag, and  in the other.'

### Converting Figures into Words

In [4]:
import inflect
p = inflect.engine()

def convert_number(text):
    temp_str = text.split()
    new_string = []
    
    for word in temp_str:
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)
            
        else:
            new_string.append(word)
                
    temp_str = ' '.join(new_string)
    return temp_str
    
input_str = 'There are 3 balls in this bag, and 12 in the other one.'
convert_number(input_str)

'There are three balls in this bag, and twelve in the other one.'

### Remove Punctuations

In [5]:
def remove_punctuations(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

input_str = "Hey, did you know that the summer break is coming? Amazing right!! It's only 5 more days !!"
remove_punctuations(input_str)

'Hey did you know that the summer break is coming Amazing right Its only 5 more days '

### Remove WhiteSpaces

In [6]:
def remove_whitespace(text):
    return " ".join(text.split())
    
input_str = "  we don't need  the given questions"
remove_whitespace(input_str)

"we don't need the given questions"

### Remove Default Stopwords

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text)

['This', 'sample', 'sentence', 'going', 'remove', 'stopwords', '.']

### Stemming

In [8]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems

text = 'data science uses scientific methods algorithm and many typpes of processes'
stem_words(text)

['data',
 'scienc',
 'use',
 'scientif',
 'method',
 'algorithm',
 'and',
 'mani',
 'typp',
 'of',
 'process']

### Lemmatization

In [12]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos = 'v') for word in word_tokens]
    return lemmas

text = 'data science uses scientific methods algorithm and many typpes of processes'
lemmatize_words(text)

[nltk_data] Downloading package wordnet to C:\Users\HP/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\HP/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithm',
 'and',
 'many',
 'typpes',
 'of',
 'process']

### Part of Speech Tagging

In [28]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def pos_tagging(text):
    word_tokens = word_tokenize(text)
    return pos_tag(word_tokens)

pos_tagging("You really scared me right there")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('You', 'PRP'),
 ('really', 'RB'),
 ('scared', 'VBD'),
 ('me', 'PRP'),
 ('right', 'RB'),
 ('there', 'EX')]

In [14]:
nltk.download('tagsets')

nltk.help.upenn_tagset('NN')

[nltk_data] Downloading package tagsets to C:\Users\HP/nltk_data...


NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data]   Unzipping help\tagsets.zip.


### Chunking

In [17]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def chunking(text, grammar):
    word_tokens = word_tokenize(text)
    
    word_pos = pos_tag(word_tokens)
    chunkParser = nltk.RegexpParser(grammar)
    tree = chunkParser.parse(word_pos)
    
    for subtree in tree.subtrees():
        print(subtree)
    tree.draw()
    
sentence = 'the little yellow bird is flying in the sky'
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunking(sentence, grammar)

(S
  (NP the/DT little/JJ yellow/JJ bird/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP the/DT little/JJ yellow/JJ bird/NN)
(NP the/DT sky/NN)


### Named Entity Recognition

In [32]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')

def named_entity_recognition(text):
    word_tokens = word_tokenize(text)
    word_pos = pos_tag(word_tokens)
    
    print(ne_chunk(word_pos))

text = 'Sunbo works for Microsoft so he went to Switzerland for a meetup'
named_entity_recognition(text)

(S
  (GPE Sunbo/NNP)
  works/VBZ
  for/IN
  (PERSON Microsoft/NNP)
  so/RB
  he/PRP
  went/VBD
  to/TO
  (GPE Switzerland/NNP)
  for/IN
  a/DT
  meetup/NN)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\HP/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\HP/nltk_data...
[nltk_data]   Package words is already up-to-date!
