# NLP Basic Concept
## Lexical Analysis
### 1. Tokenization
Breaks down raw text into smaller, meaningful units called tokens (words, subwords, or characters)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/adam/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
text = "NLTK tokenizing is a crucial step in NLP. It is widely used."

sentences = sent_tokenize(text)
words = word_tokenize(text)

print("Sentences:", sentences)
print("Words:", words)

Sentences: ['NLTK tokenizing is a crucial step in NLP.', 'It is widely used.']
Words: ['NLTK', 'tokenizing', 'is', 'a', 'crucial', 'step', 'in', 'NLP', '.', 'It', 'is', 'widely', 'used', '.']


### 2. Case folding
Converts all characters in a text to a single case (usually lowercase)

In [25]:
print(text.lower())

nltk tokenizing is a crucial step in nlp. it is widely used.


### 3. Punctuation Removal
Only retain the important word by removing punctuations

In [26]:
import string
teks = "Hello!!! Are you there??? :)"
print(''.join([char for char in teks if char not in string.punctuation]))

Hello Are you there 


### 4. Stop word removal
Filters out common, less meaningful words (like "the," "is," "a") to reduce noise

In [27]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/adam/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
words = word_tokenize("This is an example of stop word removal.")
print([word for word in words if word.lower() not in stopwords.words('english')])

['example', 'stop', 'word', 'removal', '.']


### 5. Abbreviations Handling

In [29]:
import re

In [30]:
text = "Dr Smith is an M.D from U.S."
abbrev_cleaned = re.sub(r'\b(Dr|Mr|Ms|M\.D|U\.S)\.', lambda x : x.group(0).replace('.', ''), text)
print(abbrev_cleaned)


Dr Smith is an M.D from US


In [31]:
sent = "Prof John lives in the U.K. and works at M.I.T."
fixed_sent = re.sub(r'\b([A-Z])\.', r'\1', sent)
print(fixed_sent)

Prof John lives in the UK and works at MIT


### 6. Stemming
Chops off word endings (suffixes/prefixes) to reduce words to their common "stem" or root form

In [32]:
from nltk.stem import PorterStemmer

In [33]:
stemmer = PorterStemmer()
words = ['running', 'runs', 'runner']
print([stemmer.stem(word) for word in words])

['run', 'run', 'runner']


### 7. Part-of-speech tagging
Assigning grammatical categories (like noun, verb, adjective) to each word in a text

In [34]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/adam/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [35]:
tokens = word_tokenize("The quick brown fox jumps over the lazy dog.")
print(nltk.pos_tag(tokens))

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


### 8. Word Sense Disambiguition
Identifying the correct meaning (sense) of a polysemous word (a word with multiple meanings) in a specific context

In [36]:
nltk.download('wordnet')
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /home/adam/nltk_data...


In [37]:
sentence = "I went to the bank to deposit money."
synset1 = lesk(word_tokenize(sentence), "bank")
print(synset1, synset1.definition())
print(synset1)

Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
Synset('depository_financial_institution.n.01')


In [38]:
sentence2 = "The book is full of notes."
synset2 = lesk(word_tokenize(sentence2), "book")
print(synset2, synset2.definition())
print(synset2)

Synset('book.n.02') physical objects consisting of a number of pages bound together
Synset('book.n.02')


### 9. Translation

In [39]:
from deep_translator import GoogleTranslator

In [41]:
# example 1
translated1 = GoogleTranslator(source='auto', target='ja').translate("Hello, how are you?")
print(translated1)

# example 2
translated2 = GoogleTranslator(source='auto', target='ja').translate("Hi")
print(translated2)

user_text = input("Enter text to translate: ")
translated_text = GoogleTranslator(source='auto', target='zh-CN').translate(user_text)
print(translated_text)

こんにちは お元気ですか？
こんにちは
龙被释放！


### 10. Name Entity Recognition
Finding and classifying real-world entities in text, like people, organizations, locations, dates, and monetary values, into predefined categories

In [42]:
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/adam/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/adam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /home/adam/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [43]:
sentence = "Barack Obama was born in Hawaii."
tree = ne_chunk(pos_tag(word_tokenize(sentence)))
print(tree)

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.)
