# NLP Basic Concepts
## Lexical Analysis
### 1. Tokenization
Breaks down raw text into smaller, meaningful units called tokens (words, subwords, or characters)

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the 'punkt_tab' resource into the local nltk_data directory (a no-op when it is
# already up to date). Presumably this is the data the tokenizers imported above rely on
# -- verify against the NLTK documentation.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/adam/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Two short sentences so that both tokenizers have something to split.
text = "NLTK tokenizing is a crucial step in NLP. It is widely used."

# Sentence-level and word-level views of the same string.
sentences, words = sent_tokenize(text), word_tokenize(text)

print("Sentences:", sentences)
print("Words:", words)

Sentences: ['NLTK tokenizing is a crucial step in NLP.', 'It is widely used.']
Words: ['NLTK', 'tokenizing', 'is', 'a', 'crucial', 'step', 'in', 'NLP', '.', 'It', 'is', 'widely', 'used', '.']


### 2. Case folding
Converts all characters in a text to a single case (usually lowercase)

In [3]:
# str.casefold() is Python's dedicated case-folding method: for plain ASCII text it gives
# the same result as str.lower(), but it also removes case distinctions that lower() keeps
# (e.g. German "ß" folds to "ss"), which matters for caseless matching.
print(text.casefold())

nltk tokenizing is a crucial step in nlp. it is widely used.


### 3. Punctuation Removal
Retains only the meaningful words by removing punctuation marks

In [4]:
import string

teks = "Hello!!! Are you there??? :)"
# A translation table with an empty mapping and a "delete" set drops every character
# listed in string.punctuation; whitespace is untouched, so the trailing space remains.
strip_punctuation = str.maketrans('', '', string.punctuation)
print(teks.translate(strip_punctuation))

Hello Are you there 


### 4. Stop word removal
Filters out common, less meaningful words (like "the," "is," "a") to reduce noise

In [5]:
from nltk.corpus import stopwords
# Fetch the 'stopwords' corpus into the local nltk_data directory so the `stopwords`
# reader imported above has data to load (a no-op when it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/adam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
words = word_tokenize("This is an example of stop word removal.")

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension would rebuild the list for every token, and a set gives constant-time
# membership tests instead of a linear scan.
english_stopwords = set(stopwords.words('english'))

# Compare in lower case so capitalised stop words ("This") are filtered too.
# Punctuation tokens are not stop words, so the final "." is kept.
print([word for word in words if word.lower() not in english_stopwords])

['example', 'stop', 'word', 'removal', '.']


### 5. Abbreviations Handling

In [7]:
import re

In [8]:
text = "Dr Smith is an M.D from U.S."


def _drop_dots(match):
    """Return the matched abbreviation with every '.' removed."""
    return match.group(0).replace('.', '')


# Matches a known abbreviation only when it is immediately followed by a '.'.
# NOTE(review): because the trailing dot is mandatory, "M.D" (no final dot) is left
# untouched while "U.S." becomes "US" -- confirm whether that asymmetry is intended.
abbrev_pattern = re.compile(r'\b(Dr|Mr|Ms|M\.D|U\.S)\.')
abbrev_cleaned = abbrev_pattern.sub(_drop_dots, text)
print(abbrev_cleaned)


Dr Smith is an M.D from US


In [9]:
sent = "Prof John lives in the U.K. and works at M.I.T."

# A single capital letter at a word boundary followed by '.', e.g. the "U." and "K."
# in "U.K."; the replacement keeps the letter and drops the dot. This also removes the
# sentence-final period when the sentence ends in such an abbreviation.
initial_with_dot = re.compile(r'\b([A-Z])\.')
fixed_sent = initial_with_dot.sub(r'\1', sent)
print(fixed_sent)

Prof John lives in the UK and works at MIT


### 6. Stemming
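Chops off word endings (suffixes) to reduce words to their common "stem" or root form; the stem is not always a dictionary word, and related words do not always collapse to the same stem (see "runner" below)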

In [10]:
from nltk.stem import PorterStemmer

In [11]:
stemmer = PorterStemmer()

# Three inflections/derivations of the same root to compare their stems.
words = ['running', 'runs', 'runner']
print(list(map(stemmer.stem, words)))

['run', 'run', 'runner']


### 7. Part-of-speech tagging
Assigning grammatical categories (like noun, verb, adjective) to each word in a text

In [12]:
# Fetch the English averaged-perceptron tagger resource (a no-op when already up to date).
# Presumably this is the model used by nltk.pos_tag -- verify against the NLTK docs.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/adam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [13]:
# Tokenize first: pos_tag is given the list of tokens, not the raw string.
tokens = word_tokenize("The quick brown fox jumps over the lazy dog.")

# Each element is a (token, tag) pair.
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [14]:
import spacy

# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline once; `doc` is reused by the spaCy entity-recognition cell later on.
text = "Google is looking at buying a startup in London for $1 billion."
doc = nlp(text)

# One row per token: surface text | POS tag | dependency label
for token in doc:
    row = f"{token.text:12} | {token.pos_:6} | {token.dep_}"
    print(row)

Google       | PROPN  | nsubj
is           | AUX    | aux
looking      | VERB   | ROOT
at           | ADP    | prep
buying       | VERB   | pcomp
a            | DET    | det
startup      | NOUN   | dobj
in           | ADP    | prep
London       | PROPN  | pobj
for          | ADP    | prep
$            | SYM    | quantmod
1            | NUM    | compound
billion      | NUM    | pobj
.            | PUNCT  | punct


### 8. Word Sense Disambiguation
Identifying the correct meaning (sense) of a polysemous word (a word with multiple meanings) in a specific context

In [15]:
# Fetch the WordNet corpus (a no-op when already up to date); the lesk() calls below
# return WordNet synsets.
nltk.download('wordnet')
from nltk.wsd import lesk
# word_tokenize was already imported in the tokenization cell; the repeat is harmless.
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /home/adam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
sentence = "I went to the bank to deposit money."

# lesk() receives the tokenized sentence as context and the ambiguous word to resolve.
context_tokens = word_tokenize(sentence)
synset1 = lesk(context_tokens, "bank")

# First line: synset plus its gloss; second line: the synset on its own.
print(synset1, synset1.definition())
print(synset1)

Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
Synset('depository_financial_institution.n.01')


In [17]:
sentence2 = "The book is full of notes."

# Same procedure as for "bank": tokenized sentence as context, target word to resolve.
context_tokens2 = word_tokenize(sentence2)
synset2 = lesk(context_tokens2, "book")

# First line: synset plus its gloss; second line: the synset on its own.
print(synset2, synset2.definition())
print(synset2)

Synset('book.n.02') physical objects consisting of a number of pages bound together
Synset('book.n.02')


### 9. Translation

In [18]:
from deep_translator import GoogleTranslator

In [19]:
# Examples 1 and 2: translate into Japanese, letting the service detect the source
# language; both calls share one translator instance.
to_japanese = GoogleTranslator(source='auto', target='ja')

translated1 = to_japanese.translate("Hello, how are you?")
print(translated1)

translated2 = to_japanese.translate("Hi")
print(translated2)

# Interactive example: translate whatever the reader types into Simplified Chinese.
# NOTE(review): input() blocks a non-interactive "Restart & Run All", and the output
# depends on what was typed.
user_text = input("Enter text to translate: ")
to_chinese = GoogleTranslator(source='auto', target='zh-CN')
translated_text = to_chinese.translate(user_text)
print(translated_text)

こんにちは お元気ですか？
こんにちは
龙


### 10. Named Entity Recognition
Finding and classifying real-world entities in text, like people, organizations, locations, dates, and monetary values, into predefined categories

In [20]:
# Resources fetched for the NLTK named-entity example below (each call is a no-op when
# the package is already up to date). Presumably the chunker model, the POS tagger and
# the 'words' corpus are all needed by ne_chunk/pos_tag -- verify against the NLTK docs.
nltk.download('maxent_ne_chunker_tab')
# Already downloaded in the part-of-speech tagging section; repeated here.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('words')
# word_tokenize was already imported in the tokenization cell; the repeat is harmless.
from nltk import word_tokenize, pos_tag, ne_chunk

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/adam/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/adam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /home/adam/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [21]:
sentence = "Barack Obama was born in Hawaii."

# Pipeline: raw text -> tokens -> (token, POS) pairs -> tree with entity chunks.
tagged_tokens = pos_tag(word_tokenize(sentence))
tree = ne_chunk(tagged_tokens)
print(tree)

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.)


In [22]:
from spacy import displacy

# Reuses the `doc` produced in the spaCy part-of-speech example above.
print("Entities found:")
for ent in doc.ents:
    print(f"{ent.text:15} | {ent.label_}")

# Render the same entities as highlighted text inside the notebook.
displacy.render(doc, style="ent", jupyter=True)

Entities found:
Google          | ORG
London          | GPE
$1 billion      | MONEY


### 11. Vectorization: Bag of Words (CountVectorizer)
It creates a matrix where each row is a document and each column holds the count of one vocabulary word in that document

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Three tiny "documents"; the TF-IDF section reuses this corpus.
corpus = [
    "NLP is great for text analysis.",
    "Text analysis is a core part of data science.",
    "I love learning about NLP and data science.",
]

# Learn the vocabulary and count each word per document in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Label the columns with the learned vocabulary so the matrix is readable.
vocabulary = vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(X.toarray(), columns=vocabulary)
print(df_bow)

   about  analysis  and  core  data  for  great  is  learning  love  nlp  of  \
0      0         1    0     0     0    1      1   1         0     0    1   0   
1      0         1    0     1     1    0      0   1         0     0    0   1   
2      1         0    1     0     1    0      0   0         1     1    1   0   

   part  science  text  
0     0        0     1  
1     1        1     1  
2     0        1     0  


### 12. Term Frequency-Inverse Document Frequency (TF-IDF)
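TF-IDF refines Bag of Words: it up-weights words that occur in few documents and down-weights words that occur in most of them. In a large corpus this rewards rare, meaningful words (like "NLP") and penalizes common words (like "is"); in this three-sentence corpus both happen to appear in two of the three documents, so they receive the same weight.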

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus as the Bag-of-Words example, now weighted instead of counted.
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(corpus)

# Show the weights (numbers closer to 1 are more "important" words)
tfidf_terms = tfidf_vec.get_feature_names_out()
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)
print(df_tfidf.head())

      about  analysis       and      core      data       for     great  \
0  0.000000  0.366180  0.000000  0.000000  0.000000  0.481482  0.481482   
1  0.000000  0.313316  0.000000  0.411973  0.313316  0.000000  0.000000   
2  0.417567  0.000000  0.417567  0.000000  0.317570  0.000000  0.000000   

         is  learning      love      nlp        of      part   science  \
0  0.366180  0.000000  0.000000  0.36618  0.000000  0.000000  0.000000   
1  0.313316  0.000000  0.000000  0.00000  0.411973  0.411973  0.313316   
2  0.000000  0.417567  0.417567  0.31757  0.000000  0.000000  0.317570   

       text  
0  0.366180  
1  0.313316  
2  0.000000  
