In [1]:
text = "Natural Language Processing (NLP) enables computers to understand human language. It's fascinating!"

In [2]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm
!pip install indic-nlp-library
!pip install nltk
!pip install stanza

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

text = text.lower()
tokens = word_tokenize(text)
tokens = [word for word in tokens if word not in string.punctuation]

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("NLTK Tokens:", lemmatized_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


NLTK Tokens: ['natural', 'language', 'processing', 'nlp', 'enables', 'computer', 'understand', 'human', 'language', "'s", 'fascinating']


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp(text.lower())
tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

print("spaCy Tokens:", tokens)

spaCy Tokens: ['natural', 'language', 'processing', 'nlp', 'enable', 'computer', 'understand', 'human', 'language', 'fascinating']


In [5]:
from indicnlp.tokenize import indic_tokenize
from nltk.corpus import stopwords
import string

text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."

# Tokenization using Indic NLP
tokens = indic_tokenize.trivial_tokenize(text)

# Defining Marathi stopwords
marathi_stopwords = set([
    'आणि', 'होते', 'तो', 'ती', 'ते', 'ची', 'च्या', 'करून', 'आहे', 'या', 'असणे', 'साठी', 'म्हणून'
])

# Remove punctuation
tokens = [word for word in tokens if word not in string.punctuation]

# Remove stopwords
filtered_tokens = [word for word in tokens if word not in marathi_stopwords]

print("Tokens after stopword removal:", filtered_tokens)

Tokens after stopword removal: ['नैसर्गिक', 'भाषा', 'प्रक्रिया', 'संगणकांना', 'मानवी', 'भाषा', 'समजावून', 'देण्याची', 'क्षमता', 'देते']


In [6]:
import stanza

stanza.download('mr')  # Marathi model
nlp = stanza.Pipeline('mr')

text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."
doc = nlp(text)

for sentence in doc.sentences:
    for word in sentence.words:
        print(f"Word: {word.text}\tLemma: {word.lemma}\tPOS: {word.upos}")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: mr (Marathi) ...


Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/mr/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: mr (Marathi):
| Processor | Package       |
-----------------------------
| tokenize  | ufal          |
| mwt       | ufal          |
| pos       | ufal_charlm   |
| lemma     | ufal_nocharlm |
| depparse  | ufal_charlm   |
| sentiment | l3cube_charlm |
| ner       | l3cube        |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Word: नैसर्गिक	Lemma: नैसर्गिक	POS: ADJ
Word: भाषा	Lemma: भाष	POS: NOUN
Word: प्रक्रिया	Lemma: प्रक्रा	POS: NOUN
Word: संगणकांना	Lemma: संगणक	POS: NOUN
Word: मानवी	Lemma: मानवी	POS: ADJ
Word: भाषा	Lemma: भाष	POS: NOUN
Word: समजावून	Lemma: समजावणे	POS: VERB
Word: देण्या	Lemma: देणे	POS: VERB
Word: ची	Lemma: चा	POS: ADP
Word: क्षमता	Lemma: क्षमता	POS: NOUN
Word: देते	Lemma: देणे	POS: VERB
Word: .	Lemma: .	POS: PUNCT
