In [8]:
# Intended to strip metadata from every notebook under /content, rewriting each file in place.
# NOTE(review): the recorded output below is nbconvert's usage/help text rather than a
# conversion log, which suggests no notebook was actually processed (possibly the glob
# matched nothing) — confirm before relying on this cell.
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True --inplace /content/*.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [9]:
# Sample English sentence; the NLTK and spaCy preprocessing cells read it via `text`.
text = "Natural Language Processing (NLP) enables computers to understand human language. It's fascinating!"

In [10]:
# One-time environment setup — uncomment and run on a fresh runtime.
# %pip install nltk spacy stanza indic-nlp-library
# !python -m spacy download en_core_web_sm

In [11]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this cell relies on: the tokenizer models
# ('punkt', 'punkt_tab'), the stopword corpus and WordNet for the lemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Lower-case the sample text, then split it into word tokens.
text = text.lower()
tokens = word_tokenize(text)

# Drop tokens that consist solely of punctuation characters. Each character is
# checked against a set: `word not in string.punctuation` would be a substring
# test on a str, which lets multi-character tokens such as "..." or "--" slip
# through. Mixed tokens like "'s" are still kept.
punctuation_chars = set(string.punctuation)
tokens = [word for word in tokens if not all(ch in punctuation_chars for ch in word)]

# Remove English stopwords.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Lemmatize the remaining tokens with WordNetLemmatizer's default settings.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("NLTK Tokens:", lemmatized_tokens)

NLTK Tokens: ['natural', 'language', 'processing', 'nlp', 'enables', 'computer', 'understand', 'human', 'language', "'s", 'fascinating']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline and run it over the lower-cased text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text.lower())

# Keep the lemma of every token that is neither a stopword nor punctuation.
tokens = [tok.lemma_ for tok in doc if not (tok.is_stop or tok.is_punct)]

print("spaCy Tokens:", tokens)

spaCy Tokens: ['natural', 'language', 'processing', 'nlp', 'enable', 'computer', 'understand', 'human', 'language', 'fascinating']


In [13]:
from indicnlp.tokenize import indic_tokenize
from nltk.corpus import stopwords
import string

# Marathi sample sentence processed by this cell.
text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."

# Tokenization using Indic NLP
tokens = indic_tokenize.trivial_tokenize(text)

# Defining Marathi stopwords
marathi_stopwords = {
    'आणि', 'होते', 'तो', 'ती', 'ते', 'ची', 'च्या', 'करून', 'आहे', 'या', 'असणे', 'साठी', 'म्हणून'
}

# Remove punctuation. Tokens made up entirely of punctuation characters are
# dropped; each character is checked against a set because
# `word not in string.punctuation` is only a substring test on an ASCII str.
# The Devanagari danda and double danda are included since string.punctuation
# covers ASCII punctuation only.
marathi_punctuation = set(string.punctuation) | {'।', '॥'}
tokens = [word for word in tokens if not all(ch in marathi_punctuation for ch in word)]

# Remove stopwords
filtered_tokens = [word for word in tokens if word not in marathi_stopwords]

print("Tokens after stopword removal:", filtered_tokens)

Tokens after stopword removal: ['नैसर्गिक', 'भाषा', 'प्रक्रिया', 'संगणकांना', 'मानवी', 'भाषा', 'समजावून', 'देण्याची', 'क्षमता', 'देते']


In [14]:
import stanza

# Fetch the Marathi resources, then build the default Stanza pipeline for 'mr'.
# NOTE(review): the recorded log shows depparse, sentiment and ner being loaded as
# well, while this cell only reads text, lemma and upos — consider restricting the
# processors if nothing else uses them.
stanza.download(lang='mr')  # Marathi model
nlp = stanza.Pipeline(lang='mr')

# Marathi sample sentence to annotate.
text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."
doc = nlp(text)

# Print one line per word: surface form, lemma and UPOS tag.
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"Word: {word.text}\tLemma: {word.lemma}\tPOS: {word.upos}")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: mr (Marathi) ...
INFO:stanza:File exists: /root/stanza_resources/mr/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: mr (Marathi):
| Processor | Package       |
-----------------------------
| tokenize  | ufal          |
| mwt       | ufal          |
| pos       | ufal_charlm   |
| lemma     | ufal_nocharlm |
| depparse  | ufal_charlm   |
| sentiment | l3cube_charlm |
| ner       | l3cube        |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Word: नैसर्गिक	Lemma: नैसर्गिक	POS: ADJ
Word: भाषा	Lemma: भाष	POS: NOUN
Word: प्रक्रिया	Lemma: प्रक्रा	POS: NOUN
Word: संगणकांना	Lemma: संगणक	POS: NOUN
Word: मानवी	Lemma: मानवी	POS: ADJ
Word: भाषा	Lemma: भाष	POS: NOUN
Word: समजावून	Lemma: समजावणे	POS: VERB
Word: देण्या	Lemma: देणे	POS: VERB
Word: ची	Lemma: चा	POS: ADP
Word: क्षमता	Lemma: क्षमता	POS: NOUN
Word: देते	Lemma: देणे	POS: VERB
Word: .	Lemma: .	POS: PUNCT
