# Experiment 2
## Perform Preprocessing steps in NLP.
_Sub-objectives_
1. Apply basic preprocessing: tokenization, stopword removal, etc.
2. Understand the importance of text normalization.
3. Compare NLP preprocessing tools (e.g., NLTK, spaCy).

In [None]:
# Sample English sentence shared by the NLTK and spaCy cells below.
text = (
    "Natural Language Processing (NLP) enables computers "
    "to understand human language. It's fascinating!"
)

In [None]:
# One-time setup: uncomment and run the lines below to install the required packages.
# !pip install nltk spacy
# !python -m spacy download en_core_web_sm
# !pip install indic-nlp-library
# !pip install stanza

In [None]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NLTK preprocessing pipeline for the English sample in `text`:
# lowercase -> tokenize -> drop punctuation -> drop stopwords -> lemmatize.

# Download the NLTK resources used by the steps below.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Lowercase (rebinds `text`, so later cells see the lowercased sentence)
text = text.lower()

# Tokenization
tokens = word_tokenize(text)

# Remove punctuation.
# `word not in string.punctuation` would be a *substring* test against one
# long string, so multi-character punctuation tokens (e.g. the tokenizer's
# `` and '' quote tokens, or "...") would slip through. Compare character by
# character instead and drop tokens made up only of punctuation.
punctuation_chars = set(string.punctuation)
tokens = [
    word for word in tokens
    if not all(char in punctuation_chars for char in word)
]

# Stopword removal (a set gives O(1) membership checks)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Lemmatization. No POS tag is passed to lemmatize(), so the lemmatizer's
# default part of speech is used for every token.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("NLTK Tokens:", lemmatized_tokens)

In [None]:
import spacy

# spaCy counterpart of the NLTK pipeline: the model tokenizes the text and
# exposes stopword / punctuation flags and a lemma on every token.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on the lowercased sentence.
doc = nlp(text.lower())

# Keep the lemma of each token that is neither a stopword nor punctuation.
tokens = [
    token.lemma_
    for token in doc
    if not (token.is_stop or token.is_punct)
]

print("spaCy Tokens:", tokens)

In [None]:
from indicnlp.tokenize import indic_tokenize
from nltk.corpus import stopwords
import string

# Marathi preprocessing with the Indic NLP tokenizer:
# tokenize -> drop punctuation -> drop stopwords.

# Sample Marathi text
text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."

# Tokenization using Indic NLP
tokens = indic_tokenize.trivial_tokenize(text)

# Define Marathi stopwords (a small hand-written list; extend or replace it
# with a fuller list as needed)
marathi_stopwords = {
    'आणि', 'होते', 'तो', 'ती', 'ते', 'ची', 'च्या', 'करून', 'आहे', 'या', 'असणे', 'साठी', 'म्हणून',
}

# Remove punctuation.
# string.punctuation only covers ASCII, so the Devanagari danda (।) and
# double danda (॥) are added explicitly. Tokens are checked character by
# character, because `word not in string.punctuation` is a substring test
# that lets multi-character punctuation tokens through.
punctuation_chars = set(string.punctuation) | {'।', '॥'}
tokens = [
    word for word in tokens
    if not all(char in punctuation_chars for char in word)
]

# Remove stopwords
filtered_tokens = [word for word in tokens if word not in marathi_stopwords]

print("Tokens after stopword removal:", filtered_tokens)


In [None]:
import stanza

# Fetch the Marathi model, then build a pipeline for it.
stanza.download('mr')  # Marathi model
nlp = stanza.Pipeline('mr')

# Same Marathi sample sentence as in the Indic NLP cell.
text = "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजावून देण्याची क्षमता देते."
doc = nlp(text)

# Walk every word of every sentence and print its surface form, lemma and
# universal POS tag, one word per line.
all_words = (word for sentence in doc.sentences for word in sentence.words)
for word in all_words:
    print(f"Word: {word.text}\tLemma: {word.lemma}\tPOS: {word.upos}")