<a href="https://colab.research.google.com/github/Yechinalokesh/app/blob/main/WSMA_EXP_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Install nltk if not already installed
!pip install nltk

# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.probability import FreqDist

# Download datasets required by nltk.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older 'punkt' / 'averaged_perceptron_tagger' packages, so both
# variants are fetched to keep the notebook working on either version.
nltk_resources = [
    'punkt',                           # To split text into words (older NLTK)
    'punkt_tab',                       # To split text into words (newer NLTK)
    'stopwords',                       # To remove common words like "is", "the"
    'wordnet',                         # For lemmatization
    'averaged_perceptron_tagger',      # For tagging grammar roles (older NLTK)
    'averaged_perceptron_tagger_eng',  # For tagging grammar roles (newer NLTK)
]

# nltk.download() returns False when a package could not be fetched, so the
# result is checked instead of announcing success unconditionally.
failed_downloads = [name for name in nltk_resources if not nltk.download(name, quiet=True)]

if failed_downloads:
    # Best effort: a missing alternate package is not necessarily fatal, so warn
    # and let the steps below fail loudly only if they really need it.
    print("Warning: could not download NLTK resources:", ", ".join(failed_downloads), "\n")
else:
    print("All libraries imported and datasets downloaded successfully!\n")

# Sample sentence that every later step of the pipeline operates on
text = (
    "Natural Language Processing is a field of computer science, "
    "artificial intelligence and computational linguistics."
)
# Heading first, then the sentence followed by a blank separator line
print("Original Text:", f"{text} \n", sep="\n")

# Step 1 – Tokenization: break the sentence into word and punctuation tokens
tokens = word_tokenize(text)
# Heading first, then the token list followed by a blank separator line
print("Tokens:", f"{tokens} \n", sep="\n")

# Step 2 – Stopword Elimination: drop very common words such as "is", "a", "of"
stop_words = set(stopwords.words('english'))
# Tokens are compared in lower case but kept in their original spelling
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))
print("After Stopword Removal:", f"{filtered_tokens} \n", sep="\n")

# Step 3 – Stemming: run the Porter stemmer over every remaining token
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_tokens))
print("After Stemming:", f"{stemmed_words} \n", sep="\n")

# Step 4 – Lemmatization: look up each remaining token with the WordNet lemmatizer
# (called with its default part-of-speech setting)
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))
print("After Lemmatization:", f"{lemmatized_words} \n", sep="\n")

# Step 5 – POS Tagging: pair every remaining token with its part-of-speech tag
pos_tags = pos_tag(filtered_tokens)
# Heading first, then the (token, tag) pairs followed by a blank separator line
print("POS Tags:", f"{pos_tags} \n", sep="\n")

# Step 6 – Lexical Analysis: Count frequency of each word
# Punctuation tokens such as "," and "." survive stopword removal but are not
# words, so tokens without any letter or digit are skipped. Words are
# lower-cased so that e.g. "Natural" and "natural" are counted together,
# matching the case-insensitive comparison used in the stopword step.
freq = FreqDist(
    word.lower()
    for word in filtered_tokens
    if any(ch.isalnum() for ch in word)
)
print("Word Frequency:")
print(freq.most_common())


All libraries imported and datasets downloaded successfully!

Original Text:
Natural Language Processing is a field of computer science, artificial intelligence and computational linguistics. 

Tokens:
['Natural', 'Language', 'Processing', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', '.'] 

After Stopword Removal:
['Natural', 'Language', 'Processing', 'field', 'computer', 'science', ',', 'artificial', 'intelligence', 'computational', 'linguistics', '.'] 

After Stemming:
['natur', 'languag', 'process', 'field', 'comput', 'scienc', ',', 'artifici', 'intellig', 'comput', 'linguist', '.'] 

After Lemmatization:
['Natural', 'Language', 'Processing', 'field', 'computer', 'science', ',', 'artificial', 'intelligence', 'computational', 'linguistics', '.'] 

POS Tags:
[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('field', 'NN'), ('computer', 'NN'), ('science', 'NN'), (',', ','), ('artificial', 'JJ')