In [None]:
!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

# EXP-01( Word Analysis ) 1a

In [48]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Natural Language Processing is a fascinating field of study."

doc = nlp(text)

tokens = [token.text for token in doc]
lemmas = [token.lemma_ for token in doc]

print("Tokens:", tokens)
print("Lemmas:", lemmas)

print("\nDependency Parsing:")

for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,[child for child in token.children])

Tokens: ['Natural', 'Language', 'Processing', 'is', 'a', 'fascinating', 'field', 'of', 'study', '.']
Lemmas: ['Natural', 'Language', 'Processing', 'be', 'a', 'fascinating', 'field', 'of', 'study', '.']

Dependency Parsing:
Natural compound Language PROPN []
Language compound Processing PROPN [Natural]
Processing nsubj is AUX [Language]
is ROOT is AUX [Processing, field, .]
a det field NOUN []
fascinating amod field NOUN []
field attr is AUX [a, fascinating, of]
of prep field NOUN [study]
study pobj of ADP []
. punct is AUX []


# EXP-02(case study for Word Analysis ) 1b


In [49]:
import spacy

nlp = spacy.load("en_core_web_sm")

customer_feedback = [
 "The product is amazing! I love the quality.",
 "The delivery was late, very frustrating."
]

def analyze_feedback(feedback):
    for idx, text in enumerate(feedback, start=1):
        print(f"\nAnalyzing Feedback {idx}: '{text}'")
        doc = nlp(text)
        tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
        print("Tokens:", tokens)
        print("Lemmas:", lemmas)

        print("\nDependency Parsing:")
        for token in doc:
            print(token.text, token.dep_, token.head.text, token.head.pos_,[child for child in token.children])
if __name__ == "__main__":
    analyze_feedback(customer_feedback)



Analyzing Feedback 1: 'The product is amazing! I love the quality.'
Tokens: ['The', 'product', 'is', 'amazing', '!', 'I', 'love', 'the', 'quality', '.']
Lemmas: ['the', 'product', 'be', 'amazing', '!', 'I', 'love', 'the', 'quality', '.']

Dependency Parsing:
The det product NOUN []
product nsubj is AUX [The]
is ROOT is AUX [product, amazing, !]
amazing acomp is AUX []
! punct is AUX []
I nsubj love VERB []
love ROOT love VERB [I, quality, .]
the det quality NOUN []
quality dobj love VERB [the]
. punct love VERB []

Analyzing Feedback 2: 'The delivery was late, very frustrating.'
Tokens: ['The', 'delivery', 'was', 'late', ',', 'very', 'frustrating', '.']
Lemmas: ['the', 'delivery', 'be', 'late', ',', 'very', 'frustrating', '.']

Dependency Parsing:
The det delivery NOUN []
delivery nsubj was AUX [The]
was ROOT was AUX [delivery, frustrating, .]
late advmod frustrating ADJ []
, punct frustrating ADJ []
very advmod frustrating ADJ []
frustrating acomp was AUX [late, ,, very]
. punct was 

# EXP-03(Word Generation ) 2a

In [50]:
!pip install nltk
import nltk
import random

nltk.download('punkt')
nltk.download('gutenberg')

words = nltk.corpus.gutenberg.words()

bigrams = list(nltk.bigrams(words))

starting_word = "the"
generated_text = [starting_word]

for _ in range(20):
    possible_words = [word2 for (word1, word2) in bigrams if word1.lower() == generated_text[-1].lower()]
    next_word = random.choice(possible_words)
    generated_text.append(next_word)
print(' '.join(generated_text))




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


the Lord GOD , whither goest to fear the LORD caused me saw , my father Brown . I see anyone


# EXP-04(case study for word generation) 2b


In [52]:
!pip install transformers
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class EmailAutocompleteSystem:
    def __init__(self):
        self.model_name = "gpt2"
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
        self.model = GPT2LMHeadModel.from_pretrained(self.model_name)
    
    def generate_suggestions(self, user_input, context):
        input_text = f"{context} {user_input}"
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
        with torch.no_grad():
            output = self.model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2)
            generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
            suggestions = generated_text.split()[len(user_input.split()):]
        return suggestions

if __name__ == "__main__":
    autocomplete_system = EmailAutocompleteSystem()
    
    email_context = "Subject: Discussing Project Proposal\nHi [Recipient],"

    while True:
        user_input = input("Enter your sentence (type 'exit' to end): ")

        if user_input.lower() == 'exit':
            break

        suggestions = autocomplete_system.generate_suggestions(user_input, email_context)

        if suggestions:
            print("Autocomplete Suggestions:", suggestions)
        else:
            print("No suggestions available.")


Enter your sentence (type 'exit' to end): hi hoe are you


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Autocomplete Suggestions: ['Hi', '[Recipient],', 'hi', 'hoe', 'are', 'you', 'interested', 'in', 'working', 'on', 'a', 'project', 'that', 'is', 'not', 'a', 'part', 'of', 'the', 'project?', 'I', 'am', 'interested', 'to', 'hear', 'your', 'thoughts', 'on', 'the', 'topic.', 'I', 'have', 'been', 'working', 'with', 'the']
Enter your sentence (type 'exit' to end): exit


# EXP-05(TEXT CLASSIFICATION) 3a


In [53]:
!pip install scikit-learn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
data = {
 'text': [
 'This is a positive sentence',
 'I am happy today',
 'Negative review, very bad service',
 'I do not like this product'
 ],
 'label': ['positive', 'positive', 'negative', 'negative']
}
df = pd.DataFrame(data)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2,
random_state=42)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
svm_classifier = SVC(kernel='linear')

svm_classifier.fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = svm_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.00
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       0.0
    positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# EXP-06(case study for TEXT CLASSIFICATION) 3b


In [54]:
!pip install scikit-learn

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

categories = ['sci.med', 'sci.space', 'comp.graphics', 'talk.politics.mideast']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

model = make_pipeline(TfidfVectorizer(),LinearSVC())

model.fit(X_train, y_train)

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.9504823151125402

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93       389
           1       0.96      0.91      0.94       396
           2       0.98      0.94      0.96       394
           3       0.98      0.98      0.98       376

    accuracy                           0.95      1555
   macro avg       0.95      0.95      0.95      1555
weighted avg       0.95      0.95      0.95      1555



# EXP-07(semantic analysis) 4a


In [55]:
!pip install gensim
!pip install nltk

import gensim.downloader as api
from nltk.tokenize import word_tokenize


word_vectors = api.load("word2vec-google-news-300")

sentences = [
"Natural language processing is a challenging but fascinating field."]

tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

for tokenized_sentence in tokenized_sentences:
    for word in tokenized_sentence:
        if word in word_vectors:
            similar_words = word_vectors.most_similar(word)
            print(f"Words similar to '{word}': {similar_words}")
        else:
            print(f"'{word}' is not in the pre-trained Word2Vec model.")


Words similar to 'natural': [('Splittorff_lacked', 0.636509358882904), ('Natural', 0.58078932762146), ('Mike_Taugher_covers', 0.577259361743927), ('manmade', 0.5276211500167847), ('shell_salted_pistachios', 0.5084421634674072), ('unnatural', 0.5030758380889893), ('naturally', 0.49992606043815613), ('Intraparty_squabbles', 0.4988228678703308), ('Burt_Bees_®', 0.49539363384246826), ('causes_Buxeda', 0.4935200810432434)]
Words similar to 'language': [('langauge', 0.7476695775985718), ('Language', 0.6695356369018555), ('languages', 0.6341332197189331), ('English', 0.6120712757110596), ('CMPB_Spanish', 0.6083104610443115), ('nonnative_speakers', 0.6063109636306763), ('idiomatic_expressions', 0.5889801979064941), ('verb_tenses', 0.58415687084198), ('Kumeyaay_Diegueno', 0.5798824429512024), ('dialect', 0.5724600553512573)]
Words similar to 'processing': [('Processing', 0.7285515666007996), ('processed', 0.6519132852554321), ('processor', 0.636760413646698), ('warden_Dominick_DeRose', 0.616652

In [58]:
!jupyter notebook --generate-config

^C


# EXP-08(case study for semantic analysis) 4b


In [66]:
import nltk
nltk.download('omw-1.4')

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def semantic_analysis(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    synonyms = set()
    
    for token in lemmatized_tokens:
        for syn in wordnet.synsets(token):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())
    return list(synonyms)
customer_queries = [
    "I received a damaged product. Can I get a refund?",
    "I'm having trouble accessing my account.",

]
for query in customer_queries:
    print("Customer Query:", query)
    synonyms = semantic_analysis(query)
    print("Semantic Analysis (Synonyms):", synonyms)
    print("\n")


Customer Query: I received a damaged product. Can I get a refund?
Semantic Analysis (Synonyms): ['amaze', 'refund', 'merchandise', 'perplex', 'stupefy', 'damage', 'capture', 'welcome', 'product', 'receive', 'bewilder', 'buzz_off', 'acquire', 'Cartesian_product', 'suffer', 'scram', 'beat', 'experience', 'get_down', 'begin', 'cause', 'puzzle', 'pick_up', 'fuck_off', 'make', 'pay_off', 'invite', 'encounter', 'take', 'incur', 'grow', 'nonplus', 'bring_forth', 'have', 'sustain', 'generate', 'mystify', 'damaged', 'discredited', 'vex', 'stimulate', 'start_out', 'take_in', 'become', 'contract', 'mother', 'baffle', 'aim', 'bugger_off', 'commence', 'beget', 'pay_back', 'pose', 'start', 'production', 'gravel', 'give_back', 'get', 'let', 'received', 'go', 'induce', 'develop', 'father', 'arrest', 'bring', 'fix', 'dumbfound', 'repay', 'flummox', 'meet', 'draw', 'fetch', "get_under_one's_skin", 'catch', 'obtain', 'return', 'set_about', 'produce', 'ware', 'set_out', 'engender', 'drive', 'find', 'sire'

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# EXP-09 (sentiment analysis) 5a

In [67]:
!pip install scikit-learn
!pip install nltk

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import movie_reviews 

import nltk
nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
 for category in movie_reviews.categories()
 for fileid in movie_reviews.fileids(category)]

df = pd.DataFrame(documents, columns=['text', 'sentiment'])

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2,random_state=42)
tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.apply(' '.join))

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

X_test_tfidf = tfidf_vectorizer.transform(X_test.apply(' '.join))

y_pred = svm_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print(classification_report(y_test, y_pred))



[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Accuracy: 0.84
              precision    recall  f1-score   support

         neg       0.83      0.85      0.84       199
         pos       0.85      0.82      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



# EXP-10(case study for sentiment analysis) 5b


In [68]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

reviews = [
"This product is amazing! I love it.",
"The product was good, but the packaging was damaged.",
"Very disappointing experience. Would not recommend.",
"Neutral feedback on the product.",
]

sid = SentimentIntensityAnalyzer()

for review in reviews:
    print("Review:", review)
    scores = sid.polarity_scores(review)
print("Sentiment:", end=' ')
if scores['compound'] > 0.05:
    print("Positive")
elif scores['compound'] < -0.05:
    print("Negative")
else:
    print("Neutral")
    print()


Review: This product is amazing! I love it.
Review: The product was good, but the packaging was damaged.
Review: Very disappointing experience. Would not recommend.
Review: Neutral feedback on the product.
Sentiment: Neutral



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# EXP-11(pos tagging) 6a

In [69]:
!pip install nltk

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

text = "Parts of speech tagging helps to understand the function of each word in a sentence."

tokens = nltk.word_tokenize(text)

pos_tags = nltk.pos_tag(tokens)

print("POS tags:", pos_tags)

POS tags: [('Parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('tagging', 'VBG'), ('helps', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('the', 'DT'), ('function', 'NN'), ('of', 'IN'), ('each', 'DT'), ('word', 'NN'), ('in', 'IN'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# EXP-12(case study for pos tagging) 6b

In [70]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tagging(text):
    sentences = sent_tokenize(text)
    tagged_tokens = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tagged_tokens.extend(nltk.pos_tag(tokens))
    return tagged_tokens

def main():
    article_text = """
    The victory boosts United's chances in the Premier League title race.
    """
    tagged_tokens = pos_tagging(article_text)
    print("Original Article Text:\n", article_text)
    print("\nParts of Speech Tagging:")
    for token, pos_tag in tagged_tokens:
        print(f"{token}: {pos_tag}")

if __name__ == "__main__":
    main()


Original Article Text:
 
    The victory boosts United's chances in the Premier League title race.
    

Parts of Speech Tagging:
The: DT
victory: NN
boosts: VBZ
United: NNP
's: POS
chances: NNS
in: IN
the: DT
Premier: NNP
League: NNP
title: NN
race: NN
.: .


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# EXP-13(Chunking) 7a

In [71]:
!pip install nltk
import nltk
from nltk import RegexpParser
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

sentence = "The quick brown fox jumps over the lazy dog"

tokens = word_tokenize(sentence)

tagged = pos_tag(tokens)


chunk_grammar = r"""
 NP: {<DT>?<JJ>*<NN>}
"""

chunk_parser = RegexpParser(chunk_grammar)


chunks = chunk_parser.parse(tagged)

for subtree in chunks.subtrees():
    if subtree.label() == 'NP':
        print(subtree)

(NP The/DT quick/JJ brown/NN)
(NP fox/NN)
(NP the/DT lazy/JJ dog/NN)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# EXP-14(case study for chunking) 7b

In [72]:
import nltk
import os

nltk.data.path.append("/usr/local/share/nltk_data")

nltk.download('punkt')

nltk.download('averaged_perceptron_tagger')

text = "The quick brown fox jumps over the lazy dog."

words = nltk.word_tokenize(text)

pos_tags = nltk.pos_tag(words)

chunk_grammar = r"""
NP: {<DT>?<JJ>*<NN>} # Chunk sequences of DT, JJ, NN
"""

chunk_parser = nltk.RegexpParser(chunk_grammar)

chunked_text = chunk_parser.parse(pos_tags)

noun_phrases = []
for subtree in chunked_text.subtrees(filter=lambda t: t.label() =='NP'):
    noun_phrases.append(' '.join(word for word, tag in subtree.leaves()))

print("Original Text:", text)
print("Noun Phrases:")
for phrase in noun_phrases:
    print("-", phrase)

Original Text: The quick brown fox jumps over the lazy dog.
Noun Phrases:
- The quick brown
- fox
- the lazy dog


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
