In [6]:
# 1
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# Fetch every NLTK resource this cell relies on, not only the tagger model:
# word_tokenize needs the Punkt tokenizer data and stopwords.words needs the
# stop-word corpus, so a fresh environment would otherwise raise LookupError.
# The "_tab" / "_eng" ids are the names recent NLTK releases look up; older
# releases use the plain ids. Requesting both is harmless.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

text = "Natural Language Processing is a fascinating field of AI."
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))

# Case-insensitive membership test against the English stop-word list.
stop_words_tokens = [word for word in tokens if word.lower() in stop_words]
print("stop_words:", stop_words_tokens)

# Tag the complete sentence first and filter afterwards: the tagger uses the
# neighbouring words as context, so tagging the already-filtered list would
# tag the artificial sequence "is a of" instead of the real sentence.
tagged_tokens = nltk.pos_tag(tokens)
pos_tags = [(word, tag) for word, tag in tagged_tokens if word.lower() in stop_words]
print("POS Tags of stop Words:", pos_tags)

stop_words: ['is', 'a', 'of']
POS Tags of stop Words: [('is', 'VBZ'), ('a', 'DT'), ('of', 'IN')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ashutosh
[nltk_data]     Singhania\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
 
from sklearn.feature_extraction.text import TfidfVectorizer 
# Three short example documents to vectorise.
docs = [
    "Natural Language processing is amazing.",
    "Machine learning and NLP go hand in hand.",
    "TF-IDF helps find important words in a document.",
]

# Fit the vectoriser on the documents and transform them in a single call.
vec = TfidfVectorizer()
tfidf_mat = vec.fit_transform(docs)

# Pull out the learned vocabulary and a dense copy of the scores for display.
feature_names = vec.get_feature_names_out()
dense_scores = tfidf_mat.toarray()

print("Feature Names:", feature_names)
print("TF-IDF Matrix\n", dense_scores)

Feature Names: ['amazing' 'and' 'document' 'find' 'go' 'hand' 'helps' 'idf' 'important'
 'in' 'is' 'language' 'learning' 'machine' 'natural' 'nlp' 'processing'
 'tf' 'words']
TF-IDF Matrix
 [[0.4472136  0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.4472136  0.4472136
  0.         0.         0.4472136  0.         0.4472136  0.
  0.        ]
 [0.         0.32311233 0.         0.         0.32311233 0.64622465
  0.         0.         0.         0.24573525 0.         0.
  0.32311233 0.32311233 0.         0.32311233 0.         0.
  0.        ]
 [0.         0.         0.36325471 0.36325471 0.         0.
  0.36325471 0.36325471 0.36325471 0.27626457 0.         0.
  0.         0.         0.         0.         0.         0.36325471
  0.36325471]]


In [None]:
from nltk import ngrams 
from collections import Counter, defaultdict 
# Demo sentence for the n-gram model.
txt = "Natural language processing is fun and challenging."
N = 2

# Tokens are lower-cased and whitespace-split, so punctuation stays attached
# to the neighbouring word (e.g. "challenging.").
token = txt.lower().split()
n_grams = list(ngrams(token, N))
counts = Counter(n_grams)

# Total occurrences of every (N-1)-token prefix, accumulated in one pass
# instead of rescanning all counts for each n-gram.
prefix_counts = Counter()
for ngram, count in counts.items():
    prefix_counts[ngram[:-1]] += count

# Conditional probability of the last token given its prefix:
# count(n-gram) / count(prefix). Unseen n-grams look up as 0.
model = defaultdict(int)
for ngram, count in counts.items():
    model[ngram] = count / prefix_counts[ngram[:-1]]

print("N-gram Probabilities:", dict(model))


In [None]:
from gensim.models import Word2Vec 
# Tiny pre-tokenised corpus: one list of lower-case tokens per sentence.
sent = [
    ["natural", "language", "processing", "is", "fun"],
    ["machine", "learning", "and", "nlp", "go", "together"],
]

# Training hyper-parameters, gathered in one place and passed through as-is.
w2v_params = dict(vector_size=50, window=3, min_count=1, sg=1)
model = Word2Vec(sent, **w2v_params)

# Look up the learned embedding for a single word and show it.
vector = model.wv["language"]
print("word Vector for 'language':", vector)

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Toy training corpus of short movie-review style sentences.
corpus = [
    "I love this movie",
    "This movie is amazing",
    "I really enjoyed this film",
    "What a fantastic experience",
    "This movie is a masterpiece",
]

# Learn the word -> integer mapping from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One extra slot on top of the vocabulary size
# (presumably for the 0 value used as padding -- confirm against the Keras docs).
total_words = 1 + len(tokenizer.word_index)

# Every prefix of length >= 2 of each encoded sentence becomes one training row:
# the last id of the prefix is the target, everything before it is the context.
encoded_corpus = [tokenizer.texts_to_sequences([sentence])[0] for sentence in corpus]
input_sequences = [
    encoded[:end]
    for encoded in encoded_corpus
    for end in range(2, len(encoded) + 1)
]

# Left-pad all rows to the length of the longest prefix.
max_seq_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')

# Split each padded row: all but the last column are inputs, the last is the label,
# which is then one-hot encoded over the vocabulary.
X, targets = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(targets, num_classes=total_words)


# Seed NumPy and TensorFlow before any weights are created so that weight
# initialisation, batch shuffling and the NumPy-based sampling done later are
# repeatable from run to run (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Next-word classifier: embedded context ids -> two stacked LSTMs -> softmax
# over the vocabulary. The input length is declared with an explicit Input
# instead of Embedding's `input_length` argument, which newer Keras releases
# flag as deprecated.
model = Sequential([
    tf.keras.Input(shape=(max_seq_length - 1,), dtype="int32"),
    Embedding(total_words, 64),
    LSTM(128, return_sequences=True),  # full sequence out, consumed by the next LSTM
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(total_words, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X, y, epochs=100, verbose=1)


def predict_next_words(model, tokenizer, text, n=1, temperature=1.0):
    """Extend ``text`` by sampling up to ``n`` words from the model.

    For each of the ``n`` steps the current text is encoded with
    ``tokenizer``, left-padded to ``max_seq_length - 1`` (a module-level
    value set when the training data was prepared), and passed to
    ``model.predict``. The predicted distribution is re-scaled by
    ``temperature`` and one index is sampled from it with NumPy's global RNG.

    A sampled word is appended only if the tokenizer knows the index and the
    word is not one of the last two words of the text, so fewer than ``n``
    words may be added.

    Args:
        model: Object whose ``predict(sequence, verbose=0)`` returns one
            probability row per input sequence.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        text: Seed text to extend.
        n: Number of sampling steps to run.
        temperature: Positive scaling factor for the log-probabilities;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The seed text with the accepted sampled words appended, space-separated.
    """
    for _ in range(n):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = pad_sequences([sequence], maxlen=max_seq_length-1, padding='pre')
        predictions = model.predict(sequence, verbose=0)[0]

        # Temperature scaling in log space. Work in float64 and subtract the
        # maximum before exponentiating: without that shift every term can
        # underflow to 0 at low temperatures, and the normalisation then
        # divides 0 by 0 and yields NaN probabilities.
        scaled = np.log(np.asarray(predictions, dtype=np.float64) + 1e-7) / temperature
        scaled -= scaled.max()
        probabilities = np.exp(scaled)
        probabilities /= probabilities.sum()

        # Sample over however many classes the model emits rather than
        # relying on a module-level vocabulary size.
        predicted_index = np.random.choice(len(probabilities), p=probabilities)
        output_word = tokenizer.index_word.get(predicted_index, "")

        # Skip unknown indices and immediate repetitions of the last two words.
        if output_word and (output_word not in text.split()[-2:]):
            text += " " + output_word
    return text

# Test the model
# No trailing space on the seed: predict_next_words joins each new word with
# " " itself, so a trailing space would leave a double space in the output.
seed_text = "I really enjoyed"
next_words = 3
generated_text = predict_next_words(model, tokenizer, seed_text, next_words, temperature=0.8)
print(f"Input Text: {seed_text}")
print(f"Generated Text: {generated_text}")


In [None]:
 
import torch 
from torch.utils.data import Dataset, DataLoader 
from transformers import GPT2LMHeadModel, GPT2Tokenizer 
 
# Number of full passes over the dataloader in the training loop.
EPOCHS = 15 
# Mini-batch size handed to the DataLoader.
BATCH_SIZE = 8 
# Learning rate passed to the AdamW optimizer.
LEARNING_RATE = 5e-6 
# Token length every line is truncated/padded to; also the max_length given to generate().
MAX_LENGTH = 75 
 
class TextDataset(Dataset):
    """Pre-tokenised text lines exposed as ``(input_ids, attention_mask)`` pairs.

    Every element of ``text`` is passed to ``tokenizer`` with truncation
    enabled and ``padding="max_length"`` at ``max_length``. The ``input_ids``
    and ``attention_mask`` entries of each result are stored as tensors.
    """

    def __init__(self, text, tokenizer, max_length):
        # Tokenise everything up front so indexing later is a plain list lookup.
        encoded_lines = [
            tokenizer(line, truncation=True, max_length=max_length, padding="max_length")
            for line in text
        ]
        self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded_lines]
        self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded_lines]

    def __len__(self):
        """Number of tokenised lines held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors for line ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask
 
# Sentences used for the fine-tuning demo.
text_data = [
    "The quick brown fox jumps over the lazy dog.",
    "The sun sets in the west and rises in the east.",
    "Artificial Intelligence is transforming the world.",
    "Deep learning models are revolutionizing various industries.",
    "Natural Language Processing is a key area of artificial intelligence.",
    "Machine learning models are data-driven and improve over time.",
    "The future of technology lies in autonomous systems and robotics.",
    "Cloud computing has become the backbone of modern infrastructure.",
]

# Load the pretrained tokenizer and register a padding token only if it has none.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the pretrained model and size its token embeddings to the tokenizer,
# which may have grown by the padding token above.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Tokenise the sentences once and serve them in shuffled mini-batches.
dataset = TextDataset(text_data, tokenizer, max_length=MAX_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
 
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.train()

for epoch in range(EPOCHS):
    for batch in dataloader:
        input_ids, attn_masks = [x.to(device) for x in batch]

        # Every line is padded to MAX_LENGTH, so most positions are padding.
        # Label those positions -100 (the value the model's loss ignores) so
        # the loss is computed on real tokens only instead of being dominated
        # by predicting the pad token.
        labels = input_ids.masked_fill(attn_masks == 0, -100)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attn_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    # Reports the loss of the last batch processed in this epoch.
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {loss.item()}")
 
# Switch to inference mode (disables dropout) before generating.
model.eval()
prompt = "Artificial Intelligence"
encoded_input = tokenizer(prompt, return_tensors='pt', padding=True).to(device)

# Hand generate() the attention mask the tokenizer produced instead of letting
# it infer one, so padded positions are never attended to if prompts are
# ever batched with padding.
generated_ids = model.generate(
    encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'],
    max_length=MAX_LENGTH,
    num_return_sequences=1,
    pad_token_id=tokenizer.pad_token_id,
)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Text:\n", generated_text)

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import torch 
import torch.nn as nn 
 
# Parameters
SEQ_LEN = 5   # number of positions (rows) in the sample sequence
D_MODEL = 4   # feature size of each position (columns)

# Sample Sequence Dataset
sequence = torch.tensor([[1.0, 0.0, 1.0, 0.0],
                         [0.0, 2.0, 0.0, 1.0],
                         [1.0, 1.0, 1.0, 1.0],
                         [0.0, 0.0, 2.0, 1.0],
                         [1.0, 2.0, 0.0, 0.0]])

# Fail fast if the literal above drifts from the declared dimensions.
assert sequence.shape == (SEQ_LEN, D_MODEL), "sequence must be SEQ_LEN x D_MODEL"
 
# Self-Attention Components 
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with learned projections.

    The input is projected to queries, keys and values by three linear layers
    of size ``d_model -> d_model``.
    """

    def __init__(self, d_model):
        super().__init__()
        # Layers are created in the order query, key, value.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Return ``(attention_output, attention_weights)`` for input ``x``.

        The weights are the softmax, over the last dimension, of the
        query-key dot products divided by the square root of the key size.
        The output is those weights applied to the values.
        """
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        scale = np.sqrt(keys.size(-1))
        scores = (queries @ keys.transpose(-2, -1)) / scale
        weights = scores.softmax(dim=-1)
        return weights @ values, weights
 
# Initialize Self-Attention 
# Fix the RNG before the layers are created: their weights are randomly
# initialised, so without a seed the plotted map changes on every run.
torch.manual_seed(0)
self_attention = SelfAttention(D_MODEL)

# Compute Attention Outputs and Weights
attention_output, attention_weights = self_attention(sequence)

# Visualize Attention Map
# Rows are query positions and columns are key positions.
# detach() drops the autograd graph so the weights can be converted to NumPy.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(attention_weights.detach().numpy(), cmap="viridis")
fig.colorbar(heatmap, ax=ax)
ax.set_title("Attention Map")
ax.set_xlabel("Key Positions")
ax.set_ylabel("Query Positions")
plt.show()