## 1. Basics of NLP:
   - Introduction to NLP and its applications
   - Key concepts:
     - Tokenization: Breaking text into smaller units (e.g., words, phrases)
     - Lemmatization: Reducing words to their base or root forms
     - Stemming: Reducing words to their stem or root form
     - Stop words: Common words (e.g., "the," "a") often removed during preprocessing
     - Part-of-speech tagging: Assigning grammatical categories to words (e.g., noun, verb)
     - Named entity recognition (NER): Identifying and classifying entities in text (e.g., names of people, organizations, locations)
     - Syntactic parsing: Analyzing the grammatical structure of sentences
     - Semantic analysis: Understanding the meaning of text


In [27]:
# Importing necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag, ne_chunk

In [10]:
# Downloading necessary resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

True

In [35]:
# Sample text
text = "Natural language processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and humans through natural language."


In [36]:
# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.']


In [37]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmas:", lemmas)

Lemmas: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'that', 'focus', 'on', 'the', 'interaction', 'between', 'computer', 'and', 'human', 'through', 'natural', 'language', '.']


In [38]:
# Stemming
stemmer = PorterStemmer()
stems = [stemmer.stem(token) for token in tokens]
print("Stems:", stems)

Stems: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'a', 'subfield', 'of', 'artifici', 'intellig', 'that', 'focus', 'on', 'the', 'interact', 'between', 'comput', 'and', 'human', 'through', 'natur', 'languag', '.']


In [39]:
# Stop word removal
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word.lower() not in stop_words]
print("Filtered words:", filtered_words)

Filtered words: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'subfield', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'humans', 'natural', 'language', '.']


In [45]:
# Part-of-speech tagging
pos_tags = pos_tag(tokens)
print("Part-of-speech:", pos_tags)

Part-of-speech: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('subfield', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('that', 'WDT'), ('focuses', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('interaction', 'NN'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('humans', 'NNS'), ('through', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('.', '.')]


In [46]:
# Named entity recognition
named_entities = ne_chunk(pos_tags)

# Output the named entities
for entity in named_entities:
    if hasattr(entity, 'label'):
        print(f"Entity: {' '.join(word for word, tag in entity)} - Label: {entity.label()}")


Entity: NLP - Label: ORGANIZATION


## 2. Text Preprocessing:
   - Cleaning and normalizing text data: Removing punctuation, numbers, and special characters; lowercasing text
   - Tokenization: Breaking text into tokens (e.g., words, phrases)
   - Stop word removal: Removing common words (e.g., "the," "a") that don't provide much meaning
   - Stemming and lemmatization: Reducing words to their base or root forms


In [50]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [51]:
# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
# Example text
text = "I love to eat apples. They taste like candy. I bought 5 kg of apples yesterday."

In [53]:
def clean_and_normalize_text(text):
    # Removing Punctuation, Numbers, and Special Characters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Lowercasing
    text = text.lower()

    # Removing URLs
    text = re.sub(r'http\S+', '', text)

    # Expanding Contractions (optional, requires additional data or packages)
    # text = expand_contractions(text)

    return text

# Clean and normalize text
cleaned_text = clean_and_normalize_text(text)

# Print the results
print("Original Text:", text)
print("Cleaned and Normalized Text:", cleaned_text)

Original Text: I love to eat apples. They taste like candy. I bought 5 kg of apples yesterday.
Cleaned and Normalized Text: i love to eat apples they taste like candy i bought  kg of apples yesterday


In [54]:
def tokenize_text(text):
    # Word Tokenization
    words = word_tokenize(text)

    # Sentence Tokenization (optional)
    # sentences = sent_tokenize(text)

    return words

# Tokenize text
tokens = tokenize_text(cleaned_text)

# Print the results
print("Tokens:", tokens)

Tokens: ['i', 'love', 'to', 'eat', 'apples', 'they', 'taste', 'like', 'candy', 'i', 'bought', 'kg', 'of', 'apples', 'yesterday']


In [55]:
def remove_stopwords(words):
    # Removing Stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    return filtered_words

# Remove stopwords
tokens_without_stopwords = remove_stopwords(tokens)

# Print the results
print("Tokens without Stopwords:", tokens_without_stopwords)

Tokens without Stopwords: ['love', 'eat', 'apples', 'taste', 'like', 'candy', 'bought', 'kg', 'apples', 'yesterday']


In [56]:
def stem_words(words):
    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]

    return stemmed_words

# Stem words
stemmed_tokens = stem_words(tokens_without_stopwords)

# Print the results
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['love', 'eat', 'appl', 'tast', 'like', 'candi', 'bought', 'kg', 'appl', 'yesterday']


In [57]:

def lemmatize_words(words):
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]

    return lemmatized_words

# Lemmatize words
lemmatized_tokens = lemmatize_words(stemmed_tokens)

# Print the results
print("Lemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['love', 'eat', 'appl', 'tast', 'like', 'candi', 'buy', 'kg', 'appl', 'yesterday']


In [59]:
import pandas as pd

# Sample dataset of text documents
data = {
    'id': [1, 2, 3],
    'text': [
        "I love to eat apples. They taste like candy. I bought 5 kg of apples yesterday.",
        "Python is an awesome programming language. It is widely used in data science.",
        "Today is a beautiful day. The weather is perfect for a picnic in the park."
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Apply text preprocessing techniques to the 'text' column
df['cleaned_text'] = df['text'].apply(clean_and_normalize_text)
df['tokens'] = df['cleaned_text'].apply(tokenize_text)
df['tokens_without_stopwords'] = df['tokens'].apply(remove_stopwords)
df['stemmed_tokens'] = df['tokens_without_stopwords'].apply(stem_words)
df['lemmatized_tokens'] = df['stemmed_tokens'].apply(lemmatize_words)

# Print the DataFrame
print(df)


   id                                               text  \
0   1  I love to eat apples. They taste like candy. I...   
1   2  Python is an awesome programming language. It ...   
2   3  Today is a beautiful day. The weather is perfe...   

                                        cleaned_text  \
0  i love to eat apples they taste like candy i b...   
1  python is an awesome programming language it i...   
2  today is a beautiful day the weather is perfec...   

                                              tokens  \
0  [i, love, to, eat, apples, they, taste, like, ...   
1  [python, is, an, awesome, programming, languag...   
2  [today, is, a, beautiful, day, the, weather, i...   

                            tokens_without_stopwords  \
0  [love, eat, apples, taste, like, candy, bought...   
1  [python, awesome, programming, language, widel...   
2  [today, beautiful, day, weather, perfect, picn...   

                                      stemmed_tokens  \
0  [love, eat, appl, tast, l

In [60]:
df

Unnamed: 0,id,text,cleaned_text,tokens,tokens_without_stopwords,stemmed_tokens,lemmatized_tokens
0,1,I love to eat apples. They taste like candy. I...,i love to eat apples they taste like candy i b...,"[i, love, to, eat, apples, they, taste, like, ...","[love, eat, apples, taste, like, candy, bought...","[love, eat, appl, tast, like, candi, bought, k...","[love, eat, appl, tast, like, candi, buy, kg, ..."
1,2,Python is an awesome programming language. It ...,python is an awesome programming language it i...,"[python, is, an, awesome, programming, languag...","[python, awesome, programming, language, widel...","[python, awesom, program, languag, wide, use, ...","[python, awesom, program, languag, wide, use, ..."
2,3,Today is a beautiful day. The weather is perfe...,today is a beautiful day the weather is perfec...,"[today, is, a, beautiful, day, the, weather, i...","[today, beautiful, day, weather, perfect, picn...","[today, beauti, day, weather, perfect, picnic,...","[today, beauti, day, weather, perfect, picnic,..."


## 3. Text Representation:
   - Bag of Words (BoW) model: Representing text as a collection of word counts
   - TF-IDF (Term Frequency-Inverse Document Frequency): A numerical statistic that reflects the importance of a word in a document relative to a collection of documents
   - Word embeddings: Representing words in a continuous vector space (e.g., Word2Vec, GloVe, BERT)


In [61]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
# Tokenization
documents = ["I love cats", "I love dogs"]
tokens = [word_tokenize(doc.lower()) for doc in documents]

In [63]:
# Stopword Removal and Stemming/Lemmatization
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [64]:
cleaned_tokens = []
for doc_tokens in tokens:
    filtered_tokens = [ps.stem(token) for token in doc_tokens if token not in stop_words]
    cleaned_tokens.append(filtered_tokens)

In [67]:
# Bag of Words (BoW) model
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform([' '.join(doc) for doc in cleaned_tokens])
bow_features = vectorizer.get_feature_names_out()

print("Bag of Words (BoW) model:")
print(bow_features)
print(bow_matrix.toarray())

Bag of Words (BoW) model:
['cat' 'dog' 'love']
[[1 0 1]
 [0 1 1]]


In [68]:
# TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(doc) for doc in cleaned_tokens])
tfidf_features = tfidf_vectorizer.get_feature_names_out()

print("\nTF-IDF (Term Frequency-Inverse Document Frequency):")
print(tfidf_features)
print(tfidf_matrix.toarray())



TF-IDF (Term Frequency-Inverse Document Frequency):
['cat' 'dog' 'love']
[[0.81480247 0.         0.57973867]
 [0.         0.81480247 0.57973867]]


## 4. Named Entity Recognition (NER):
   - Identifying and classifying entities in text (e.g., names of people, organizations, locations)
   - NER libraries and tools (e.g., spaCy, Stanford NER)


In [69]:
# Step 1: Install NLTK
# If you haven't installed NLTK, you can install it using pip:
# pip install nltk

# Step 2: Import the necessary libraries
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk import ne_chunk

# Step 3: Download the necessary NLTK resources (if not already downloaded)
# You need to download the 'punkt' tokenizer and 'maxent_ne_chunker' for the Named Entity Recognition (NER) to work properly.
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Step 4: Define a sample text to be analyzed
text = "Apple is a technology company headquartered in Cupertino, California. It was founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne."

# Step 5: Tokenize the text into words
words = word_tokenize(text)

# Step 6: Part-of-Speech (POS) tagging
# POS tagging is a step that assigns a part of speech to each token (word) in the text (e.g., noun, verb, adjective).
tagged_words = pos_tag(words)

# Step 7: Apply Named Entity Recognition (NER)
# NER is a step that identifies and classifies named entities (e.g., names of people, organizations, locations) in the text.
named_entities = ne_chunk(tagged_words)

# Step 8: Print the named entities
# This step prints the identified named entities along with their types.
for entity in named_entities:
    if isinstance(entity, nltk.Tree):
        print(f'Named Entity: {" ".join([x[0] for x in entity])}, Type: {entity.label()}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Named Entity: Apple, Type: GPE
Named Entity: Cupertino, Type: GPE
Named Entity: California, Type: GPE
Named Entity: Steve Jobs, Type: PERSON
Named Entity: Steve Wozniak, Type: PERSON
Named Entity: Ronald Wayne, Type: PERSON


## 5. Text Classification:
   - Supervised learning: Using labeled data to train models to classify text into predefined categories (e.g., sentiment analysis, spam detection)
   - Algorithms: Naive Bayes, Support Vector Machines (SVM), Random Forest, Gradient Boosting, Neural Networks


In [70]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [71]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [72]:
# Load and preprocess the data
data = [
    ('I am happy', 'positive'),
    ('I am sad', 'negative'),
    ('I feel great', 'positive'),
    ('I feel terrible', 'negative'),
    ('I am not happy', 'negative'),
    ('I am not sad', 'positive')
]

In [74]:
# Extract features (X) and labels (y)
X = [text for text, label in data]
y = [label for text, label in data]

In [77]:
# Tokenize, remove stopwords, and lemmatize each text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
X_processed = []

for text in X:
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    X_processed.append(' '.join(tokens))

In [79]:
# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [80]:
# Step 5: Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [81]:
# Step 6: Train a classifier (Multinomial Naive Bayes)
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

In [82]:
# Step 7: Evaluate the classifier
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
    positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



## 6. Sentiment Analysis:
   - Analyzing text to determine sentiment (positive, negative, neutral)
   - Sentiment lexicons and dictionaries (e.g., VADER, SentiWordNet)


In [84]:
import nltk
nltk.download('vader_lexicon')  # Download the VADER lexicon for sentiment analysis
nltk.download('movie_reviews')  # Download the movie reviews dataset for training a classifier (optional)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [85]:
# Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer

# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Define a sample text
text = "I love NLTK! It's the best library ever!"

# Perform sentiment analysis
sentiment = sia.polarity_scores(text)

# Print the sentiment scores
print(sentiment)


{'neg': 0.0, 'neu': 0.358, 'pos': 0.642, 'compound': 0.8745}


In [86]:
# Custom Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer

# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Define a sample text
text = "I love NLTK! It's the best library ever!"

# Perform sentiment analysis
sentiment = sia.polarity_scores(text)

# Print the sentiment scores
print(sentiment)


{'neg': 0.0, 'neu': 0.358, 'pos': 0.642, 'compound': 0.8745}


In [87]:
# Sentiment Analysis with Custom Text
from nltk.sentiment import SentimentIntensityAnalyzer

# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Define a sample text
text = "I love NLTK! It's the best library ever!"

# Perform sentiment analysis
sentiment = sia.polarity_scores(text)

# Print the sentiment scores
print(sentiment)


{'neg': 0.0, 'neu': 0.358, 'pos': 0.642, 'compound': 0.8745}


In [88]:
# Sentiment Analysis on Multiple Texts
from nltk.sentiment import SentimentIntensityAnalyzer

# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Define a list of sample texts
texts = [
    "I love NLTK! It's the best library ever!",
    "NLTK is amazing!",
    "I hate NLTK. It's terrible."
]

# Perform sentiment analysis on each text
for text in texts:
    sentiment = sia.polarity_scores(text)
    print(sentiment)



{'neg': 0.0, 'neu': 0.358, 'pos': 0.642, 'compound': 0.8745}
{'neg': 0.0, 'neu': 0.328, 'pos': 0.672, 'compound': 0.6239}
{'neg': 0.773, 'neu': 0.227, 'pos': 0.0, 'compound': -0.7783}


In [89]:
# Custom Sentiment Analysis with NLTK: using movie reviews dataset

import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

# Get the movie reviews dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Define the featureset (bag of words)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]

# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print('Accuracy:', accuracy(classifier, test_set))


Accuracy: 0.86


## 7. Language Models and Text Generation:
   - Pretrained language models (e.g., GPT, BERT, T5)
   - Text generation using language models

In [99]:
# Load movie reviews and create labeled data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


In [101]:
# Step 1: Import necessary libraries
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
from nltk.util import ngrams
import random

In [102]:
# Step 2: Load the text data
nltk.download('gutenberg')
moby_dick = gutenberg.raw('melville-moby_dick.txt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [103]:
# Step 3: Preprocess the text
moby_dick = moby_dick.lower()  # Convert to lowercase

In [104]:
# Tokenize the text into words
words = nltk.word_tokenize(moby_dick)

In [105]:
# Step 4: Create the Language Model
# Unigram Model
unigrams = ngrams(words, 1)
unigram_model = FreqDist(unigrams)

In [106]:
# Bigram Model
bigrams = ngrams(words, 2)
bigram_model = nltk.ConditionalFreqDist((prev_word, word) for prev_word, word in bigrams)


In [119]:

# Step 5: Generate Text
def generate_text(model, num_words=100):
    text = []
    prev_word = None
    for _ in range(num_words):
        if prev_word:
            next_word = model[prev_word].max()  # Choose the most frequent next word
        else:
            next_word = random.choice(list(model.keys()))  # Choose a random starting word
        text.append(next_word)
        prev_word = next_word
    return ' '.join(text)

# Generate text using the unigram model
print(generate_text(unigram_model))

# Generate text using the bigram model
print(generate_text(bigram_model))

AttributeError: 'int' object has no attribute 'max'

## 8. Machine Translation:
   - Translating text from one language to another
   - NLP libraries for machine translation (e.g., Google Translate API)


In [122]:
from nltk.translate import IBMModel1
from nltk.tokenize import word_tokenize
from nltk.translate.api import AlignedSent
from nltk.translate.api import Alignment

english_sentence = "I am learning natural language processing."

# Tokenize the English sentence
english_tokens = word_tokenize(english_sentence)

# Create an AlignedSent instance with English and Bengali sentences
aligned_sent = AlignedSent(english_tokens, [])

# Create a list of AlignedSent instances
aligned_corpus = [aligned_sent]

# Create an IBMModel1 instance
ibm_model1 = IBMModel1(aligned_corpus, 10)

# Translate each English word to Bengali
for english_word in english_tokens:
    # Get the translation probabilities for the English word
    probs = ibm_model1.translation_table[english_word]
    
    # Find the Bengali word with the highest probability
    max_prob_word = max(probs, key=probs.get)
    
    # Print the translation
    print(f"{english_word} -> {max_prob_word}")


I -> None
am -> None
learning -> None
natural -> None
language -> None
processing -> None
. -> None


In [125]:
from nltk.translate import IBMModel1
from nltk.tokenize import word_tokenize
from nltk.translate.api import AlignedSent

# Sample sentence-aligned corpus
english_corpus = [
    "I am learning natural language processing.",
    "He likes to play football.",
    "She is a doctor."
]
bengali_corpus = [
    "আমি ন্যাচুরাল ভাষা প্রসেসিং শেখছি।",
    "তার পছন্দ ফুটবল খেলা।",
    "তিনি ডাক্তার।"
]

# Tokenize the English sentences
english_tokenized_corpus = [word_tokenize(sentence) for sentence in english_corpus]

# Tokenize the Bengali sentences
bengali_tokenized_corpus = [word_tokenize(sentence) for sentence in bengali_corpus]

# Create AlignedSent instances for each sentence pair
aligned_corpus = [AlignedSent(english_tokens, bengali_tokens) for english_tokens, bengali_tokens in zip(english_tokenized_corpus, bengali_tokenized_corpus)]

# Train an IBMModel1 instance using the aligned corpus
ibm_model1 = IBMModel1(aligned_corpus, 10)

# Translate each English word to Bengali
for english_sentence, bengali_sentence in zip(english_tokenized_corpus, bengali_tokenized_corpus):
    # Find the Bengali translation for each word in the English sentence
    bengali_translation = [ibm_model1.translate(word) for word in english_sentence]
    
    # Print the English sentence and its Bengali translation
    print(f"English: {' '.join(english_sentence)}")
    print(f"Bengali: {' '.join(bengali_translation)}\n")


AttributeError: 'IBMModel1' object has no attribute 'translate'

## 9. Question Answering Systems:
   - Building systems that can answer questions posed in natural language
   - Datasets and benchmarks (e.g., SQuAD)