In [None]:
#nlp preprocessing technniques
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize Spacy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Natural Language Processing (NLP) is a fascinating field of study. It's about making machines understand and generate language."

# 1. Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# 2. Punctuation Removal
tokens_no_punct = [word for word in tokens if word not in string.punctuation]
print("Without Punctuation:", tokens_no_punct)

# 3. Stopwords Removal
stop_words = set(stopwords.words("english"))
without_stopwords = [word for word in tokens_no_punct if word.lower() not in stop_words]
print("Without Stopwords:", without_stopwords)

# 4. Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in without_stopwords]
print("Stemmed Tokens:", stemmed_tokens)

# 5. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in without_stopwords]
print("Lemmatized Tokens (using NLTK):", lemmatized_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'study', '.', 'It', "'s", 'about', 'making', 'machines', 'understand', 'and', 'generate', 'language', '.']
Without Punctuation: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'fascinating', 'field', 'of', 'study', 'It', "'s", 'about', 'making', 'machines', 'understand', 'and', 'generate', 'language']
Without Stopwords: ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'study', "'s", 'making', 'machines', 'understand', 'generate', 'language']
Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'fascin', 'field', 'studi', "'s", 'make', 'machin', 'understand', 'gener', 'languag']
Lemmatized Tokens (using NLTK): ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'study', "'s", 'making', 'machine', 'understand', 'generate', 'language']


In [None]:
#Named Entity recognition
import spacy

# Load the pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "mumbai is the financial capital of india declared in 1930 which has GDP of 140 billion dollars."

# Process the text using the NLP model
doc = nlp(text)

# Extract named entities
for ent in doc.ents:
    print(ent.text, ent.label_)


mumbai GPE
india GPE
1930 DATE
140 billion dollars MONEY


In [None]:
#POS tagging in NLP
import nltk

# Download required resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the text
tokens = nltk.word_tokenize(text)

# Perform POS tagging
pos_tags = nltk.pos_tag(tokens)

# Print the tokens with their POS tags
print(pos_tags)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [None]:
#aspect mining
import spacy
from collections import Counter

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Sample review text
text = """
The battery life of this phone is amazing. The screen quality is great but the camera could be better.
I love the design and the build quality, but the price is a bit high. The performance is fast and smooth.
"""

# Process the text with Spacy NLP pipeline
doc = nlp(text)

# Extract nouns as potential aspects
aspects = [token.text for token in doc if token.pos_ == "NOUN"]

# Print extracted aspects
print("Extracted Aspects:", aspects)

# Use Frequency Analysis to identify most common aspects
aspect_freq = Counter(aspects)
print("\nMost Common Aspects:", aspect_freq.most_common(5))


Extracted Aspects: ['battery', 'life', 'phone', 'screen', 'quality', 'camera', 'design', 'build', 'quality', 'price', 'bit', 'performance']

Most Common Aspects: [('quality', 2), ('battery', 1), ('life', 1), ('phone', 1), ('screen', 1)]


In [None]:
#text classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Sample data (more diverse examples for better training)
texts = [
    "I love this product!",    # Positive
    "This is the worst thing ever.", # Negative
    "It's okay, nothing special.",  # Neutral
    "Absolutely fantastic!",    # Positive
    "I hate it.",           # Negative
    "It's not bad, but could be better.", # Neutral
    "Really amazing, I'm very happy with it!",  # Positive
    "Disappointing, it did not work as expected.",  # Negative
    "Just average, nothing to write home about.",  # Neutral
    "Exceeded my expectations, very pleased!",  # Positive
    "Awful, I regret buying it.",  # Negative
    "Decent but could use some improvements."  # Neutral
]

# Labels: 1 for positive, -1 for negative, 0 for neutral
labels = [1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0]

# Create a pipeline with TF-IDF vectorizer and Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the classifier
model.fit(texts, labels)

# Sample test text
sample_texts = ["This is amazing!", "Not what I expected.", "It's fine.", "Terrible experience, very disappointed.", "Best purchase ever!"]
predictions = model.predict(sample_texts)

# Map predictions to labels
sentiment_map = {1: "Positive", 0: "Neutral", -1: "Negative"}
for text, prediction in zip(sample_texts, predictions):
    print(f"Text: \"{text}\" => Sentiment: {sentiment_map[prediction]}")


Text: "This is amazing!" => Sentiment: Positive
Text: "Not what I expected." => Sentiment: Negative
Text: "It's fine." => Sentiment: Negative
Text: "Terrible experience, very disappointed." => Sentiment: Positive
Text: "Best purchase ever!" => Sentiment: Negative


In [None]:
#text summarization
from transformers import pipeline

# Load summarization pipeline
summarizer = pipeline("summarization")

# Sample long text
text = """
Artificial Intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems.
These processes include learning, reasoning, and self-correction. AI has numerous applications in today's society.
It is becoming increasingly important in various industries such as healthcare, finance, and automotive.
The future of AI promises to bring more innovation and opportunities.
"""

# Generate summary
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)

# Print summary
print("Summary:", summary[0]['summary_text'])


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Summary:  Artificial Intelligence (AI) is the simulation of human intelligence processes by machines . These processes include learning, reasoning, and self-correction . Future of AI promises to bring more innovation and opportunities .


In [None]:
#simple machine translation
from transformers import pipeline

# Load translation pipeline
translator = pipeline("translation_en_to_fr")

# Sample text for translation
text = "Hello, how are you? I hope you are having a great day!"

# Perform translation
translated_text = translator(text, max_length=40)

# Display the translated text
print("Original Text: ", text)
print("Translated Text: ", translated_text[0]['translation_text'])


No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Original Text:  Hello, how are you? I hope you are having a great day!
Translated Text:  Bonjour, et j'espère que vous avez une belle journée !


In [None]:
#sentiment analysis
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Preprocess function to clean the text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove stopwords and non-alphanumeric tokens
    return ' '.join(filtered_tokens)

# Sample data (more diverse examples for better performance)
texts = [
    "I love this product!",    # Positive
    "This is the worst thing ever.", # Negative
    "It's okay, nothing special.",  # Neutral
    "Absolutely fantastic!",    # Positive
    "I hate it.",           # Negative
    "It's not bad, but could be better.", # Neutral
    "Really amazing, I'm very happy with it!",  # Positive
    "Disappointing, it did not work as expected.",  # Negative
    "Just average, nothing to write home about.",  # Neutral
    "Exceeded my expectations, very pleased!",  # Positive
    "Awful, I regret buying it.",  # Negative
    "Decent but could use some improvements."  # Neutral
]

# Preprocess the texts
preprocessed_texts = [preprocess_text(text) for text in texts]

# Labels: 1 for positive, 0 for neutral, -1 for negative
labels = [1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0]

# Create a pipeline with TfidfVectorizer and Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(preprocessed_texts, labels, test_size=0.3, random_state=42)

# Train the classifier
model.fit(X_train, y_train)

# Sample test text
sample_texts = [
    "This is amazing!",
    "Not what I expected.",
    "It's fine.",
    "Terrible experience, very disappointed.",
    "Best purchase ever!",
    "Just okay, nothing great."
]

# Preprocess sample texts
preprocessed_sample_texts = [preprocess_text(text) for text in sample_texts]

# Predict the sentiment
predictions = model.predict(preprocessed_sample_texts)

# Map predictions to labels
sentiment_map = {1: "Positive", 0: "Neutral", -1: "Negative"}
for text, prediction in zip(sample_texts, predictions):
    print(f"Text: \"{text}\" => Sentiment: {sentiment_map[prediction]}")


Text: "This is amazing!" => Sentiment: Positive
Text: "Not what I expected." => Sentiment: Negative
Text: "It's fine." => Sentiment: Negative
Text: "Terrible experience, very disappointed." => Sentiment: Negative
Text: "Best purchase ever!" => Sentiment: Negative
Text: "Just okay, nothing great." => Sentiment: Neutral


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#tokenization using transformers
from transformers import pipeline

# Load the translation model using transformers
translator = pipeline("translation_en_to_fr")

# Sample text for translation
text = "Hello, how are you? I hope you are having a great day!"

# Perform translation
translated_text = translator(text, max_length=40)

# Display the translated text
print("Original Text: ", text)
print("Translated Text: ", translated_text[0]['translation_text'])

# Tokenization using transformers
tokenizer = pipeline("feature-extraction", model="bert-base-uncased")

# Tokenize the text
tokens = tokenizer(text)

# Display the tokenized output (the first token in the output is the embedding)
print("Tokenized Output (Embeddings):", tokens)


No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Original Text:  Hello, how are you? I hope you are having a great day!
Translated Text:  Bonjour, et j'espère que vous avez une belle journée !


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenized Output (Embeddings): [[[0.14625969529151917, 0.2167811393737793, -0.3121313154697418, -0.1467307209968567, -0.13097751140594482, -0.5414273738861084, 0.5017902255058289, 0.6041872501373291, 0.21057762205600739, -0.38864102959632874, -0.030973469838500023, -0.06591743975877762, 0.19713228940963745, 0.4300311803817749, 0.32914382219314575, 0.00723101943731308, -0.2427801638841629, 0.38532301783561707, 0.24713458120822906, -0.011194734834134579, -0.0707467794418335, -0.7028672099113464, 0.3101443350315094, 0.3598344922065735, 0.09096922725439072, -0.3395955264568329, -0.3934589922428131, -0.05383012443780899, 0.13591864705085754, -0.3366108238697052, -0.379660040140152, 0.4546765685081482, -0.42256247997283936, -0.20682120323181152, 0.283316969871521, 0.38387882709503174, -0.04929381608963013, -0.0635790005326271, -0.13120722770690918, 0.26020073890686035, -0.5932530760765076, -0.17153969407081604, 0.3439411222934723, 0.39798110723495483, -0.06533180177211761, -0.874273419380188