In [None]:
# 📌 Section 1: Install Required Packages
!pip install rake-nltk scikit-learn nltk pandas --quiet

In [None]:
# 📌 Section 2: Import Libraries
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake

# Fetch every NLTK resource the notebook needs in one place, so a fresh
# "Restart & Run All" has them available before any extraction cell runs.
# 'punkt' and 'punkt_tab' are tokenizer data; 'stopwords' is the stop-word corpus.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# 📌 Section 3: Sample Text (abstract or paragraph)
# Short demo paragraph used as the single input document for both the RAKE
# and the TF-IDF keyword extraction steps later in the notebook.
# Replace this string to run the same extraction on other text.
sample_text = """
Artificial Intelligence (AI) is transforming industries by automating tasks, providing insights through data analysis,
and enabling smarter decision-making. Natural Language Processing (NLP), a subfield of AI, allows machines to understand
and interpret human language. Techniques like transformers and large language models (LLMs) have led to advancements in
language translation, text summarization, and question answering.
"""

In [None]:
# NOTE(review): 'punkt_tab' is downloaded right before RAKE is first used,
# presumably because the installed NLTK release needs it for tokenization in
# addition to 'punkt' -- confirm against the NLTK version in use.
# nltk is already imported in the setup cell; the import is repeated here so
# this cell also runs on its own.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# 📌 Section 4: RAKE-based Keyword Extraction
def _separate_adjacent_punctuation(text):
    """Insert a space between neighbouring punctuation characters.

    A run such as "),": otherwise reaches RAKE as one multi-character token
    that is not treated as a phrase delimiter, which leaks fragments like
    "nlp )," into the ranked phrases. Splitting the run lets each character
    be recognised as punctuation on its own.

    Args:
        text: Raw input text.

    Returns:
        The text with a single space added between any two adjacent
        characters that are neither alphanumeric, underscore nor whitespace.
        All other characters are left untouched.
    """
    pieces = []
    previous_is_punct = False
    for char in text:
        is_punct = not (char.isalnum() or char.isspace() or char == "_")
        if is_punct and previous_is_punct:
            pieces.append(" ")
        pieces.append(char)
        previous_is_punct = is_punct
    return "".join(pieces)


rake_extractor = Rake()
rake_extractor.extract_keywords_from_text(_separate_adjacent_punctuation(sample_text))
# A phrase that occurs several times in the text is reported once per
# occurrence; keep only the first (score, phrase) pair, preserving rank order.
rake_keywords = list(dict.fromkeys(rake_extractor.get_ranked_phrases_with_scores()))

In [None]:
# Show each ranked RAKE phrase on its own line, score first (two decimals).
print("📌 RAKE Keywords (with scores):")
for ranked_entry in rake_keywords:
    score, phrase = ranked_entry
    print(f"{score:.2f} - {phrase}")

📌 RAKE Keywords (with scores):
9.00 - techniques like transformers
9.00 - enabling smarter decision
8.75 - natural language processing
8.75 - large language models
8.75 - interpret human language
4.75 - language translation
4.00 - transforming industries
4.00 - text summarization
4.00 - question answering
4.00 - providing insights
4.00 - nlp ),
4.00 - data analysis
4.00 - automating tasks
4.00 - artificial intelligence
4.00 - allows machines
1.00 - understand
1.00 - subfield
1.00 - making
1.00 - llms
1.00 - led
1.00 - ai
1.00 - ai
1.00 - advancements


In [None]:
# 📌 Section 5: TF-IDF-based Keyword Extraction
def extract_tfidf_keywords(text, top_n=10):
    """Rank the terms of ``text`` by TF-IDF weight and return the strongest ones.

    Args:
        text: The document to analyse, as a single string.
        top_n: Maximum number of ``(term, score)`` pairs to return. ``None``
            returns every term; zero or a negative value returns an empty list.

    Returns:
        A list of ``(term, score)`` tuples ordered from highest to lowest
        score. Terms with equal scores keep the vectorizer's feature order
        (the sort is stable). The list is empty when ``text`` is empty,
        whitespace-only, or contains nothing but English stop words.

    Note:
        The vectorizer is fitted on this one document only, so every term
        gets the same IDF factor and the scores reduce to L2-normalised term
        frequencies. Fit on a larger corpus if corpus-level rarity matters.
    """
    # Nothing to rank: return early instead of letting the vectorizer raise.
    if not text or not text.strip():
        return []
    # A negative slice bound would silently drop items from the end instead.
    if top_n is not None and top_n <= 0:
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no term
        # survives tokenization and stop-word removal.
        return []

    # Single document, so row 0 holds the weight of every vocabulary term.
    term_weights = tfidf_matrix.toarray()[0]
    scored_terms = zip(vectorizer.get_feature_names_out(), term_weights)
    ranked_terms = sorted(scored_terms, key=lambda item: item[1], reverse=True)
    return ranked_terms[:top_n]

In [None]:
# Rank the sample paragraph's terms with the TF-IDF helper defined above
# (default number of keywords).
tfidf_keywords = extract_tfidf_keywords(text=sample_text)

In [None]:
# Show each top TF-IDF term with its weight (four decimals).
print("\n📌 TF-IDF Top Keywords:")
for scored_term in tfidf_keywords:
    word, score = scored_term
    print(f"{word}: {score:.4f}")


📌 TF-IDF Top Keywords:
language: 0.5345
ai: 0.2673
advancements: 0.1336
allows: 0.1336
analysis: 0.1336
answering: 0.1336
artificial: 0.1336
automating: 0.1336
data: 0.1336
decision: 0.1336
