In [None]:
!pip install docx


In [None]:
!pip install --upgrade python-docx
!pip install --upgrade exceptions


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re
import heapq

# Load the dataset
df = pd.read_excel('Dataset1.xlsx')

# Extract content from the 'content' column
corpus = df['content'].tolist()

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF for topic modeling
num_topics = 10
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf)

# Get top topics
top_topics = []
for topic_idx, topic in enumerate(nmf_model.components_):
    top_keywords_idx = topic.argsort()[-5:][::-1]
    top_keywords = [tfidf_vectorizer.get_feature_names_out()[i] for i in top_keywords_idx]
    top_topics.append(' '.join(top_keywords))

# Display top 10 most discussed topics
print("Top 10 Most Discussed Topics:")
for i, topic in enumerate(top_topics, start=1):
    print(f"{i}. {topic}")

# Ask user for a keyword
keyword = input("Enter a keyword to find related content: ")

# Filter content based on the entered keyword
relevant_content = df[df['content'].str.contains(keyword)]['content'].tolist()

# Combine all relevant content into a single text
combined_content = "\n".join(relevant_content)

# Calculate TF-IDF scores for sentences
sentence_tfidf = tfidf_vectorizer.transform([combined_content]).toarray()

# Calculate cosine similarity between sentences and combined content
cosine_similarities = cosine_similarity(tfidf.toarray(), sentence_tfidf)

# Calculate average cosine similarity for each sentence
sentence_scores = np.mean(cosine_similarities, axis=1)

# Rank sentences by score
top_sentence_indices = heapq.nlargest(500, range(len(sentence_scores)), sentence_scores.take)

# Extract top sentences and create the summary
extractive_summary = '\n'.join([corpus[i] for i in top_sentence_indices])

# Clean the summary by removing extra spaces and newlines
cleaned_summary = re.sub(r'\s+', ' ', extractive_summary).strip()

# Split the summary into words
words = cleaned_summary.split(' ')

# Initialize variables
current_length = 0
limited_summary = []

# Limit the summary to less than 500 words
for word in words:
    if current_length + len(word) + 1 <= 500:
        limited_summary.append(word)
        current_length += len(word) + 1
    else:
        break

# Join the limited summary words back into a string
final_summary = ' '.join(limited_summary)

# Print the final summary
print(f"\nExtractive Summary for All Content Related to '{keyword}':\n{final_summary}")


Top 10 Most Discussed Topics:
1. ai microsoft cloud google data
2. crypto ftx assets bitcoin asset
3. reuters photo file editing thomson
4. bnpl credit consumers pay cfpb
5. bank cbdc digital central currency
6. law360 login case article 2farticles
7. company year said million billion
8. payments payment banking fintech digital
9. blockchain nfts nft metaverse web3
10. binance ftx exchange voyager zhao
Enter a keyword to find related content: google

Extractive Summary for All Content Related to 'google':
NEW YORK & SUNNYVALE, Calif.- Through a strategic expansion of their relationship, Accenture (NYSE: ACN) and Google Cloud will help organizations reinvent their businesses with generative AI to unlock new growth opportunities, supported by substantial new investments by Accenture. Today’s expansion builds on Accenture’s recently announced $3 billion investment in AI . Together, Accenture and Google Cloud will help organizations use generative AI to create new opportunities to drive ne

In [None]:
!pip install summarizer

In [None]:
!pip install bert-extractive-summarizer


In [None]:
!pip install transformers