In [None]:
import pandas as pd
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from concurrent.futures import ProcessPoolExecutor
import math
import nltk

# Fetch NLTK's "punkt" resource up front, before any summarizing runs
# (presumably what Tokenizer("english") relies on for sentence splitting — TODO confirm).
nltk.download("punkt")
def summarize_chunk(chunk):
    """Summarize the ``content`` column of a DataFrame chunk with LexRank.

    Args:
        chunk: DataFrame (typically a slice of the full dataset) that has a
            ``content`` column holding the article text.

    Returns:
        list[str]: One summary per row, in row order. Each summary is the
        sentences picked by LexRank (at most two) joined with single spaces.
        Rows whose content is not text produce an empty string.
    """
    summarizer = LexRankSummarizer()
    # The tokenizer does not depend on the row, so build it once per chunk
    # instead of once per article.
    tokenizer = Tokenizer("english")
    summaries = []
    for _, row in chunk.iterrows():
        article_content = row['content']
        # Empty spreadsheet cells are read by pandas as NaN (a float), not as
        # text; emit an empty summary instead of feeding that to the parser.
        if not isinstance(article_content, str):
            summaries.append("")
            continue
        parser = PlaintextParser.from_string(article_content, tokenizer)
        selected_sentences = summarizer(parser.document, sentences_count=2)
        # The summarizer returns a sequence of sentence objects; calling str()
        # on the sequence itself would store its repr, so join the text of the
        # individual sentences instead.
        summaries.append(" ".join(str(sentence) for sentence in selected_sentences))
    return summaries

# Load the articles; summarize_chunk reads the 'content' column of this frame.
excel_file_path = "dataset.xlsx"
df = pd.read_excel(excel_file_path)

num_cores = 4
# Round up so that at most num_cores chunks are produced. Clamp to 1 because an
# empty sheet would otherwise give a step of 0, which range() rejects.
chunk_size = max(1, math.ceil(len(df) / num_cores))

# Contiguous positional slices that together cover every row exactly once.
chunks = [df.iloc[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

article_summaries = []

with ProcessPoolExecutor(max_workers=num_cores) as executor:
    # executor.map yields results in the order the chunks were submitted, so
    # the concatenated summaries line up with the rows of df.
    chunk_summaries = executor.map(summarize_chunk, chunks)
    for summaries in chunk_summaries:
        article_summaries.extend(summaries)

df['summary'] = article_summaries

# Persist the original columns plus the new 'summary' column.
output_excel_file = "summaries_sumy1.xlsx"
df.to_excel(output_excel_file, index=False)

print("Summaries saved to", output_excel_file)

In [None]:
import pandas as pd
import spacy
from concurrent.futures import ProcessPoolExecutor
import math

def generate_summary(content):
    """Return the first two sentences of ``content`` as a lead summary.

    Args:
        content: Article text. Values that are not strings (for example the
            NaN pandas uses for an empty spreadsheet cell, or ``None``) and
            blank strings are treated as "no content".

    Returns:
        str: The first two sentences found by the module-level ``nlp``
        pipeline, joined with a single space; ``""`` when there is no content.
    """
    # Guard before calling the pipeline: one empty cell would otherwise be
    # passed to nlp() as a float and abort the whole run.
    if not isinstance(content, str) or not content.strip():
        return ""
    doc = nlp(content)
    lead_sentences = [sent.text for sent in doc.sents][:2]
    return " ".join(lead_sentences)

def summarize_chunk(chunk):
    """Build one summary per row of ``chunk``.

    Every row's ``content`` value is handed to ``generate_summary``; the
    results are returned as a list in row order.
    """
    return [
        generate_summary(record['content'])
        for _label, record in chunk.iterrows()
    ]

# Load the articles; summarize_chunk reads the 'content' column of this frame.
excel_file_path = "dataset.xlsx"
df = pd.read_excel(excel_file_path)


num_cores = 4
# Round up so that at most num_cores chunks are produced. Clamp to 1 because an
# empty sheet would otherwise give a step of 0, which range() rejects.
chunk_size = max(1, math.ceil(len(df) / num_cores))

# Contiguous positional slices that together cover every row exactly once.
chunks = [df.iloc[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

article_summaries = []

# generate_summary reads this as a module-level global, so it has to exist
# before the pool starts handing out work.
# NOTE(review): the workers presumably see `nlp` only because they inherit this
# process's state (fork-style start) — confirm on platforms that spawn workers.
nlp = spacy.load("en_core_web_sm")

with ProcessPoolExecutor(max_workers=num_cores) as executor:
    # executor.map yields results in the order the chunks were submitted, so
    # the concatenated summaries line up with the rows of df.
    chunk_summaries = executor.map(summarize_chunk, chunks)
    for summaries in chunk_summaries:
        article_summaries.extend(summaries)

df['summary'] = article_summaries

# Persist the original columns plus the new 'summary' column.
output_excel_file = "summaries_spacy1.xlsx"
df.to_excel(output_excel_file, index=False)

print("Summaries saved to", output_excel_file)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Input: the workbook written by the spaCy summarization step.
excel_file_path = 'summaries_spacy1.xlsx'

df = pd.read_excel(excel_file_path)

summary_column_name = 'summary'
snippet_column_name = 'snippet'

# The TF-IDF vocabulary is fitted on the snippets only; each summary is later
# projected into that same space with transform().
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df[snippet_column_name].fillna(''))

# Blank summary cells come back from Excel as NaN; treat them as empty text,
# the same way the snippets are treated above.
summary_texts = df[summary_column_name].fillna('')

similarities = []

# Walk the rows by position: rows of tfidf_matrix are positional, so indexing
# them must not depend on the DataFrame's index labels.
for index, summary in enumerate(summary_texts):
    try:
        summary_vector = tfidf_vectorizer.transform([summary])
        snippet_vector = tfidf_matrix[index]
        similarity_score = cosine_similarity(summary_vector, snippet_vector)

        # TF-IDF weights are non-negative, so the cosine lies in [0, 1].
        # Scale it straight to a percentage: no shared terms -> 0, identical
        # direction -> 100. This keeps the value comparable with the 0 used
        # for rows that fail below.
        similarity_percentage = similarity_score[0][0] * 100
        similarities.append(similarity_percentage)
    except Exception as e:
        # Best effort: report the row and record 0 so one bad cell does not
        # abort the whole sheet.
        print(f"Error calculating similarity for row {index + 1}: {e}")
        similarities.append(0)

df['similarity'] = similarities

# Output: the input columns plus the new 'similarity' column.
updated_excel_file_path = 'Similarityspacy.xlsx'
df.to_excel(updated_excel_file_path, index=False)
print("Similarity percentages added and new Excel file created.")
