# **`Necessary installations and library imports.`**

In [None]:
pip install umap-learn

In [None]:
pip install hdbscan

In [None]:
!pip install -U sentence-transformers

In [None]:
pip install --upgrade openai

In [None]:
# @title Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from sklearn.decomposition import PCA
from wordcloud import WordCloud
from PIL import Image
from gensim.models import HdpModel
from gensim.models.coherencemodel import CoherenceModel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import plotly.graph_objects as go
from collections import defaultdict
import umap
import hdbscan
import openai
from openai import OpenAI
from sentence_transformers import SentenceTransformer,util
import os
# Fetch the NLTK data used by the preprocessing step: the stop-word list, the tokenizer
# models and WordNet. Recent NLTK releases load 'punkt_tab' inside word_tokenize while
# older ones load 'punkt', so both are requested.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)


# **`Dataset Preparation`**

In [None]:
# @title Read the original DataFrame
# Load the complete dataset; 'your_path' is a placeholder for the location of the CSV file.
old_df=pd.read_csv('your_path') #Replace with the actual path

In [None]:
# @title Extract the AI category
# Keep only the rows whose `categories` text mentions cs.AI.
# regex=False matches the literal text "cs.AI" (as a regular expression the "." would match
# any character), and na=False counts rows without a category as non-matching instead of
# putting NaN into the boolean mask.
is_ai_paper=old_df['categories'].str.contains('cs.AI',regex=False,na=False)
new_df=old_df[is_ai_paper].reset_index(drop=True)

In [None]:
# @title Replacing the old year (date) column with year and month
# The original `year` column holds date text: characters 0-3 are the year and 5-6 the month.
# It is replaced by an integer `year` column and an integer `month` column is added.
# The dtype guard makes the cell safe to run again: once `year` is already an integer there
# is no date text left to slice.
if not pd.api.types.is_integer_dtype(new_df['year']):
    date_text=new_df['year']
    new_df=new_df.assign(
        year=date_text.str.slice(0,4).astype(int),
        month=date_text.str.slice(5,7).astype(int),
    )

In [None]:
# @title Extract papers from 2016 to 2023
# Keep the papers published from 2016 to 2023, both bounds included. The upper bound is
# enforced as well so the data matches the period the trend analysis is built for.
first_year,last_year=2016,2023
df=new_df[new_df['year'].between(first_year,last_year)].reset_index(drop=True)

In [None]:
# @title Sort the DataFrame wrt the year
# Newest papers first; the index is rebuilt so that it counts rows in the sorted order.
df_sorted=df.sort_values(by='year',ascending=False).reset_index(drop=True)

# ***`Enter the Query`***

In [None]:
# Free-text description of the field of interest; it is embedded later and compared with every abstract.
query=input("Enter the query specifying the scientific field: ")

# **`Preprocessing step`**

In [None]:
# @title Define a function for preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Translation table that deletes every ASCII punctuation character.
# It is built once here instead of once per token inside the function.
punctuation_table = str.maketrans('', '', string.punctuation)

def preprocess_text_updated(text):
    """Clean one abstract for embedding and topic modeling.

    URLs are removed, the text is tokenized, punctuation characters are stripped from
    every token, and each token is lemmatized. A token is kept only when its lemma has
    at least two characters and neither the token nor its lemma (lower-cased) is an
    English stop word. The kept lemmas keep their original letter case.

    Args:
        text: The raw abstract. Anything that is not a string (for example a missing
            value read from the CSV) is treated as an empty abstract.

    Returns:
        The kept lemmas joined by single spaces; '' when nothing is kept.
    """
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    filtered_tokens = []
    for token in word_tokenize(text):
        token = token.translate(punctuation_table)
        token_lemm = lemmatizer.lemmatize(token)
        # Drop short leftovers (including tokens that were pure punctuation).
        if len(token_lemm) < 2:
            continue
        if token_lemm.lower() in stop_words or token.lower() in stop_words:
            continue
        filtered_tokens.append(token_lemm)

    return ' '.join(filtered_tokens)

In [None]:
# @title Apply the preprocessing function on the abstracts
# One cleaned string per row of df_sorted, in the same order.
processed_abstracts=list(map(preprocess_text_updated,df_sorted['abstract'].tolist()))

# **`The SBERT Embeddings step`**

In [None]:
# @title Define the SBERT model
# Sentence-embedding model; the same instance encodes the abstracts, the query and the topic labels.
model = SentenceTransformer("allenai-specter") # You can replace the model with any other model.

In [None]:
# @title Apply the model on the processed abstracts and the query
# Embed every preprocessed abstract and the user query with the same model so they can be compared.
processed_abstracts_embeddings=model.encode(processed_abstracts)
query_embeddings=model.encode(query)

# **`Relevant Abstracts Retrieval step`**

In [None]:
# @title Computing the Similarity between the Embeddings of the abstracts and the query
# Cosine similarity between the query embedding and the abstract embeddings, as a NumPy array.
# NOTE(review): presumably shaped (1, number_of_abstracts), since the next cell reads the column indices — confirm.
similarity=np.array(util.cos_sim(query_embeddings,processed_abstracts_embeddings))

In [None]:
# @title Retrieve abstracts with similarity exceeding the threshold
threshold=0.75  # Set the threshold you prefer
# Column positions (= abstract positions) whose similarity to the query is above the threshold.
selected_indices=np.nonzero(similarity>threshold)[1]
# Collect the matching preprocessed texts and their embeddings, in the same order.
selected_processed_abstracts=list(map(processed_abstracts.__getitem__,selected_indices))
selected_processed_abstracts_embeddings=list(map(processed_abstracts_embeddings.__getitem__,selected_indices))

# **`Perform Clustering Step`**

In [None]:
# @title Apply UMAP dimensionality reduction
n_component=50  # The new dimensionality value
# A fixed random_state makes the reduced embeddings, and therefore the cluster numbering
# derived from them, repeatable between runs. This matters because the topic labels are
# later loaded from a CSV file that refers to clusters by number.
# Select the parameters you want.
fit=umap.UMAP(n_neighbors=200,min_dist=0.0,n_components=n_component,metric='cosine',random_state=42)
selected_processed_abstracts_umap_embeddings=fit.fit_transform(selected_processed_abstracts_embeddings)

In [None]:
# @title Apply the clustering algorithm
# Select the parameters you want.
clusterer=hdbscan.HDBSCAN(min_cluster_size=130,min_samples=3,gen_min_span_tree=True)
clusters=clusterer.fit(selected_processed_abstracts_umap_embeddings)
cluster_labels=clusters.labels_

# Report how many embeddings ended up in every cluster; the label -1 is flagged as noise.
unique_clusters, counts = np.unique(cluster_labels, return_counts=True)
for cluster_id, count in zip(unique_clusters, counts):
    noise_marker = (" (Noise)",) if cluster_id == -1 else ()
    print("Cluster:", cluster_id, *noise_marker, ", Number of embeddings:", count)

# **`Topic Modeling Step`**

In [None]:
# @title Getting the optimum number of topics for each cluster
# For every cluster, train LDA several times with each candidate number of topics and keep
# the candidate with the best average c_v coherence.
topic_number_range=[5,10]
number_of_trials_per_topic_number=20
num_of_topics=[]
# Visit every label except the noise label -1. Counting len(unique_clusters)-1 would
# silently drop the last cluster whenever the clustering produced no noise points.
cluster_ids=[int(label) for label in unique_clusters if label!=-1]
for cluster in cluster_ids:
  cluster_indices=np.where(cluster_labels==cluster)[0]
  abstracts_per_cluster=[selected_processed_abstracts[i] for i in cluster_indices]
  tokenized_abstracts_per_cluster=[a.split() for a in abstracts_per_cluster]
  dictionary = corpora.Dictionary(tokenized_abstracts_per_cluster)
  corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_abstracts_per_cluster]
  coherence_scores={}
  for trial in range(number_of_trials_per_topic_number):
    for num_top in topic_number_range:
      # The trial number is the seed: trials differ from each other, yet a re-run of the
      # notebook repeats exactly the same trials.
      lda_model = models.LdaModel(corpus, num_topics=num_top, id2word=dictionary,passes=10,random_state=trial)
      coherence_model = CoherenceModel(model=lda_model, texts=tokenized_abstracts_per_cluster, dictionary=dictionary, coherence='c_v')
      coherence_score = coherence_model.get_coherence()
      coherence_scores.setdefault(num_top, []).append(coherence_score)
  average_coherence_scores = {num_topics: np.mean(scores) for num_topics, scores in coherence_scores.items()}
  optimal_num_topics = max(average_coherence_scores, key=average_coherence_scores.get)
  num_of_topics.append(optimal_num_topics)

num_of_topics=np.array(num_of_topics)

In [None]:
# @title Apply LDA with the obtained optimum number of topics per each cluster
# Train the final LDA model of every cluster; lda_models[c] is the model of cluster c.
# The loop follows num_of_topics, which holds one entry per cluster that was evaluated.
lda_models=[]
for cluster in range(len(num_of_topics)):
  cluster_indices=np.array(np.where(cluster_labels==cluster))
  abstracts_per_cluster=[selected_processed_abstracts[i] for i in cluster_indices[0]]
  tokenized_abstracts_per_cluster=[a.split() for a in abstracts_per_cluster]
  dictionary = corpora.Dictionary(tokenized_abstracts_per_cluster)
  corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_abstracts_per_cluster]
  # A fixed seed keeps the topic numbering repeatable; the label file loaded later refers
  # to topics by number, so unseeded training would detach the labels from the topics.
  lda_model = models.LdaModel(corpus, num_topics=int(num_of_topics[cluster]), id2word=dictionary,passes=10,random_state=42)
  lda_models.append(lda_model)

  #To view the topics distribution
  print("For Cluster ",cluster,' :','\n')
  for topic_id, topic in lda_model.print_topics():
      print(f"Topic {topic_id}: {topic}")
      print('\n')
  print('------------------------------------------')

# **`Topic Labeling step`**

In [None]:
# The labeling was done using Microsoft's Copilot Generative AI, and the labels were saved in a CSV file.
# Later cells read the 'Cluster', 'Merged Topics' and 'Topic Label' columns of this file.
labeled_topics=pd.read_csv('your_path') # Replace with the actual path

# **`Analyzing the topics trending behavior`**

In [None]:
# @title Calculating the retrieved abstracts distribution across years and quarters
# Rows of df_sorted that passed the similarity threshold, renumbered from 0.
filtered_df_query = df_sorted.iloc[selected_indices].copy().reset_index(drop=True)

# Calendar month (1-12) -> quarter label ('Q1'..'Q4').
quarters_map = {month_number: 'Q' + str((month_number - 1) // 3 + 1) for month_number in range(1, 13)}
filtered_df_query['Quarter'] = filtered_df_query['month'].map(quarters_map)

# Number of retrieved papers for every (year, quarter) pair that occurs in the data.
year_quarter_query_counts = filtered_df_query.groupby(['year', 'Quarter']).size().reset_index(name='Count')
year_quarter_query_counts = year_quarter_query_counts.sort_values(by=['year', 'Quarter'])

# Number of retrieved papers per year, as a two-column frame ('Year', 'Count').
year_query_counts = year_quarter_query_counts.groupby('year')['Count'].sum().reset_index()
year_query_counts.columns = ['Year', 'Count']

In [None]:
# @title Help functions

#-----------create an empty df with the desired year-quarter distribution---------------------------------
def create_df(start_year=2016, end_year=2023, last_year_quarters=('Q1', 'Q2')):
  """Build an all-zero count table with one row per (year, quarter).

  Every year before `end_year` gets the four quarters Q1-Q4; `end_year` itself gets
  only `last_year_quarters`. The defaults reproduce the 2016-2023 table that ends
  with Q2 of 2023.

  Args:
    start_year: First year of the table (inclusive).
    end_year: Last year of the table (inclusive).
    last_year_quarters: Quarter labels to generate for `end_year`.

  Returns:
    A DataFrame with the columns 'year', 'Quarter' and 'Count', where 'Count' is 0
    in every row.
  """
  all_quarters = ['Q1', 'Q2', 'Q3', 'Q4']
  years = []
  quarters = []
  for year in range(start_year, end_year + 1):
    for quarter in (all_quarters if year < end_year else last_year_quarters):
      years.append(year)
      quarters.append(quarter)

  return pd.DataFrame({'year': years, 'Quarter': quarters, 'Count': [0] * len(years)})


#-----------Function to retrieve the index of an item in a list---------------------------------
def indices_where_equal(val, list):
    """Return the position of the first element of `list` that equals `val`, or -1 if none does."""
    matches = (position for position, element in enumerate(list) if element == val)
    return next(matches, -1)

In [None]:
# @title Method 1
# Papers per year for every labeled topic, one figure per cluster.
# lda_models holds one model per cluster, so its length gives the clusters to visit.
for cluster in range(len(lda_models)):
  df_cluster=labeled_topics[labeled_topics['Cluster']==cluster].reset_index(drop=True)
  # Positions of this cluster's abstracts in selected_processed_abstracts. filtered_df_query is
  # built from the same selected_indices, so the same positions address its rows. Using them
  # directly replaces a per-abstract search by text, which was quadratic and picked the wrong
  # row whenever two abstracts had identical preprocessed text.
  cluster_rows=np.where(cluster_labels==cluster)[0]
  # The model's own vocabulary, so token ids match the ones it was trained with.
  dictionary=lda_models[cluster].id2word
  dominant_topic_document_rows=defaultdict(list)
  for row in cluster_rows:
    doc_bow=dictionary.doc2bow(selected_processed_abstracts[row].split())
    topic_distribution=lda_models[cluster].get_document_topics(doc_bow)
    if not topic_distribution:
      continue  # the model reported no topic for this document
    dominant_topic=max(topic_distribution, key=lambda x: x[1])[0]
    dominant_topic_document_rows[dominant_topic].append(row)

  fig = go.Figure()
  for top_ind in range(len(df_cluster)):
    # A label may cover several LDA topics, stored as a comma-separated list of topic ids.
    merged_topics=[int(num) for num in str(df_cluster['Merged Topics'][top_ind]).split(',')]
    topic_rows=[row for top in merged_topics for row in dominant_topic_document_rows.get(top,[])]
    quarter_counts=(filtered_df_query.iloc[topic_rows]
                    .groupby(['year','Quarter']).size().reset_index(name='Count')
                    .set_index(['year','Quarter']))
    # Add the counts onto the all-zero template so periods without papers stay in the series as 0.
    cumelative_df=create_df().set_index(['year','Quarter']).add(quarter_counts,fill_value=0)
    cumelative_df=cumelative_df['Count'].astype(int).groupby(level='year').sum().reset_index()
    cumelative_df.columns = ['year', 'Count']
    label="Topic "+str(top_ind)+": "+df_cluster['Topic Label'][top_ind]
    fig.add_trace(go.Scatter(x=cumelative_df['year'].astype(str),
                              y=cumelative_df['Count'],
                              mode='lines',
                              name=label,
                              visible='legendonly'  # Set visibility to 'legendonly' by default
                              ))

  fig.update_layout(
      title='Papers per year for Cluster '+str(cluster),
      xaxis_title='Year',
      yaxis_title='Paper Count',
      xaxis=dict(tickangle=-45),
      legend=dict(traceorder='normal')
  )

  fig.show()

In [None]:
# @title Method 2
# Share of each labeled topic among all retrieved papers, per year, one figure per cluster.
# Yearly totals of the retrieved papers, looked up by year. Dividing two columns directly
# would pair rows by position, which shifts the ratios as soon as a year has no retrieved paper.
yearly_query_totals=year_query_counts.set_index('Year')['Count']
# lda_models holds one model per cluster, so its length gives the clusters to visit.
for cluster in range(len(lda_models)):
  df_cluster=labeled_topics[labeled_topics['Cluster']==cluster].reset_index(drop=True)
  # Positions of this cluster's abstracts in selected_processed_abstracts. filtered_df_query is
  # built from the same selected_indices, so the same positions address its rows. Using them
  # directly replaces a per-abstract search by text, which was quadratic and picked the wrong
  # row whenever two abstracts had identical preprocessed text.
  cluster_rows=np.where(cluster_labels==cluster)[0]
  # The model's own vocabulary, so token ids match the ones it was trained with.
  dictionary=lda_models[cluster].id2word
  dominant_topic_document_rows=defaultdict(list)
  for row in cluster_rows:
    doc_bow=dictionary.doc2bow(selected_processed_abstracts[row].split())
    topic_distribution=lda_models[cluster].get_document_topics(doc_bow)
    if not topic_distribution:
      continue  # the model reported no topic for this document
    dominant_topic=max(topic_distribution, key=lambda x: x[1])[0]
    dominant_topic_document_rows[dominant_topic].append(row)

  fig = go.Figure()
  for top_ind in range(len(df_cluster)):
    # A label may cover several LDA topics, stored as a comma-separated list of topic ids.
    merged_topics=[int(num) for num in str(df_cluster['Merged Topics'][top_ind]).split(',')]
    topic_rows=[row for top in merged_topics for row in dominant_topic_document_rows.get(top,[])]
    quarter_counts=(filtered_df_query.iloc[topic_rows]
                    .groupby(['year','Quarter']).size().reset_index(name='Count')
                    .set_index(['year','Quarter']))
    # Add the counts onto the all-zero template so periods without papers stay in the series as 0.
    cumelative_df=create_df().set_index(['year','Quarter']).add(quarter_counts,fill_value=0)
    cumelative_df=cumelative_df['Count'].astype(int).groupby(level='year').sum().reset_index()
    cumelative_df.columns = ['year', 'Count']
    # Years without any retrieved paper have no total and yield NaN (a gap in the line).
    cumelative_df['Normalized_Count'] = cumelative_df['Count'] / cumelative_df['year'].map(yearly_query_totals)
    label="Topic "+str(top_ind)+": "+df_cluster['Topic Label'][top_ind]
    fig.add_trace(go.Scatter(x=cumelative_df['year'].astype(str),
                              y=cumelative_df['Normalized_Count'],
                              mode='lines',
                              name=label,
                              visible='legendonly'  # Set visibility to 'legendonly' by default
                              ))

  fig.update_layout(
      title='Topic papers to Query papers per year ratio for Cluster  '+str(cluster),
      xaxis_title='Year',
      yaxis_title='Ratio',
      xaxis=dict(tickangle=-45),
      legend=dict(traceorder='normal')
  )

  fig.show()

In [None]:
# @title Method 3
# Moving average of the yearly topic share, one figure per cluster.
# Defined before the loops because the figure title needs it even when a cluster has no labeled topic.
window_size = 5  # Set the value you want
# Yearly totals of the retrieved papers, looked up by year. Dividing two columns directly
# would pair rows by position, which shifts the ratios as soon as a year has no retrieved paper.
yearly_query_totals=year_query_counts.set_index('Year')['Count']
# lda_models holds one model per cluster, so its length gives the clusters to visit.
for cluster in range(len(lda_models)):
  df_cluster=labeled_topics[labeled_topics['Cluster']==cluster].reset_index(drop=True)
  # Positions of this cluster's abstracts in selected_processed_abstracts. filtered_df_query is
  # built from the same selected_indices, so the same positions address its rows. Using them
  # directly replaces a per-abstract search by text, which was quadratic and picked the wrong
  # row whenever two abstracts had identical preprocessed text.
  cluster_rows=np.where(cluster_labels==cluster)[0]
  # The model's own vocabulary, so token ids match the ones it was trained with.
  dictionary=lda_models[cluster].id2word
  dominant_topic_document_rows=defaultdict(list)
  for row in cluster_rows:
    doc_bow=dictionary.doc2bow(selected_processed_abstracts[row].split())
    topic_distribution=lda_models[cluster].get_document_topics(doc_bow)
    if not topic_distribution:
      continue  # the model reported no topic for this document
    dominant_topic=max(topic_distribution, key=lambda x: x[1])[0]
    dominant_topic_document_rows[dominant_topic].append(row)

  fig = go.Figure()
  for top_ind in range(len(df_cluster)):
    # A label may cover several LDA topics, stored as a comma-separated list of topic ids.
    merged_topics=[int(num) for num in str(df_cluster['Merged Topics'][top_ind]).split(',')]
    topic_rows=[row for top in merged_topics for row in dominant_topic_document_rows.get(top,[])]
    quarter_counts=(filtered_df_query.iloc[topic_rows]
                    .groupby(['year','Quarter']).size().reset_index(name='Count')
                    .set_index(['year','Quarter']))
    # Add the counts onto the all-zero template so periods without papers stay in the series as 0.
    cumelative_df=create_df().set_index(['year','Quarter']).add(quarter_counts,fill_value=0)
    cumelative_df=cumelative_df['Count'].astype(int).groupby(level='year').sum().reset_index()
    cumelative_df.columns = ['year', 'Count']
    # Years without any retrieved paper have no total and yield NaN, which the rolling mean skips.
    cumelative_df['Normalized_Count'] = cumelative_df['Count'] / cumelative_df['year'].map(yearly_query_totals)
    cumelative_df['Moving_Average'] = cumelative_df['Normalized_Count'].rolling(window=window_size, min_periods=1).mean()
    label="Topic "+str(top_ind)+": "+df_cluster['Topic Label'][top_ind]
    fig.add_trace(go.Scatter(x=cumelative_df['year'].astype(str),
                              y=cumelative_df['Moving_Average'],
                              mode='lines',
                              name=label,
                              visible='legendonly'  # Set visibility to 'legendonly' by default
                              ))

  fig.update_layout(
      title='Topic papers to Query papers per year ratio using moving average of '+str(window_size) +' for Cluster '  +str(cluster),
      xaxis_title='Year',
      yaxis_title='the moving average ratio',
      xaxis=dict(tickangle=-45),
      legend=dict(traceorder='normal')
  )

  fig.show()

In [None]:
# @title Method 4
# Moving average of the quarterly topic share, one figure per cluster.
# Defined before the loops because the figure title needs it even when a cluster has no labeled topic.
window_size = 20  # Set the value you want
# Quarterly totals of the retrieved papers, looked up by (year, quarter). Dividing two columns
# directly would pair rows by position, which shifts the ratios as soon as a quarter has no
# retrieved paper.
quarterly_query_totals=year_quarter_query_counts.set_index(['year','Quarter'])['Count']
# lda_models holds one model per cluster, so its length gives the clusters to visit.
for cluster in range(len(lda_models)):
  df_cluster=labeled_topics[labeled_topics['Cluster']==cluster].reset_index(drop=True)
  # Positions of this cluster's abstracts in selected_processed_abstracts. filtered_df_query is
  # built from the same selected_indices, so the same positions address its rows. Using them
  # directly replaces a per-abstract search by text, which was quadratic and picked the wrong
  # row whenever two abstracts had identical preprocessed text.
  cluster_rows=np.where(cluster_labels==cluster)[0]
  # The model's own vocabulary, so token ids match the ones it was trained with.
  dictionary=lda_models[cluster].id2word
  dominant_topic_document_rows=defaultdict(list)
  for row in cluster_rows:
    doc_bow=dictionary.doc2bow(selected_processed_abstracts[row].split())
    topic_distribution=lda_models[cluster].get_document_topics(doc_bow)
    if not topic_distribution:
      continue  # the model reported no topic for this document
    dominant_topic=max(topic_distribution, key=lambda x: x[1])[0]
    dominant_topic_document_rows[dominant_topic].append(row)

  fig = go.Figure()
  for top_ind in range(len(df_cluster)):
    # A label may cover several LDA topics, stored as a comma-separated list of topic ids.
    merged_topics=[int(num) for num in str(df_cluster['Merged Topics'][top_ind]).split(',')]
    topic_rows=[row for top in merged_topics for row in dominant_topic_document_rows.get(top,[])]
    quarter_counts=(filtered_df_query.iloc[topic_rows]
                    .groupby(['year','Quarter']).size().reset_index(name='Count')
                    .set_index(['year','Quarter']))
    # Add the counts onto the all-zero template so periods without papers stay in the series as 0.
    cumelative_df=create_df().set_index(['year','Quarter']).add(quarter_counts,fill_value=0)
    cumelative_df['Count']=cumelative_df['Count'].astype(int)
    # Quarters without any retrieved paper have no total and yield NaN, which the rolling mean skips.
    cumelative_df['Normalized_Count'] = cumelative_df['Count'] / quarterly_query_totals.reindex(cumelative_df.index)
    cumelative_df['Moving_Average'] = cumelative_df['Normalized_Count'].rolling(window=window_size, min_periods=1).mean()
    cumelative_df=cumelative_df.reset_index()
    label="Topic "+str(top_ind)+": "+df_cluster['Topic Label'][top_ind]
    fig.add_trace(go.Scatter(x=cumelative_df['year'].astype(str)+ '-' + cumelative_df['Quarter'],
                              y=cumelative_df['Moving_Average'],
                              mode='lines',
                              name=label,
                              visible='legendonly'  # Set visibility to 'legendonly' by default
                              ))

  fig.update_layout(
      # The series is quarterly, so the title says "per quarter".
      title='Topic papers to Query papers per quarter ratio using moving average of '+str(window_size) +' for Cluster '  +str(cluster),
      xaxis_title='Year-Quarter',
      yaxis_title='the moving average ratio',
      xaxis=dict(tickangle=-45),
      legend=dict(traceorder='normal')
  )

  fig.show()

# **`Summarization Step`**

In [None]:
# @title Help Functions
def generate_summary(prompt,key,model="gpt-3.5-turbo-0125"):
  """Send `prompt` as a single user message to an OpenAI chat model and return the reply text.

  Args:
    prompt: The full text of the user message.
    key: The API key handed to the OpenAI client.
    model: Name of the chat model to call; defaults to the model this notebook was written for.

  Returns:
    The content of the first choice of the chat completion.
  """
  client = OpenAI(api_key=key)
  chat_completion = client.chat.completions.create(
      messages=[{"role": "user", "content": prompt}],
      model=model,
  )
  return chat_completion.choices[0].message.content

def get_topic(abstracts_per_cluster,abstracts_per_cluster_emb,topic_num,cluster,lda_models):
  """Collect the abstracts of one cluster whose dominant LDA topic is `topic_num`.

  Args:
    abstracts_per_cluster: Preprocessed abstracts of the cluster (space-separated tokens).
    abstracts_per_cluster_emb: Embeddings of those abstracts, in the same order.
    topic_num: Id of the topic to select.
    cluster: Index into `lda_models` of the model to use.
    lda_models: Sequence of fitted topic models, one per cluster.

  Returns:
    Two lists in matching order: the selected abstracts and their embeddings.
  """
  token_lists = [text.split() for text in abstracts_per_cluster]
  vocabulary = corpora.Dictionary(token_lists)
  chosen_abstracts = []
  chosen_embeddings = []
  for position, tokens in enumerate(token_lists):
      # Bag-of-words form of the document, then its (topic id, weight) pairs.
      weights = lda_models[cluster].get_document_topics(vocabulary.doc2bow(tokens))
      # The dominant topic is the pair with the largest weight.
      strongest_topic = max(weights, key=lambda pair: pair[1])[0]
      if strongest_topic != topic_num:
        continue
      chosen_abstracts.append(abstracts_per_cluster[position])
      chosen_embeddings.append(abstracts_per_cluster_emb[position])
  return chosen_abstracts, chosen_embeddings

def find_indices(x, y):
    """Locate every element of `y` inside `x`.

    Returns a list with one entry per element of `y`: its position in `x`
    (the last position when the element occurs several times in `x`), or
    None when the element does not occur in `x`.
    """
    position_of = {}
    for position, item in enumerate(x):
        # Later occurrences overwrite earlier ones.
        position_of[item] = position

    # dict.get already yields None for elements that are missing from x.
    return [position_of.get(item) for item in y]
def generate_summary_prompt(topic_label, abstracts):
    """
    Build the summarization prompt for one topic.

    The prompt consists of a header line with the topic label, a numbered list
    of the abstracts (numbering starts at 1) and a trailing "Summary:" section.

    Args:
    - topic_label (str): The label of the topic.
    - abstracts (list of str): Abstracts dominated by that topic.

    Returns:
    - prompt (str): The assembled prompt.
    """
    numbered_abstracts = [f"{number}-{text}\n" for number, text in enumerate(abstracts, start=1)]
    sections = [
        f"Topic Label: {topic_label}\n\n",
        "Abstracts Dominated by this Topic:\n",
        *numbered_abstracts,
        "\nSummary:\n",
    ]
    return "".join(sections)

def select_top_abstracts(topic_label_embedding, abstract_embeddings, abstracts, top_n):
    """
    Pick the abstracts whose embeddings are closest to the topic label embedding.

    Args:
    - topic_label_embedding (np.array): Embedding of the topic label.
    - abstract_embeddings (list of np.array): Embeddings of the abstracts.
    - abstracts (list of str): The abstracts, in the same order as their embeddings.
    - top_n (int): How many abstracts to return.

    Returns:
    - top_abstracts (list of str): Up to top_n abstracts, most similar first.
    """
    # First row of the cosine-similarity result: one score per abstract.
    scores = np.array(util.cos_sim(topic_label_embedding, abstract_embeddings))[0]
    # Positions ordered from highest to lowest score, cut to the requested amount.
    best_positions = np.argsort(scores)[::-1][:top_n]
    return [abstracts[position] for position in best_positions]

In [None]:
# @title generate the summary
# Ask for a topic label, gather the abstracts dominated by the LDA topics merged under that
# label, keep the 50 closest to the label embedding and summarize their original texts.
topic_label=input("Enter the topic you want to summarize: ")
topic_info=labeled_topics[labeled_topics["Topic Label"]==topic_label].reset_index(drop=True)
if topic_info.empty:
  # Without this check a mistyped label surfaces as an unexplained KeyError on the next line.
  raise ValueError(f"Unknown topic label {topic_label!r}: it must match a value of the 'Topic Label' column exactly.")
cluster=topic_info["Cluster"][0]
# A label may cover several LDA topics, stored as a comma-separated list of topic ids.
merged_topics=str(topic_info['Merged Topics'][0]).split(',')
merged_topics_list=[int(num) for num in merged_topics]
topic_label_emb=model.encode(topic_label)
cluster_indices=np.array(np.where(cluster_labels==cluster))
abstracts=[selected_processed_abstracts[i] for i in cluster_indices[0]]
abstracts_emb=[selected_processed_abstracts_embeddings[i] for i in cluster_indices[0]]
processed_abstracts_per_topic=[]
processed_abstracts_emb_per_topic=[]

for topic_num in merged_topics_list:
  abs_list,abs_emb_list=get_topic(abstracts, abstracts_emb,topic_num,cluster,lda_models)
  processed_abstracts_per_topic.extend(abs_list)
  processed_abstracts_emb_per_topic.extend(abs_emb_list)

if not processed_abstracts_per_topic:
  raise ValueError(f"No abstract has one of the topics {merged_topics_list} of cluster {cluster} as its dominant topic.")

top_relevant_abstracts_for_topic=select_top_abstracts(topic_label_emb,np.array(processed_abstracts_emb_per_topic), processed_abstracts_per_topic,50)
# Map the preprocessed texts back to their rows in df_sorted to send the original abstracts.
temp=find_indices(processed_abstracts,top_relevant_abstracts_for_topic)
abstracts_per_topic=[df_sorted['abstract'][i] for i in temp]
prompt=generate_summary_prompt(topic_label,abstracts_per_topic)

key=os.getenv("YOUR_API_KEY") # Replace it with the name of the environment variable that holds your key
summary=generate_summary(prompt,key)
print(summary)