In [3]:
import pandas as pd
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm  # Import tqdm for the progress bar
from transformers import AutoModel, AutoTokenizer, pipeline

# Load the pre-trained SciBERT checkpoint (tokenizer + encoder weights).
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# KeyBERT's documented Hugging Face backend takes a `transformers`
# feature-extraction pipeline, not a bare AutoModel, so wrap the objects
# loaded above before handing them to KeyBERT.
# NOTE(review): when given an object it does not recognise, KeyBERT appears to
# fall back to its default sentence-transformers model instead of raising, so
# the bare model would silently not be SciBERT -- confirm against the
# installed KeyBERT version.
embedding_pipeline = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
kw_model = KeyBERT(model=embedding_pipeline)

# Load your CSV file
df = pd.read_csv('/Users/asa/VScode/liter/literature.csv')

# Create an instance of KeyphraseCountVectorizer; it is shared by every
# extract_keywords call made through extract_top_three_keyphrases.
vectorizer = KeyphraseCountVectorizer()

def extract_top_three_keyphrases(text):
    """Return up to three keyphrases extracted from ``text``.

    Uses the module-level ``kw_model`` (KeyBERT) together with the module-level
    ``vectorizer`` and MMR-based selection.

    Args:
        text: The document to extract keyphrases from. Missing values (for
            example the float NaN that pandas uses for empty CSV cells),
            non-string values, and blank strings are treated as having no
            keyphrases instead of being passed on to KeyBERT.

    Returns:
        A list with at most three keyphrase strings, or ``None`` when nothing
        could be extracted (a notice is printed in that case).
    """
    if isinstance(text, str) and text.strip():
        # `stop_words` is intentionally not passed: KeyBERT documents that it
        # is not used once a custom `vectorizer` is supplied, so candidate
        # filtering is governed by the vectorizer itself.
        keywords = kw_model.extract_keywords(
            text,
            vectorizer=vectorizer,
            top_n=3,
            use_mmr=True,
        )
    else:
        # Nothing usable to analyse; fall through to the "no keyphrases" path.
        keywords = []

    if not keywords:
        print("No keyphrases extracted.")
        return None

    # Each result item carries the phrase in its first position; keep only
    # the phrase (top_n=3 already limits the result to three entries).
    return [item[0] for item in keywords]

# Register tqdm's pandas integration so Series gain `progress_apply`,
# which behaves like `apply` but renders a progress bar.
tqdm.pandas()

# Run keyphrase extraction over every entry of the 'keywords' column and
# store the per-row result in a new column.
keyword_column = df['keywords']
df['extracted_3keyword'] = keyword_column.progress_apply(extract_top_three_keyphrases)

# Write the augmented DataFrame (original columns plus the extracted
# keyphrases) to a new CSV, omitting the index.
output_path = 'distill_literature.csv'
df.to_csv(output_path, index=False)
print("Keyphrase extraction complete!")

100%|██████████| 5971/5971 [23:57<00:00,  4.15it/s]


Keyphrase extraction complete!
