In [None]:
import csv
import re
import time
import requests
from tqdm import tqdm

# 設定 CrossRef API 的基本參數
CROSSREF_BASE_URL = "https://api.crossref.org/works"

# 設定 OpenCitations API 的基本參數
OPENCITATIONS_INDEX_URL = "https://opencitations.net/index/api/v2/citation-count/doi:"

# 設定查詢條件
search_querys = ["'information technology capability'", "'organizational resilience'"]
targets = [50, 10]  # 最低引用次數/年

for n, search_query in enumerate(search_querys):
    result = []

    # 查詢參數
    rows = 500  # 每次請求的結果數量（CrossRef 建議使用較小的數量以避免過載）
    offset = 0  # 每次偏移量
    cursor = "*"  # 起始游標
    target = targets[n]  # 最低引用次數/年
    flag = True

    # 初始化 CSV 文件
    csv_file = search_query.replace("'", "").replace(" ", "_")+".csv"
    result.append(["Title", "Abstract", "Citations", "Year", "Link"])

    # 構建 CrossRef 請求參數
    params = {
        "query.bibliographic": search_query,
        "rows": rows,
        "select": "title,abstract,published-print,published-online,created,URL,DOI,is-referenced-by-count",
        "filter": "from-pub-date:2020-01-01,has-abstract:true",
        "sort": "is-referenced-by-count",
        "order": "desc",
        "cursor": cursor
    }

    try:
        # 發送 GET 請求到 CrossRef API
        response = requests.get(
            CROSSREF_BASE_URL, params=params, timeout=10)
        response.raise_for_status()  # 如果響應狀態碼不是 200，則引發異常
    except requests.exceptions.RequestException as e:
        print(f"Error fetching CrossRef results: {e}")
        flag = False

    # 解析 JSON 響應
    data = response.json()

    # 獲取總結果數量（僅在第一次請求時獲取）
    total_results = data["message"]["total-results"]
    if total_results == 0:
        print(f"無查詢結果。")
        flag = False

    # 獲取當前頁的項目
    items = data["message"]["items"]
    if not items:
        print("無查詢項目。")
        flag = False

    # 獲取下一頁的游標
    cursor = data["message"]["next-cursor"]

    loop = tqdm(range(total_results))
    loop.set_description(search_query.replace("'", ""))
    for i in loop:
        if i >= rows and i % rows == 0:
            # 構建 CrossRef 請求參數
            params = {
                "query.bibliographic": search_query,
                "rows": rows,
                "select": "title,abstract,published-print,published-online,created,URL,DOI,is-referenced-by-count",
                "filter": "from-pub-date:2020-01-01,has-abstract:true",
                "sort": "is-referenced-by-count",
                "order": "desc",
                "cursor": cursor
            }
            try:
                # 發送 GET 請求到 CrossRef API
                response = requests.get(
                    CROSSREF_BASE_URL, params=params, timeout=10)
                response.raise_for_status()  # 如果響應狀態碼不是 200，則引發異常

                # 解析 JSON 響應
                data = response.json()

                # 獲取當前頁的項目
                items = data["message"]["items"]
                if not items:
                    print("無查詢項目。")
                    continue

                # 獲取下一頁的游標
                cursor = data["message"]["next-cursor"]
                offset += rows

                # 為避免觸發速率限制，適當添加延遲
                time.sleep(0.3)
            except requests.exceptions.RequestException as e:
                print(f"Error fetching CrossRef results: {e}")
                continue

        item = items[i-offset]

        # 提取 DOI
        doi = item.get("DOI", None)

        # 提取被引用次數
        cite = item.get("is-referenced-by-count", 0)
        if cite <= target // 10:
            loop.close()
            break

        # 提取發布年份
        if "published-print" in item and "date-parts" in item["published-print"]:
            published_year = item["published-print"]["date-parts"][0][0]
        elif "published-online" in item and "date-parts" in item["published-online"]:
            published_year = item["published-online"]["date-parts"][0][0]
        elif "created" in item and "date-parts" in item["created"]:
            published_year = item["created"]["date-parts"][0][0]
        else:
            published_year = 0

        # 檢查缺值
        if doi and published_year >= 2020 and cite >= target:
            cite_year = cite/(2025-published_year)
            if cite_year >= 100:
                # 提取標題
                title = item.get("title", "No title available")[
                    0].replace("\n", " ").strip()

                # 提取摘要
                abstract = item.get(
                    "abstract", "No abstract available")
                abstract = re.sub(
                    r"\s{2,}", " ", abstract, flags=re.I | re.M)
                abstract = re.sub(r"<[^>]*>", "", abstract,
                                  flags=re.I | re.M).strip()

                # 提取連結
                link = item.get("URL", "No link available")

                # 使用 OpenCitations API 獲取引用次數
                citations = 0
                citations_year = 0

                # OpenCitations 使用 DOI 來查詢引用次數/年
                try:
                    header = {
                        "authorization": "6b17ce4d-1339-4e73-aa08-1ca3be1ebda6"}
                    # 查詢被引用次數
                    opencitations_url = OPENCITATIONS_INDEX_URL+doi
                    opencitations_response = requests.get(
                        opencitations_url, headers=header)
                    if opencitations_response.status_code == 200:
                        citations_data = opencitations_response.json()
                        citations = int(citations_data[0]["count"])
                    else:
                        print(opencitations_response.content)
                    if published_year > 0 and citations > 0:
                        citations_year = citations / \
                            (2025-published_year)

                    # 為避免觸發速率限制，適當添加延遲
                    time.sleep(0.3)
                except Exception as e:
                    print(
                        f"Error fetching OpenCitations for DOI {doi}: {e}")

                # 應用過濾條件：引用次數大於 target
                if citations_year >= target:
                    result.append(
                        [title, abstract, citations, published_year, link])

    with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        for x in result:
            # 寫入
            writer.writerow(x)
    print(f"所有符合條件的數據已保存到 {csv_file}")

information technology capability:  10%|▉         | 55001/566201 [10:20<1:36:06, 88.66it/s] 


所有符合條件的數據已保存到 information technology capability.csv


organizational resilience:  28%|██▊       | 19501/68840 [04:30<11:23, 72.22it/s] 

所有符合條件的數據已保存到 organizational resilience.csv





TF

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.feature_extraction import text

for search_query in search_querys:
    csv_file = search_query.replace("'", "").replace(" ", "_")+".csv"
    df = pd.read_csv(csv_file)
    
    # 讀取清理過的文本
    cleaned_text = ""
    for x in df["Abstract"]:
        cleaned_text += x

    # 自定義停用詞
    custom_stop_words = {"et", "al", "amp", "bob", "yoo", "form",
                        "ing", "st", "es", "lower", "billion", "na", "dt", "dts"}

    # 將自定義停用詞與內建的英語停用詞結合
    stop_words = list(text.ENGLISH_STOP_WORDS.union(custom_stop_words))  # 轉換為列表格式

    # 使用 CountVectorizer 計算詞頻，支持生成詞句 (bi-grams, tri-grams)，並排除數字和單一字母
    vectorizer = CountVectorizer(
        stop_words=stop_words,
        token_pattern=r"\b[a-zA-Z]{2,}\b",  # 僅保留兩個或更多字母的詞彙
        ngram_range=(1, 3)  # 設置為 bi-grams 和 tri-grams
    )
    X = vectorizer.fit_transform([cleaned_text])

    # 獲取所有詞句和其詞頻
    phrases = vectorizer.get_feature_names_out()
    frequencies = X.toarray().flatten()

    # 創建詞頻數據框
    df_tf = pd.DataFrame({"phrase": phrases, "frequency": frequencies})

    # 按詞頻排序並顯示前 10 個最常見的詞句
    df_tf = df_tf.sort_values(by="frequency", ascending=False)
    print(df_tf.head(10))

    # 存儲為 CSV 文件
    output_csv_path = csv_file[:-4]+"_TF.csv"
    df_tf.to_csv(output_csv_path, index=False)

          phrase  frequency
7701    research         30
5248    learning         28
2281        data         27
9032  technology         22
9563        used         21
7880      review         19
6050         new         19
1517     chatgpt         18
9533         use         18
6524    patients         17
          phrase  frequency
1891    pandemic         17
2284  resilience         14
844     distress         13
2238    research         13
2331      review         12
610        covid         12
987       energy         11
471      concept         10
2880         vsc          9
946    emotional          8


TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction import text

for search_query in search_querys:
    csv_file = search_query.replace("'", "").replace(" ", "_")+".csv"
    df = pd.read_csv(csv_file)

    # 讀取清理過的文本
    cleaned_text = ""
    for x in df["Abstract"]:
        cleaned_text += x

    # 自定義停用詞
    custom_stop_words = {"et", "al", "amp", "bob", "yoo", "form",
                        "ing", "st", "es", "lower", "billion", "na", "dt", "dts"}

    # 將自定義停用詞與內建的英語停用詞結合
    stop_words = list(text.ENGLISH_STOP_WORDS.union(custom_stop_words))  # 轉換為列表格式

    # 使用 TfidfVectorizer 計算 TF-IDF，支持生成詞句 (bi-grams, tri-grams)，並排除數字和單一字母
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        token_pattern=r"\b[a-zA-Z]{2,}\b",  # 僅保留兩個或更多字母的詞彙
        ngram_range=(1, 3)  # 設置為 bi-grams 和 tri-grams
    )
    X = vectorizer.fit_transform([cleaned_text])

    # 獲取所有詞句和其 TF-IDF 分數
    phrases = vectorizer.get_feature_names_out()
    tfidf_scores = X.toarray().flatten()

    # 創建 TF-IDF 數據框
    df_tfidf = pd.DataFrame({"phrase": phrases, "tfidf_score": tfidf_scores})

    # 按 TF-IDF 分數排序並顯示前 10 個最重要的詞句
    df_tfidf = df_tfidf.sort_values(by="tfidf_score", ascending=False)
    print(df_tfidf.head(10))

    # 存儲為 CSV 文件
    output_csv_path = csv_file[:-4]+"_TF-IDF.csv"
    df_tfidf.to_csv(output_csv_path, index=False)

          phrase  tfidf_score
7701    research     0.167418
5248    learning     0.156256
2281        data     0.150676
9032  technology     0.122773
9563        used     0.117192
7880      review     0.106031
6050         new     0.106031
1517     chatgpt     0.100451
9533         use     0.100451
6524    patients     0.094870
          phrase  tfidf_score
1891    pandemic     0.206596
2284  resilience     0.170138
844     distress     0.157985
2238    research     0.157985
2331      review     0.145833
610        covid     0.145833
987       energy     0.133680
471      concept     0.121527
2880         vsc     0.109375
946    emotional     0.097222


KeyBERT

In [7]:
import csv
import re
import pandas as pd
from keybert import KeyBERT

for search_query in search_querys:
    csv_file = search_query.replace("'", "").replace(" ", "_")+".csv"
    df = pd.read_csv(csv_file)

    # 讀取清理過的文本
    cleaned_text = ""
    for x in df["Abstract"]:
        cleaned_text += x+"\n"

    # 加載 KeyBERT 模型
    model = KeyBERT('Paraphrase-mpnet-base-v2')

    # 提取關鍵詞
    keywords = model.extract_keywords(cleaned_text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=100)

    # 過濾掉包含數字的關鍵詞並篩選分數大於0.6的
    nowords = r"\d|biochar|research"
    filtered_keywords = [(keyword, score) for keyword, score in keywords if not re.search(nowords, keyword) and score > 0.3]

    # 輸出關鍵詞
    for keyword, score in filtered_keywords:
        print(f'Keywords: {keyword}, Score: {score}')

    # 將過濾後的關鍵詞轉換為 DataFrame
    df_keywords = pd.DataFrame(filtered_keywords, columns=['Keywords', 'Score'])

    # 存儲為 CSV 文件
    output_csv_path = csv_file[:-4]+"_KeyBERT.csv"
    df_keywords.to_csv(output_csv_path, index=False)

  from tqdm.autonotebook import tqdm, trange


Keywords: biotechnology information, Score: 0.5919
Keywords: biotechnology, Score: 0.5445
Keywords: applied biotechnology, Score: 0.5087
Keywords: biotechnology biomedical, Score: 0.4908
Keywords: center biotechnology, Score: 0.488
Keywords: bioinformatics, Score: 0.4812
Keywords: science pedagogy, Score: 0.4776
Keywords: based biophysical, Score: 0.4767
Keywords: edtech, Score: 0.4664
Keywords: science education, Score: 0.4647
Keywords: edtech start, Score: 0.4536
Keywords: pubmed science, Score: 0.4505
Keywords: advances bioinformatics, Score: 0.4429
Keywords: scientific community, Score: 0.4399
Keywords: life science, Score: 0.4393
Keywords: web science, Score: 0.4308
Keywords: biofunctionality provide, Score: 0.4302
Keywords: online learning, Score: 0.429
Keywords: online information, Score: 0.4284
Keywords: online teaching, Score: 0.4222
Keywords: science journals, Score: 0.42
Keywords: laboratory, Score: 0.4192
Keywords: biophysical, Score: 0.4158
Keywords: crop biophysical, Scor