Scrape Schoolar

In [1]:
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd

# Setup browser
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

# Nama penulis dan afiliasi
first_name = "Kusman"
last_name = "Sadik"
affilitate = "IPB University"
affilitate_full = "Institut Pertanian Bogor"

# List untuk menyimpan data
results = []

def scrape_google_scholar():
    driver.get("https://scholar.google.com")

    # Input pencarian
    driver.find_element(By.ID, "gs_hdr_tsi").send_keys(f"{first_name} {last_name} {affilitate}")
    driver.find_element(By.ID, "gs_hdr_tsb").click()

    # Klik profil penulis
    user_element = WebDriverWait(driver, 5).until(
        EC.visibility_of_any_elements_located((By.CLASS_NAME, "gs_rt2"))
    )[0]
    driver.get(user_element.find_element(By.TAG_NAME, "a").get_attribute("href"))

    # Load semua publikasi
    while True:
        try:
            load_more_button = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, "gsc_bpf_more"))
            )

            # jika tombol tidak aktif, berhenti
            if not load_more_button.is_enabled():
                break

            # cek apakah tombol benar-benar bisa diklik
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.ID, "gsc_bpf_more"))
            )
            load_more_button.click()

            # beri jeda agar tidak bentrok saat loading
            WebDriverWait(driver, 2).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "gs_md_d"))
            )

        except (NoSuchElementException, ElementClickInterceptedException):
            break


    # Ekstrak data publikasi
    publications = driver.find_elements(By.CLASS_NAME, "gsc_a_tr")
    for publication in publications:
        title_element = publication.find_element(By.CLASS_NAME, "gsc_a_t")
        title = title_element.find_element(By.CLASS_NAME, "gsc_a_at").text.replace(';', ',')
        cited_count = publication.find_element(By.CLASS_NAME, "gsc_a_ac").text.replace(';', ',')
        year = publication.find_element(By.CLASS_NAME, "gsc_a_h").text.replace(';', ',')

        descriptions = title_element.find_elements(By.TAG_NAME, "div")
        author = descriptions[0].text.replace(';', ',') if len(descriptions) > 0 else ""
        journal_name = descriptions[1].text.replace(';', ',') if len(descriptions) > 1 else ""

        results.append({
            "source": "Google Scholar",
            "type": "Article",
            "title": title,
            "author": author,
            "publisher": journal_name,
            "year": year,
            "quartile": "N/A",
            "cited": cited_count,
            "is_success": "N/A",
            "info": "N/A"
        })

# Jalankan scraping
scrape_google_scholar()
driver.quit()

# Simpan ke Excel
df = pd.DataFrame(results)
output_path = "schoolar.xlsx"
df.to_excel(output_path, index=False)

print(f"✅ Data berhasil disimpan di: {output_path}")

✅ Data berhasil disimpan di: schoolar.xlsx


Scrape SINTA

In [3]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from getpass import getpass

# === 1. Input login SINTA ===
USERNAME = input("Masukkan username/email SINTA: ")
PASSWORD = getpass("Masukkan password SINTA: ")

# === 2. Setup Chrome ===
options = uc.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--headless')  # Hapus komentar jika ingin melihat browser

driver = uc.Chrome(options=options)
wait = WebDriverWait(driver, 20)

try:
    # === 3. Login ke SINTA ===
    driver.get("https://sinta.kemdikbud.go.id/logins")
    wait.until(EC.presence_of_element_located((By.NAME, "username")))
    driver.find_element(By.NAME, "username").send_keys(USERNAME)
    driver.find_element(By.NAME, "password").send_keys(PASSWORD)
    driver.find_element(By.XPATH, "//button[contains(text(), 'Login')]").click()
    wait.until(EC.url_changes("https://sinta.kemdikbud.go.id/logins"))
    print("✅ Login berhasil!")

    # === 4. Akses halaman Profil ===
    url = "https://sinta.kemdikbud.go.id/authors/profile/36142"
    driver.get(url)
    print("➡ Mengakses profil pengguna...")
    time.sleep(3)

    profil = []

    try:
        # === Nama ===
        nama = driver.find_element(By.CSS_SELECTOR, "div.row.p-3 h3").text.strip()

        # === Institusi ===
        institusi = driver.find_element(By.XPATH, "//a[contains(@href, '/affiliations/profile/')]").text.strip()

        # === Departemen ===
        try:
            departemen = driver.find_element(By.XPATH, "//a[contains(@href, '/departments/profile/')]").text.strip()
        except:
            departemen = ""

        # === SINTA ID ===
        sinta_id = ""
        sinta_links = driver.find_elements(By.CSS_SELECTOR, ".meta-profile a")
        for a in sinta_links:
            if "SINTA ID" in a.text:
                sinta_id = a.text.strip().split(":")[-1].strip()

        # === Subject Area ===
        try:
            subject_elements = driver.find_elements(By.CSS_SELECTOR, "ul.subject-list li")
            subject_areas = [li.text.strip() for li in subject_elements]
            subject_area = "; ".join(subject_areas)
        except:
            subject_area = ""

        # === Skor dari .stat-profile ===
        sinta_score_overall = sinta_score_3yr = affil_score = affil_score_3yr = ""
        try:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "stat-profile")))
            stat_section = driver.find_element(By.CLASS_NAME, "stat-profile")
            rows = stat_section.find_elements(By.CLASS_NAME, "col-sm-4")

            if len(rows) >= 4:
                sinta_score_overall = rows[0].text.split("\n")[0].strip()
                sinta_score_3yr = rows[1].text.split("\n")[0].strip()
                affil_score = rows[2].text.split("\n")[0].strip()
                affil_score_3yr = rows[3].text.split("\n")[0].strip()
            else:
                print("⚠ Jumlah skor kurang dari 4")

        except Exception as e:
            print("⚠ Gagal mengambil skor dari stat-profile:", e)


        # === Simpan ke list ===
        profil.append({
            "Nama": nama,
            "Institusi": institusi,
            "Departemen": departemen,
            "SINTA ID": sinta_id,
            "Subject Area": subject_area,
            "SINTA Score Overall": sinta_score_overall,
            "SINTA Score 3Yr": sinta_score_3yr,
            "Affil Score": affil_score,
            "Affil Score 3Yr": affil_score_3yr
        })

    except Exception as e:
        print("⚠ Gagal memproses profil:", e)

    # === 5. Simpan ke Excel ===
    if profil:
        df_profil = pd.DataFrame(profil)
        df_profil.to_excel("profil_sinta.xlsx", index=False)
        print("✅ Data disimpan ke profil_sinta.xlsx")
    else:
        print("⚠ Tidak ada data ditemukan.")

finally:
    driver.quit()

✅ Login berhasil!
➡ Mengakses profil pengguna...
✅ Data disimpan ke profil_sinta.xlsx


Scrape Scopus

In [4]:
import requests
import pandas as pd
import time

# Set API Key dan Author ID Scopus
api_key = "81b43f2badb4f475df7a6c3d6761e9d5"
author_id = "57190939346"

# URL dasar untuk Scopus Search API
base_url = "https://api.elsevier.com/content/search/scopus"

# Parameter paginasi
count = 25
start = 0
all_publications = []

# Looping ambil data
while True:
    params = {
        "query": f"AU-ID({author_id})",
        "count": count,
        "start": start,
        "field": "dc:title,prism:publicationName,prism:coverDate,eid,citedby-count"
    }

    headers = {
        "X-ELS-APIKey": api_key,
        "Accept": "application/json"
    }

    response = requests.get(base_url, params=params, headers=headers)

    if response.status_code != 200:
        print(f"Error: {response.status_code}\n{response.text}")
        break

    data = response.json()
    entries = data.get("search-results", {}).get("entry", [])

    if not entries:
        break

    df = pd.json_normalize(entries)
    all_publications.append(df)

    print(f"✅ Retrieved {len(entries)} entries. Total so far: {len(all_publications)}")

    total_results = int(data["search-results"].get("opensearch:totalResults", 0))
    if start + count >= total_results:
        break

    start += count
    time.sleep(1)

# Gabungkan semua DataFrame
pub_df_raw = pd.concat(all_publications, ignore_index=True)

# Bersihkan dan transformasi kolom
pub_df = pd.DataFrame({
    "Title": pub_df_raw.get("dc:title", pub_df_raw.get("prism:title")),
    "Journal": pub_df_raw.get("prism:publicationName"),
    "Publication_Date": pub_df_raw.get("prism:coverDate"),
    "Citations": pd.to_numeric(pub_df_raw.get("citedby-count", 0), errors="coerce").fillna(0).astype(int),
    "EID": pub_df_raw.get("eid")
})

pub_df["Tahun"] = pub_df["Publication_Date"].str[:4]
pub_df.insert(0, "No", range(1, len(pub_df) + 1))

# Tampilkan hasil
print("\n📄 Data publikasi yang berhasil diproses:")
print(pub_df.head(10))

# Simpan ke Excel (jika ingin)
pub_df.to_excel("scopus_result.xlsx", index=False)


✅ Retrieved 25 entries. Total so far: 1
✅ Retrieved 24 entries. Total so far: 2

📄 Data publikasi yang berhasil diproses:
   No                                              Title  \
0   1  Simulation and Empirical Studies of Long Short...   
1   2  MODELING THE INCIDENCE OF MALNUTRITION IN BOGO...   
2   3  Analyzing multilevel model of educational data...   
3   4  A PRELIMINARY STUDY OF SENTIMENT ANALYSIS ON C...   
4   5  SIMULATION STUDY OF HIERARCHICAL BAYESIAN APPR...   
5   6  TRANSFER FUNCTION AND ARIMA MODEL FOR FORECAST...   
6   7  MTSClust with Handling Missing Data Using VAR-...   
7   8  Vector Autoregressive-Moving Average Imputatio...   
8   9  POISSON-LOGNORMAL MODEL WITH MEASUREMENT ERROR...   
9  10  Simulation for Time Series Classification Usin...   

                                             Journal Publication_Date  \
0                          Jurnal Online Informatika       2025-04-01   
1                                           Barekeng       2024-06-01  