# Preprosesing PTA dan Berita

In [1]:
!pip install builtwith
!pip install nltk
!pip install Sastrawi
!pip install pyspellchecker

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... [?25l[?25hdone
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36077 sha256=74652d472a0dd50f9eb3c4f7f77b383b0fc07d55df95f202682536b88ad43fc0
  Stored in directory: /root/.cache/pip/wheels/7f/2d/b2/606e3df914d4aeeab99c4a4e3e9a61673d2293c2e346db00c8
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Collecting pyspellchecker
  Downloading pysp

## Crawling PTA

In [2]:
import builtwith

# Analisis teknologi yang digunakan
res = builtwith.parse('https://pta.trunojoyo.ac.id')
print(res)

{'web-servers': ['Nginx'], 'javascript-frameworks': ['jQuery', 'jQuery UI']}


## Preprocessing Crawling PTA

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from collections import Counter

# --- KONFIGURASI PRA-PEMROSESAN ---
# Kamus untuk normalisasi kata slang/kontraksi bahasa Indonesia
contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "ngga": "tidak", "gk": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya", "lu": "kamu", "loe": "kamu",
    "dah": "sudah", "udah": "sudah", "aja": "saja", "ajah": "saja",
    "yg": "yang", "utk": "untuk", "dlm": "dalam", "dr": "dari", "dg": "dengan",
    "jd": "jadi", "krn": "karena", "tp": "tetapi", "tapi": "tetapi",
    "banget": "sekali", "bgt": "sekali", "lg": "lagi",
}

# Daftar stopword bahasa Indonesia
stop_words = set([
    "yang", "di", "ke", "dan", "dari", "ini", "itu", "pada", "untuk",
    "dengan", "sebagai", "adalah", "merupakan", "dalam", "yaitu",
    "suatu", "sebuah", "dengan", "akan", "telah", "bisa", "agar",
    "dari", "oleh", "hal", "saat", "bahwa", "juga", "atau", "tidak",
    "namun", "tetapi", "kemudian", "sehingga", "serta", "guna",
    "seperti", "yaitu", "melalui", "terhadap", "seperti", "sejak",
    "saja", "hanya", "oleh", "pada", "sampai", "setelah", "sebelum",
    "karena", "maka", "tanpa"
])

# Inisialisasi Stemmer dari Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Inisialisasi SpellChecker
spell = SpellChecker()

# --- FUNGSI-FUNGSI PRA-PEMROSESAN TERPISAH ---

def clean_base_text(text):
    """
    Fungsi dasar untuk membersihkan teks dari simbol, angka, dll.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    words = text.split()
    expanded_words = [contractions_dict.get(word, word) for word in words]
    text = ' '.join(expanded_words)

    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = BeautifulSoup(text, "html.parser").get_text()

    return text

def tokenize_text(text):
    """
    # Proses: Tokenisasi
    # Memisahkan teks menjadi daftar kata (token).
    """
    return text.split()

def remove_stopwords(tokens):
    """
    # Proses: Penghapusan Kata Umum (Stopword)
    # Menghapus kata-kata yang tidak memiliki makna penting dalam analisis teks.
    """
    return [word for word in tokens if word not in stop_words]

def apply_stemming_and_lemmatization(tokens):
    """
    # Proses: Stemming dan Lematisasi
    # Mengubah kata berimbuhan menjadi kata dasar.
    """
    text = ' '.join(tokens)
    stemmed_text = stemmer.stem(text)
    return stemmed_text.split()

def correct_spelling(tokens):
    """
    # Proses: Cek Ejaan Pembakuan Kata
    # Memperbaiki ejaan kata.
    """
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in tokens]
    return corrected_words

# --- FUNGSI-FUNGSI BANTUAN SCRAPING ---

def get_text_or_na(soup, selectors):
    """Mencoba beberapa selektor untuk menemukan teks, mengembalikan 'N/A' jika tidak ditemukan."""
    for selector in selectors:
        element = soup.select_one(selector)
        if element:
            text = element.get_text(strip=True)
            if text and text.strip():
                return text.strip()
    return 'N/A'

def get_data_from_span(soup, text_contains):
    """Mencari span yang berisi teks tertentu dan mengekstrak nama setelah titik dua."""
    span = soup.select_one(f'span:-soup-contains("{text_contains}")')
    if span:
        text = span.get_text(strip=True)
        if ':' in text:
            return text.split(':', 1)[1].strip()
    return 'N/A'

def get_abstract_robust(soup, keywords):
    """
    Mencari tag heading (b) yang teksnya mengandung salah satu dari kata kunci
    dan mengekstrak paragraf berikutnya.
    """
    for tag_b in soup.find_all('b'):
        tag_text = tag_b.get_text(strip=True)
        if any(keyword.lower() in tag_text.lower() for keyword in keywords):
            parent_div = tag_b.find_parent('div')
            if parent_div:
                next_div_sibling = parent_div.find_next_sibling('div')
                if next_div_sibling:
                    p_tag = next_div_sibling.find('p', align="justify")
                    if p_tag:
                        abstract_text = p_tag.get_text(strip=True)
                        if abstract_text:
                            return abstract_text
    return 'N/A'

def get_total_pages(soup):
    """
    Mengekstrak total jumlah halaman dari navigasi paginasi.
    """
    try:
        pagination = soup.select_one('ol.pagination')
        if pagination:
            last_page_li = pagination.select('li')[-1]
            last_page_link = last_page_li.select_one('a')
            if last_page_link and 'href' in last_page_link.attrs:
                url_path = last_page_link['href']
                return int(url_path.split('/')[-1])
    except (IndexError, ValueError, KeyError):
        pass
    return 1

# --- FUNGSI UTAMA SCRAPING ---

def scrape_teknik_all_data():
    """
    Menjalankan proses scraping dan pra-pemrosesan data secara keseluruhan.
    """
    prodi_data = [
        {'name': 'Teknik Industri', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/9'},
        {'name': 'Teknik Informatika', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/10'},
        {'name': 'Manajemen Informatika', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/11'},
        {'name': 'Teknik Multimedia Dan Jaringan', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/19'},
        {'name': 'Mekatronika', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/20'},
        {'name': 'Teknik Elektro', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/23'},
        {'name': 'Sistem Informasi', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/31'},
        {'name': 'Teknik Mesin', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/32'},
        {'name': 'Teknik Mekatronika', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/33'}
    ]

    all_scraped_data = {
        "penulis": [], "judul": [], "pembimbing_pertama": [], "pembimbing_kedua": [],
        "abstrak_indonesia_raw": [], "abstrak_inggris_raw": [],
        "abstrak_indonesia_clean": [], "abstrak_inggris_clean": [],
        "abstrak_indonesia_stopwords": [], "abstrak_inggris_stopwords": [],
        "abstrak_indonesia_stemmed": [], "abstrak_inggris_stemmed": [],
        "abstrak_indonesia_corrected": [], "abstrak_inggris_corrected": [],
        "prodi": []
    }

    total_data_count = 0
    print("--- MULAI PROSES SCRAPING FAKULTAS TEKNIK ---")

    for prodi in prodi_data:
        print(f"\nScraping data for program: {prodi['name']}")
        print("-" * 50)
        first_page_url = prodi['url']
        try:
            r = requests.get(first_page_url)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, "html.parser")
            total_pages = get_total_pages(soup)
            print(f"Ditemukan {total_pages} halaman untuk {prodi['name']}")
        except requests.exceptions.RequestException as e:
            print(f"ERROR: Gagal mengambil halaman pertama untuk {prodi['name']}: {e}", file=sys.stderr)
            total_pages = 1

        for i in range(1, total_pages + 1):
            url = f"{prodi['url']}/{i}"
            try:
                r = requests.get(url)
                r.raise_for_status()
                soup = BeautifulSoup(r.content, "html.parser")
                jurnals = soup.select('li[data-cat="#luxury"]')

                if not jurnals:
                    print(f"Tidak ada jurnal lagi di halaman {i}. Berhenti untuk program ini.")
                    break

                for jurnal in jurnals:
                    jurnal_url = jurnal.select_one('a.gray.button')['href']
                    try:
                        response = requests.get(jurnal_url)
                        response.raise_for_status()
                        soup1 = BeautifulSoup(response.content, "html.parser")
                        isi = soup1.select_one('div#content_journal')
                        if isi:
                            judul = get_text_or_na(isi, ['a.title', 'b.title', 'h2.title'])
                            penulis = get_data_from_span(isi, "Penulis")
                            pembimbing_pertama = get_data_from_span(isi, "Dosen Pembimbing I")
                            pembimbing_kedua = get_data_from_span(isi, "Dosen Pembimbing II")
                            abstrak_indonesia = get_abstract_robust(isi, ["Abstraksi", "Abstrak"])
                            abstrak_inggris = get_abstract_robust(isi, ["Abstraction", "Abstract", "ABSTRACT"])

                            # --- Rangkaian Proses Pra-pemrosesan Teks ---

                            # 1. Pembersihan Dasar (lowercase, normalisasi slang, hapus simbol & angka)
                            clean_indonesia_text = clean_base_text(abstrak_indonesia)
                            clean_inggris_text = clean_base_text(abstrak_inggris)

                            # 2. Tokenisasi
                            tokens_indonesia = tokenize_text(clean_indonesia_text)
                            tokens_inggris = tokenize_text(clean_inggris_text)

                            # 3. Penghapusan Stopword
                            stopwords_indonesia_tokens = remove_stopwords(tokens_indonesia)
                            stopwords_inggris_tokens = remove_stopwords(tokens_inggris)

                            # 4. Stemming dan Lematisasi
                            stemmed_indonesia_tokens = apply_stemming_and_lemmatization(stopwords_indonesia_tokens)
                            stemmed_inggris_tokens = apply_stemming_and_lemmatization(stopwords_inggris_tokens)

                            # 5. Cek Ejaan (Pembakuan Kata)
                            corrected_indonesia_tokens = correct_spelling(stemmed_indonesia_tokens)
                            corrected_inggris_tokens = correct_spelling(stemmed_inggris_tokens)

                            # Menghitung frekuensi kata untuk abstrak yang sudah diproses
                            word_frequency_indonesia = Counter(stemmed_indonesia_tokens)
                            word_frequency_inggris = Counter(stemmed_inggris_tokens)

                            # Menambahkan data ke dictionary
                            all_scraped_data["penulis"].append(penulis)
                            all_scraped_data["judul"].append(judul)
                            all_scraped_data["pembimbing_pertama"].append(pembimbing_pertama)
                            all_scraped_data["pembimbing_kedua"].append(pembimbing_kedua)
                            all_scraped_data["prodi"].append(prodi['name'])
                            all_scraped_data["abstrak_indonesia_raw"].append(abstrak_indonesia)
                            all_scraped_data["abstrak_inggris_raw"].append(abstrak_inggris)
                            all_scraped_data["abstrak_indonesia_clean"].append(' '.join(tokens_indonesia))
                            all_scraped_data["abstrak_inggris_clean"].append(' '.join(tokens_inggris))
                            all_scraped_data["abstrak_indonesia_stopwords"].append(' '.join(stopwords_indonesia_tokens))
                            all_scraped_data["abstrak_inggris_stopwords"].append(' '.join(stopwords_inggris_tokens))
                            all_scraped_data["abstrak_indonesia_stemmed"].append(' '.join(stemmed_indonesia_tokens))
                            all_scraped_data["abstrak_inggris_stemmed"].append(' '.join(stemmed_inggris_tokens))
                            all_scraped_data["abstrak_indonesia_corrected"].append(' '.join(corrected_indonesia_tokens))
                            all_scraped_data["abstrak_inggris_corrected"].append(' '.join(corrected_inggris_tokens))

                            total_data_count += 1
                            print(f"\n--- Data #{total_data_count} ---")
                            print(f"Prodi: {prodi['name']}")
                            print(f"Penulis: {penulis}")
                            print(f"Judul: {judul}")
                            print(f"Pembimbing 1: {pembimbing_pertama}")
                            print(f"Pembimbing 2: {pembimbing_kedua}")
                            print(f"Abstrak (Raw): {abstrak_indonesia}")
                            print(f"Abstrak (Clean): {clean_indonesia_text}")
                            print(f"Abstrak (Stopwords Removed): {' '.join(stopwords_indonesia_tokens)}")
                            print(f"Abstrak (Stemmed): {' '.join(stemmed_indonesia_tokens)}")
                            print(f"Abstrak (Corrected): {' '.join(corrected_indonesia_tokens)}")

                            # Output frekuensi kata
                            print("\n--- Tokenisasi (Perhitungan Jumlah Kata) ---")
                            print(f"Frekuensi Kata (Indonesia): {word_frequency_indonesia}")
                            print(f"Frekuensi Kata (Inggris): {word_frequency_inggris}")
                            print("-------------------------------------------\n")

                    except Exception as e:
                        print(f"ERROR: Terjadi kesalahan saat memproses URL: {jurnal_url} - {e}", file=sys.stderr)

                time.sleep(1)

            except requests.exceptions.RequestException as e:
                print(f"ERROR: Gagal mengambil URL {url}: {e}", file=sys.stderr)
                continue

    df = pd.DataFrame(all_scraped_data)
    df.to_csv("pta_teknik_all_preprocessing_steps.csv", index=False)
    print("\n---")
    print(f"Scraping selesai. Total data yang diambil: {total_data_count}")
    print("Data disimpan ke pta_teknik_all_preprocessing_steps.csv")
    print("---")
    return df

# Menjalankan fungsi utama
scrape_teknik_all_data()

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m

--- Data #30 ---
Prodi: Teknik Industri
Penulis: DWIKY ABDULLOH
Judul: OPTIMASI QUANTITY JUAL PRODUK BERAS DAN QUANTIY BELI GABAH PADA UD.SARI INDAH DENGAN METODE LINIER PROGRAMMING 
(STUDI KASUS:  UD.SARI INDAH, KEC. BENJENG - KAB. GRESIK)
Pembimbing 1: Samsul Amar, S.T., M.Sc.
Pembimbing 2: Mohamad Imron Mustajib, ST., MT.
Abstrak (Raw): Beras merupakan salah satu jenis makanan pokok yang dikonsumsi mayoritas masyarakat Indonesia. Berdasarkan data yang diperoleh dari Badan Pusat Statistik (BPS) dan Badan Urusan Logistik (BULOG) regional Jawa Timur terlihat harga beras yang fluktuatif. Penelitian ini dilakukan pada UD.Sari Indah sebuah bidang usaha penggilingan padi. Salah satu kondisi beras yang sangat berpengaruh pada profit pengusaha beras adalah harga beras yang fluktuatif. Dengan mengunakan metode Linier Programming dapat digunakan untuk mengoptimalkan profit yang akan dicapai oleh pengusaha beras apabila m

ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/9/55: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/9/55 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6a6c2f0>: Failed to establish a new connection: [Errno 113] No route to host'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/9/56: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/9/56 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c78d9520>: Failed to establish a new connection: [Errno 113] No route to host'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/9/57: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/9/57 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c78d82c0>: Failed to e


Scraping data for program: Teknik Informatika
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Teknik Informatika: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/10 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6ed12b0>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/10/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/10/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c71eb620>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Manajemen Informatika
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Manajemen Informatika: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/11 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c71eb260>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/11/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/11/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c7180fe0>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Teknik Multimedia Dan Jaringan
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Teknik Multimedia Dan Jaringan: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/19 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6b3f230>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/19/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/19/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6b3e630>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Mekatronika
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Mekatronika: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/20 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c75df320>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/20/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/20/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c71ea7b0>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Teknik Elektro
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Teknik Elektro: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/23 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6ed2ff0>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/23/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/23/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6ed2840>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Sistem Informasi
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Sistem Informasi: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/31 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6b3c7a0>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/31/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/31/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6b3fec0>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Teknik Mesin
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Teknik Mesin: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/32 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6b3dc40>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/32/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/32/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c6ed3c20>: Failed to establish a new connection: [Errno 111] Connection refused'))



Scraping data for program: Teknik Mekatronika
--------------------------------------------------


ERROR: Gagal mengambil halaman pertama untuk Teknik Mekatronika: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/33 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c71e82c0>: Failed to establish a new connection: [Errno 111] Connection refused'))
ERROR: Gagal mengambil URL https://pta.trunojoyo.ac.id/c_search/byprod/33/1: HTTPSConnectionPool(host='pta.trunojoyo.ac.id', port=443): Max retries exceeded with url: /c_search/byprod/33/1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b84c71ea180>: Failed to establish a new connection: [Errno 111] Connection refused'))



---
Scraping selesai. Total data yang diambil: 270
Data disimpan ke pta_teknik_all_preprocessing_steps.csv
---


Unnamed: 0,penulis,judul,pembimbing_pertama,pembimbing_kedua,abstrak_indonesia_raw,abstrak_inggris_raw,abstrak_indonesia_clean,abstrak_inggris_clean,abstrak_indonesia_stopwords,abstrak_inggris_stopwords,abstrak_indonesia_stemmed,abstrak_inggris_stemmed,abstrak_indonesia_corrected,abstrak_inggris_corrected,prodi
0,Siliwangi Fitra Rachmawanto S.T.,OPTIMASI PEMILIHAN PORTOFOLIO SAHAM PERUSAHAAN...,"Heri Awalul Ilhamsah S.T., M.T.","Retno Indriartiningtias S.T., M.T.",Portofolio adalah sekumpulan saham yang dimili...,Portofolio is a collection of stock owned by i...,portofolio adalah sekumpulan saham yang dimili...,portofolio is a collection of stock owned by i...,portofolio sekumpulan saham dimiliki investor ...,portofolio is a collection of stock owned by i...,portofolio kumpul saham milik investor tiap sa...,portofolio is a collection of stock owned by i...,portfolio rumpus sham milk investor tip sham p...,portfolio is a collection of stock owned by in...,Teknik Industri
1,AHMAD MAS'UD,PERANCANGAN TATA LETAK FASILITAS LANTAI PRODUK...,"SABARUDIN AKHMAD, S.T., M.T.","SUGENG PURWOKO, S.T., M.T.",PT. ABC merupakan perusahaan yang bergerak dib...,PT. ABC is a company engaged in the manufactur...,pt abc merupakan perusahaan yang bergerak dibi...,pt abc is a company engaged in the manufacture...,pt abc perusahaan bergerak dibidang manufaktur...,pt abc is a company engaged in the manufacture...,pt abc usaha gerak bidang manufaktur kayu prod...,pt abc is a company engaged in the manufacture...,pt arc sara gera biding manufacture kay produk...,pt arc is a company engaged in the manufacture...,Teknik Industri
2,Yulianto Fauzanta,PERUMUSAN STRATEGI BISNIS UD. BUDI JAYA BANGKA...,"Fitri Agustina, S.T., M.T","Retno Indriartiningtias, S.T., M.T",Bangkalan merupakan salah satu kabupaten yang ...,Bangkalan is one of the districts that have th...,bangkalan merupakan salah satu kabupaten yang ...,bangkalan is one of the districts that have th...,bangkalan salah satu kabupaten memiliki potens...,bangkalan is one of the districts that have th...,bangkal salah satu kabupaten milik potensi ala...,bangkal is one of the districts that have the ...,bangka salad sat kabupaten milk potent alarm s...,bangka is one of the districts that have the p...,Teknik Industri
3,M Mundir Muhlisin,USULAN PERBAIKAN UTILITAS RESOURCES PADA LANTA...,Mu'alim ST MT,Sugeng Purwoko ST MT,Simulasi adalah duplikasi atau abstraksi dari ...,Simulation is a duplication or abstraction of ...,simulasi adalah duplikasi atau abstraksi dari ...,simulation is a duplication or abstraction of ...,simulasi duplikasi abstraksi persoalan kehidup...,simulation is a duplication or abstraction of ...,simulasi duplikasi abstraksi soal hidup nyata ...,simulation is a duplication or abstraction of ...,simulate duplikasi abstraksi soul hide nyala m...,simulation is a duplication or abstraction of ...,Teknik Industri
4,Muhibbin,Peningkatan Kepuasan Masyarakat Terhadap Pelay...,Rahmad Hidayat,Retno Indriartiningtias,Kepuasan adalah tingkat perasaan seseorang ter...,Satisfaction is feeling level of someone to se...,kepuasan adalah tingkat perasaan seseorang ter...,satisfaction is feeling level of someone to se...,kepuasan tingkat perasaan seseorang pelayanan ...,satisfaction is feeling level of someone to se...,puas tingkat asa orang layan banding antara ke...,satisfaction is feeling level of someone to se...,puts dingbat asa orang layman banding angara e...,satisfaction is feeling level of someone to se...,Teknik Industri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,LIDA PUTRI HARIYADI,USULAN PERBAIKAN DENGAN MENGGUNAKAN METODE OMA...,"SUGENG PURWOKO, ST., MT","AGUS SALIM, ST., MT",ABSTRAK\r\nPG.Djombang baru merupakan perusaha...,ABSTRACT\r\nPG.Djombang Baru is a company prod...,abstrak pgdjombang baru merupakan perusahaan y...,abstract pgdjombang baru is a company producin...,abstrak pgdjombang baru perusahaan memproduksi...,abstract pgdjombang baru is a company producin...,abstrak pgdjombang baru usaha produksi gula ne...,abstract pgdjombang baru is a company producin...,abstract pgdjombang bar sara produksi gula neg...,abstract pgdjombang bar is a company producing...,Teknik Industri
266,rycko tyger d'gasha,pengembangan blender bumbu bebek sonkem portab...,"sabarudin akhmad ST,.MT","mu'alim ST,.MT",ABSTRAK\r\nBebek songkem merupakan olahanbebek...,ABSTRACT\r\nSongkem duck is a duck preparation...,abstrak bebek songkem merupakan olahanbebek ya...,abstract songkem duck is a duck preparations a...,abstrak bebek songkem olahanbebek dimasak cara...,abstract songkem duck is a duck preparations a...,abstrak bebek songkem olahanbebek masak cara k...,abstract songkem duck is a duck preparations a...,abstract bebel songkem olahanbebek mask cara k...,abstract songkem duck is a duck preparations a...,Teknik Industri
267,ery ardianto,perancangan dan pengembangan pengering sistem ...,"mu'alim ST,.MT","sabarudin akhmad ST,.MT",ABSTRAK\r\nDalam rangka pemanfaatan cacing tan...,ABSTRACT\r\nIn order to use earthworms as an a...,abstrak dalam rangka pemanfaatan cacing tanah ...,abstract in order to use earthworms as an alte...,abstrak rangka pemanfaatan cacing tanah obat a...,abstract in order to use earthworms as an alte...,abstrak rangka manfaat cacing tanah obat alter...,abstract in order to use earthworms as an alte...,abstract bangka manat facing tana boat alterna...,abstract in order to use earthworms as an alte...,Teknik Industri
268,OGY AHMAD,Strategi Pengembangan Usaha Kapal Wisata AMIRA...,"Dr. Rachmad Hidayat, M.T.","Sabarudin Akhmad, S.T., M.T.",Strategi samudera biru (blue ocean strategy) s...,Blue ocean strategy (blue ocean strategy) as a...,strategi samudera biru blue ocean strategy seb...,blue ocean strategy blue ocean strategy as a s...,strategi samudera biru blue ocean strategy str...,blue ocean strategy blue ocean strategy as a s...,strategi samudera biru blue ocean strategy str...,blue ocean strategy blue ocean strategy as a s...,strategy samudera bird blue ocean strategy str...,blue ocean strategy blue ocean strategy as a s...,Teknik Industri


## Page & Link Keluar PTA

In [4]:
import urllib3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# matikan warning SSL insecure
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_all_links(base_url, max_pages=50):
    visited = set()
    results = []

    def scrape_page(url):
        try:
            response = requests.get(url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # ambil semua link keluar
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)
                results.append({
                    "Page": url,
                    "Link Keluar": full_link
                })

            # cari link internal untuk dilanjutkan
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)
                # hanya ambil link internal (masih di domain utama)
                if full_link.startswith(base_url) and full_link not in visited:
                    visited.add(full_link)
                    if len(visited) < max_pages:
                        scrape_page(full_link)

        except Exception as e:
            print(f"⚠️ Gagal akses {url}: {e}")

    # mulai dari base_url
    visited.add(base_url)
    scrape_page(base_url)

    # rapikan dataframe
    df = pd.DataFrame(results).reset_index(drop=True)
    df.index += 1
    df.insert(0, "No", df.index)
    return df

# contoh penggunaan
url = "https://informatika.trunojoyo.ac.id/"
df_links = scrape_all_links(url, max_pages=30)  # max_pages = batas biar ga infinite loop

df_links
# df_links.to_csv("semua_link.csv", index=False, encoding="utf-8-sig")

Unnamed: 0,No,Page,Link Keluar
1,1,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/
2,2,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
3,3,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
4,4,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
5,5,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
...,...,...,...
1704,1704,https://informatika.trunojoyo.ac.id/unit-kegia...,https://ukmfteecom.vercel.app/
1705,1705,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...
1706,1706,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...
1707,1707,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...


## Preprocessing Crawling Berita

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import string
import sys
import pandas as pd
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from collections import Counter

# --- KONFIGURASI PRA-PEMROSESAN ---
# Kamus untuk normalisasi kata slang/kontraksi bahasa Indonesia
contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "ngga": "tidak", "gk": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya", "lu": "kamu", "loe": "kamu",
    "dah": "sudah", "udah": "sudah", "aja": "saja", "ajah": "saja",
    "yg": "yang", "utk": "untuk", "dlm": "dalam", "dr": "dari", "dg": "dengan",
    "jd": "jadi", "krn": "karena", "tp": "tetapi", "tapi": "tetapi",
    "banget": "sekali", "bgt": "sekali", "lg": "lagi",
}

# Inisialisasi Stemmer dari Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Inisialisasi SpellChecker
spell = SpellChecker()

# Inisialisasi Stopword Remover
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

# --- FUNGSI-FUNGSI BANTUAN SCRAPING ---
def print_progress(kategori, current_page, total_pages):
    """Menampilkan progress bar di konsol."""
    percent = (current_page / total_pages) * 100 if total_pages > 0 else 0
    bar_length = 20
    filled_length = int(bar_length * current_page // total_pages) if total_pages > 0 else 0
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    sys.stdout.write(f'\r{kategori} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    sys.stdout.flush()
    if current_page == total_pages:
        sys.stdout.write('\n\n')

def get_session():
    """Membuat sesi permintaan dengan mekanisme percobaan ulang."""
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        backoff_factor=1
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    return session

def get_article_content_and_title(session, url):
    """Mengambil isi artikel dan judul dari URL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }
    try:
        r = session.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Mengambil judul artikel
        title = get_article_title(soup)

        # Mencoba beberapa selektor umum untuk konten artikel
        content_selectors = [
            "div.detail-konten",
            "div.news-detail__content",
            "div.itp_bodycontent",
            "div.content-text",
            "div.article-content",
            "div.text_area"
        ]

        paragraphs = []
        for selector in content_selectors:
            content_divs = soup.select(selector)
            if content_divs:
                for div in content_divs:
                    for p in div.find_all("p"):
                        text = p.get_text(strip=True)
                        if text and not text.lower().startswith("baca juga"):
                            paragraphs.append(text)
                if paragraphs:
                    break

        if not paragraphs:
            body_text = soup.find("article")
            if body_text:
                for p in body_text.find_all("p"):
                    text = p.get_text(strip=True)
                    if text and not text.lower().startswith("baca juga"):
                        paragraphs.append(text)

        content = " ".join(paragraphs)
        return title, content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return "Judul Tidak Ditemukan", ""

def get_article_title(soup):
    """Mengambil judul artikel dari berbagai kemungkinan lokasi."""
    title_tag = soup.find("h1", class_="detail-title")
    if title_tag:
        return title_tag.get_text(strip=True)

    title_tag = soup.find("h2", class_="media__title")
    if title_tag:
        return title_tag.get_text(strip=True)

    title_tag = soup.find("title")
    if title_tag:
        return title_tag.get_text(strip=True).replace(" - detiknews", "").replace(" - detikfinance", "")

    return "Judul Tidak Ditemukan"

def extract_id(url):
    """
    Ekstrak ID berita dari URL, mengatasi format yang berbeda.
    Mencari pola 'd-' diikuti 7 atau lebih digit angka yang merupakan ID.
    Jika tidak ditemukan, mencoba pola lain.
    """
    # Pola pertama dan paling andal: mencari pola "d-" diikuti ID.
    # Contoh: https://news.detik.com/.../d-7301297/...
    id_match_d = re.search(r"/d-(\d+)", url)
    if id_match_d:
        return id_match_d.group(1)

    # Pola kedua: ID di akhir URL sebelum ekstensi
    # Contoh: https://.../hukum-3456789
    id_match_end = re.search(r"-(\d+)$", url)
    if id_match_end:
        return id_match_end.group(1)

    # Pola ketiga: ID di tengah URL, jika ada
    # Contoh: https://.../artikel/12345.html
    id_match_middle = re.search(r"(\d+)\.html$", url)
    if id_match_middle:
        return id_match_middle.group(1)

    return None

# --- FUNGSI-FUNGSI PRA-PEMROSESAN TERPISAH ---

def clean_base_text(text):
    """
    Fungsi dasar untuk membersihkan teks dari simbol, angka, dll.
    """
    if not isinstance(text, str):
        return ''

    # Menangani kontraksi/slang bahasa Indonesia
    words = text.split()
    expanded_words = [contractions_dict.get(word.lower(), word) for word in words]
    text = ' '.join(expanded_words)

    # Menghilangkan angka dan tanda baca
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def tokenize_text(text):
    """
    # 5. Tokenisasi
    Proses: Tokenisasi. Memisahkan teks menjadi daftar kata (token).
    """
    return text.split()

def remove_stopwords(tokens):
    """
    # 1. Penghapusan kata umum (stopword)
    Proses: Penghapusan Kata Umum (Stopword).
    """
    text = ' '.join(tokens)
    text = stopword_remover.remove(text)
    return text.split()

def apply_stemming_and_lemmatization(tokens):
    """
    # 4. Stemming dan Lematisasi
    Proses: Stemming dan Lematisasi. Mengubah kata menjadi kata dasar.
    """
    text = ' '.join(tokens)
    stemmed_text = stemmer.stem(text)
    return stemmed_text.split()

def correct_spelling(tokens):
    """
    # 3. Cek ejaan pembakuan kata
    Proses: Cek Ejaan Pembakuan Kata. Memperbaiki ejaan kata.
    """
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in tokens]
    return corrected_words

# --- FUNGSI UTAMA SCRAPING ---

def berita(categories, pages_per_category=10):
    """Fungsi utama untuk melakukan crawling berita dan menyimpan hasilnya."""
    start_time = time.time()
    session = get_session()
    all_articles_data = []
    processed_links = set()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    base_urls = {
        "politik": "https://news.detik.com/indeks/berita/",
        "hukum": "https://news.detik.com/indeks/berita/",
        "ekonomi": "https://finance.detik.com/indeks/",
        "detikx": "https://news.detik.com/x/indeks/",
        "hiburan": "https://hot.detik.com/indeks/",
        "internasional": "https://news.detik.com/indeks/berita/",
        "sepakbola": "https://sport.detik.com/sepakbola/indeks/",
        "olahraga": "https://sport.detik.com/indeks/",
        "lingkungan": "https://www.detik.com/tag/lingkungan",
        "otomotif": "https://oto.detik.com/indeks"
    }

    categories = list(set(categories))

    for cat in categories:
        current_url = base_urls.get(cat.lower(), f"https://{cat.lower()}.detik.com/indeks/")
        print(f"--- Memulai crawling untuk kategori: {cat} ---")

        for page_count in range(1, pages_per_category + 1):
            url = f"{current_url}?page={page_count}"
            if cat.lower() == "lingkungan":
                url = f"https://www.detik.com/tag/lingkungan?page={page_count}"

            print_progress(cat, page_count, pages_per_category)

            try:
                r = session.get(url, headers=headers, timeout=15)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                article_links = soup.select("a.media__link")

                for a in article_links:
                    link = a["href"]
                    if link in processed_links:
                        continue
                    processed_links.add(link)

                    berita_id = extract_id(link)

                    title, content = get_article_content_and_title(session, link)

                    if content:
                        # --- Rangkaian Proses Pra-pemrosesan Teks ---
                        # Membersihkan teks dasar dan menghilangkan tanda baca (punctuation)
                        clean_text = clean_base_text(content)

                        # Memisahkan teks menjadi kata-kata (tokenisasi)
                        tokens = tokenize_text(clean_text)

                        # Menghapus kata-kata umum (stopwords) yang tidak memiliki makna
                        stopwords_removed = remove_stopwords(tokens)

                        # Mengubah kata menjadi bentuk dasarnya (stemming dan lematisasi)
                        stemmed_tokens = apply_stemming_and_lemmatization(stopwords_removed)

                        # Memperbaiki ejaan kata
                        corrected_tokens = correct_spelling(stemmed_tokens)

                        # Menghitung frekuensi kata dari token hasil stemming
                        word_frequency_indonesia = Counter(stemmed_tokens)

                        print(f"\n--- Data {len(all_articles_data) + 1} ---")
                        print(f"ID Berita: {berita_id}")
                        print(f"Judul: {title}")
                        print(f"Abstrak (Raw): {content}")
                        print(f"Abstrak (Clean): {clean_text}")
                        print(f"Abstrak (Stopwords Removed): {' '.join(stopwords_removed)}")
                        print(f"Abstrak (Stemmed): {' '.join(stemmed_tokens)}")
                        print(f"Abstrak (Corrected): {' '.join(corrected_tokens)}")

                        print("\n--- Tokenisasi (Perhitungan Jumlah Kata) ---")
                        print(f"Frekuensi Kata (Indonesia): {word_frequency_indonesia}")
                        print("-------------------------------------------\n")

                        all_articles_data.append({
                            "id_berita": berita_id,
                            "judul_berita": title,
                            "isi_berita_original": content,
                            "isi_berita_diproses": ' '.join(corrected_tokens),
                            "kategori_berita": cat
                        })
                    time.sleep(random.uniform(1, 3))
            except requests.exceptions.RequestException as e:
                print(f"\n❌ Gagal mengakses {url}: {e}", file=sys.stderr)
                break

    df = pd.DataFrame(all_articles_data)
    df.to_csv("detik_berita_10_pages.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    print("\nBerikut adalah 5 entri pertama yang berhasil dikumpulkan:")
    print(df.head())

    return df

if __name__ == '__main__':
    categories = ["politik", "hukum", "ekonomi", "lingkungan", "hiburan", "internasional", "otomotif", "olahraga", "sepakbola"]
    berita(categories, pages_per_category=5)

--- Memulai crawling untuk kategori: internasional ---
internasional - Page 1/5 [████----------------] 20.00%
--- Data 1 ---
ID Berita: 8116996
Judul: Kemensos Santuni Korban Unjuk Rasa Tigaraksa, Komitmen Dampingi Keluarga
Abstrak (Raw): Wakil Menteri Sosial (Wamensos), Agus Jabo Priyono menyerahkan santunan senilai Rp 15 juta kepada keluarga mendiang Andika Lutfi Falah (16), korban unjuk rasa di Tigaraksa, Kabupaten Tangerang, Banten. Andika merupakan anak kedua dari pasangan Abdul Gofur, penjual kopi keliling, dan Sofia, ibu rumah tangga. Ia wafat setelah mendapat perawatan intensif di RS Dr. Mintohardjo akibat luka benda tumpul yang dialaminya saat mengikuti aksi. Selain santunan, Kemensos juga menyalurkan bantuan sembako senilai Rp 500.000 berupa 10 kg beras, 2 liter minyak goreng, 2 kaleng sarden besar, 2 krat telur, 2 kaleng kornet, 1 kg gula pasir, 1 kotak teh, 1 bungkus kopi, 1 bungkus kecap, dan 10 bungkus mie instan. SCROLL TO CONTINUE WITH CONTENT Dalam kesempatan tersebut,