<a href="https://colab.research.google.com/github/arafiramadhanmaulana/tubesmlopskelompok3rb/blob/main/kode_mlops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U bitsandbytes accelerate transformers pymupdf

import os
import fitz
import re
import json
import pandas as pd
from datetime import datetime
from google.colab import drive
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Mount Google Drive so the PDF folder and the output CSV are reachable
# under /content/drive.
drive.mount('/content/drive')

# Drive folder that holds the practicum-module PDFs to be indexed.
FOLDER_PATH = '/content/drive/MyDrive/modul_praktikum_sains_data'

# Hugging Face model id of the instruct model used for metadata extraction.
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Load the weights in 4-bit NF4 with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Sedang memuat AI {model_id}...")
try:
    # NOTE(review): trust_remote_code=True permits running Python code shipped
    # in the model repository; presumably not required for this model —
    # confirm before removing.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
except Exception:
    # Tell the user the likely cause, then re-raise the active exception
    # unchanged so the original traceback is preserved.
    print("❌ Error memuat model. Pastikan Anda menggunakan Runtime T4 GPU!")
    raise

# Full course name -> short code stored in the `layer_1_code` column.
# Insertion order is kept because the same order is shown to the model.
ABBREVIATION_MAP = {
    "Data Mining": "DM",
    "Pemodelan Stokastik": "PS",
    "Pergudangan Data": "PD",
    "Komputasi Paralel": "KP",
    "Analisis Data Statistik": "ADS",
    "Teknologi Basis Data": "TBD",
    "Basis Data": "BD",
    "Algoritma Pemrograman": "AP",
    "Deep Learning": "DL",
    "Machine Learning": "ML",
}

# Course names offered to the model as the only valid choices.
DAFTAR_MATKUL_AI = list(ABBREVIATION_MAP)

def get_pdf_content(file_path, max_pages=2, max_chars=2500):
    """Return whitespace-normalized text from the first pages of a PDF.

    Args:
        file_path: Path of the PDF file to read.
        max_pages: Number of leading pages whose text is extracted.
        max_chars: Maximum length of the returned string.

    Returns:
        The concatenated page text with every run of whitespace collapsed to
        a single space, stripped, and truncated to ``max_chars`` characters.
        An empty string is returned when the file cannot be opened or read.
    """
    try:
        # The context manager closes the document even when extracting a
        # page raises, so no file handle is leaked on failure.
        with fitz.open(file_path) as doc:
            page_count = min(max_pages, len(doc))
            text = "".join(doc[i].get_text() for i in range(page_count))
    except Exception as exc:
        # Best effort: report why the file was unreadable and let the caller
        # skip it, instead of silently swallowing every error.
        print(f"[WARN] Gagal membaca PDF: {file_path} ({exc})")
        return ""
    return re.sub(r'\s+', ' ', text).strip()[:max_chars]

def ask_ai(text_content):
    """Ask the loaded chat model to extract module metadata as JSON text.

    Builds a system + user prompt around ``text_content``, renders it with
    the tokenizer's chat template, generates up to 450 new tokens, and
    decodes only the newly generated tokens.

    Args:
        text_content: Text taken from the practicum module PDF.

    Returns:
        The model's reply as a string (expected, but not guaranteed, to
        contain a JSON object), without the echoed prompt.
    """

    system_prompt = "Kamu adalah asisten akademik ahli Sains Data. Tugasmu mengekstrak metadata modul praktikum."

    user_prompt = f"""
    Analisis teks modul praktikum berikut:
    ---
    {text_content}
    ---

    Ekstrak informasi berikut dalam format JSON Valid:
    1. "mata_kuliah_full": Pilih SATU yang paling relevan dari daftar ini: {DAFTAR_MATKUL_AI}.
    2. "judul_topik": Judul spesifik modul ini. (HAPUS kata 'Modul', 'Praktikum', 'Bab', 'Percobaan', atau Angka urutan di awal. Ambil inti judul teknisnya saja).
    3. "deskripsi": Buat ringkasan DETIL (3-4 kalimat). Jelaskan TUJUAN praktikum, METODE/ALGORITMA yang digunakan, dan TOOLS/LIBRARY yang dipakai (jika ada). Gunakan Bahasa Indonesia formal.

    Jawab HANYA JSON.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Send the inputs to the device the model was actually placed on rather
    # than assuming "cuda" (the model is loaded with device_map="auto").
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Unpacking the encoding passes attention_mask along with input_ids,
    # which avoids the "attention mask is not set" warning.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=450,
        temperature=0.1,
        do_sample=True,
    )

    # The generated sequence starts with the prompt tokens; drop them by
    # position instead of string-splitting the decoded text on the prompt.
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = generated_ids[:, prompt_length:]

    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

def parse_json_response(response):
    """Extract the first valid JSON object embedded in the model's reply.

    The reply may wrap the JSON in prose or code fences, so every ``{`` is
    tried as a possible start of an object until one decodes successfully.

    Args:
        response: Raw text returned by the model.

    Returns:
        The decoded object as a ``dict``, or ``None`` when ``response`` is
        not a string or contains no decodable JSON object.
    """
    if not isinstance(response, str):
        return None

    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so
            # text or stray braces after the object do not break parsing.
            parsed, _end = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = response.find("{", start + 1)
    return None

# Index every PDF under FOLDER_PATH: extract its leading text, ask the model
# for metadata, and write one CSV row per file to Google Drive.
if not os.path.isdir(FOLDER_PATH):
    print(f"❌ Path Salah: {FOLDER_PATH}")
else:
    # Walk recursively: the recorded run output lists PDFs inside per-course
    # subfolders, which a flat os.listdir() would not find. Sorting keeps the
    # processing order (and the CSV row order) deterministic.
    files = []
    for root, _dirs, names in os.walk(FOLDER_PATH):
        files.extend(
            os.path.join(root, name)
            for name in names
            if name.lower().endswith(".pdf")
        )
    files.sort()

    # Case-insensitive view of the course map, so a reply such as
    # "data mining" still resolves to its code.
    abbr_lookup = {
        name.casefold(): code for name, code in ABBREVIATION_MAP.items()
    }

    columns = [
        'file_name',
        'layer_1_code',
        'layer_2_topic',
        'description',
        'file_path',
        'created_at',
        'updated_at',
    ]
    results = []

    print(f"\n🚀 Memproses {len(files)} file PDF dengan AI...\n")

    for f_path in files:
        filename = os.path.basename(f_path)

        raw_text = get_pdf_content(f_path)
        if not raw_text:
            # Make skipped files visible instead of dropping them silently.
            print(f"⚠️ Teks kosong atau gagal dibaca, dilewati: {filename}")
            continue

        ai_raw = ask_ai(raw_text)
        data = parse_json_response(ai_raw)

        if data:
            full_mk = data.get("mata_kuliah_full", "Lainnya")

            # str() guards against an unhashable value (e.g. a list) that
            # would otherwise raise TypeError in the dict lookup.
            l1_abbr = abbr_lookup.get(str(full_mk).strip().casefold(), "OTH")

            l2_topic = data.get("judul_topik", "Topik Umum")
            desc = data.get("deskripsi", "Tidak ada deskripsi.")
            # Report success only when the reply was actually parsed.
            print(f"[✅ OK] {filename}")
        else:
            print(f"⚠️ Gagal parsing JSON: {filename}")
            l1_abbr, l2_topic, desc = "ERR", "Manual Check", "Error AI Response"

        print(f"   L1 (Code) : {l1_abbr}")
        print(f"   L2 (Topik): {l2_topic}")
        print("-" * 40)

        # Take one timestamp so created_at and updated_at cannot differ.
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        results.append({
            'file_name': filename,
            'layer_1_code': l1_abbr,
            'layer_2_topic': l2_topic,
            'description': desc,
            'file_path': f_path,
            'created_at': timestamp,
            'updated_at': timestamp,
        })

    # Explicit columns keep the CSV header present even with zero rows.
    df = pd.DataFrame(results, columns=columns)
    output_csv = '/content/drive/MyDrive/indexing_final_db.csv'
    df.to_csv(output_csv, index=False)
    print(f"\n💾 Selesai! Data tersimpan di: {output_csv}")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Sedang memuat AI Qwen/Qwen2.5-1.5B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

üöÄ Memulai indexing dari: /content/drive/MyDrive/modul_praktikum_sains_data

üìÇ Folder: Pergudangan Data | üìÑ File: MODUL PRAKTIKUM 2 Pergudangan Data.pdf


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


   ‚úÖ Topik: Desain Konseptual Pergudangan Data
--------------------------------------------------
üìÇ Folder: Pergudangan Data | üìÑ File: MODUL PRAKTIKUM 3 Pergudangan Data.pdf
   ‚úÖ Topik: Desain Logikal Gudang Data
--------------------------------------------------
üìÇ Folder: Pergudangan Data | üìÑ File: 1. MODUL PRATIKUM 4 - PHYSICAL DESIGN.pdf
   ‚úÖ Topik: Design Fisik Pergudangan Data
--------------------------------------------------
üìÇ Folder: Pergudangan Data | üìÑ File: dok-misi-dw.pdf
   ‚úÖ Topik: Misi 2: Desain Fisikal dan Development
--------------------------------------------------
üìÇ Folder: Pemodelan Stokastik | üìÑ File: modul 2-Pemodelan-Stokastik.pdf
   ‚úÖ Topik: Rantai Markov
--------------------------------------------------
üìÇ Folder: Pemodelan Stokastik | üìÑ File: Modul 1-Pemodelan-Stokastik.pdf
   ‚úÖ Topik: Proses Stokastik
--------------------------------------------------
üìÇ Folder: Pemodelan Stokastik | üìÑ File: Modul 4 - Proses Poi

In [None]:
import pandas as pd

# Load a saved index CSV from Google Drive and display it as the cell output.
# NOTE(review): this file name differs from the `indexing_final_db.csv` path
# written by the indexing cell in this notebook — confirm which file is intended.
df = pd.read_csv("/content/drive/MyDrive/indexing_matkul_final.csv")
df

Unnamed: 0,layer_1_code,subject_name,layer_2_topic,description,file_name,file_path,created_at
0,PD,Pergudangan Data,Desain Konseptual Pergudangan Data,Modul ini membahas tahap desain konseptual dal...,MODUL PRAKTIKUM 2 Pergudangan Data.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:36:26
1,PD,Pergudangan Data,Desain Logikal Gudang Data,Modul ini membahas tahapan desain logis dalam ...,MODUL PRAKTIKUM 3 Pergudangan Data.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:36:37
2,PD,Pergudangan Data,Design Fisik Pergudangan Data,Modul praktikum ini mengajarkan mahasiswa tent...,1. MODUL PRATIKUM 4 - PHYSICAL DESIGN.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:36:47
3,PD,Pergudangan Data,Misi 2: Desain Fisikal dan Development,Misi ini melibatkan desain fisik database dan ...,dok-misi-dw.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:37:01
4,PS,Pemodelan Stokastik,Rantai Markov,Modul ini membahas model matematika yang melib...,modul 2-Pemodelan-Stokastik.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:37:08
...,...,...,...,...,...,...,...
73,TBD,Teknologi Basis Data,Document Database,Modul ini membahas konsep dan aplikasi dari da...,Modul 9 Praktikum - Document Database v2 (2) (...,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:51:05
74,TBD,Teknologi Basis Data,Transaksi Basis Data,Modul ini membahas konsep dan urgensi transaks...,Modul 4 Praktikum.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:51:14
75,TBD,Teknologi Basis Data,Peta Animasi Spasio-Temporal dengan Plugin Tim...,"Dalam modul praktikum ini, mahasiswa akan bela...",Modul 6 Praktikum - Spatial Temporal Database ...,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:51:29
76,TBD,Teknologi Basis Data,Backup and Recovery in MySQL,"Dalam modul ini, Anda belajar tentang konsep b...",Modul 7 Praktikum.pdf,/content/drive/MyDrive/modul_praktikum_sains_d...,2025-11-29 13:51:39
