In [1]:
# ===================================================================
# LANGKAH 1: INSTALASI LIBRARY
# Menjalankan ini akan menginstal EasyOCR dan library lain yang diperlukan.
# ===================================================================
!pip install easyocr

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->easyocr)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->easyocr)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->easyocr)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->easyocr)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch->easyocr)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch->easyocr)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.1

In [2]:
# ===================================================================
# LANGKAH 2: IMPORT LIBRARY
# ===================================================================
import os
import pandas as pd
import easyocr
from PIL import Image
import torch
import warnings
# Mengabaikan peringatan yang tidak relevan dari PIL
warnings.filterwarnings("ignore", category=UserWarning, module="PIL")

In [3]:
def process_directory_easyocr(directory, label):
    """
    Memproses semua gambar dalam sebuah direktori menggunakan EasyOCR,
    mengekstrak teks, dan mengembalikan DataFrame.
    """
    file_names = []
    ocr_texts = []
    classes = []

    # --- Persiapan: Hitung total file untuk menunjukkan progres ---
    image_files_to_process = []
    print(f"Mencari file gambar di: {directory}")
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                image_files_to_process.append(os.path.join(root, filename))
    
    total_files = len(image_files_to_process)
    if total_files == 0:
        print("-> Tidak ada file gambar yang ditemukan.")
        return pd.DataFrame({'File Name': [], 'Extracted Text': [], 'Class': []})
    
    print(f"-> Total gambar yang akan diproses: {total_files}")

    # --- Inisialisasi EasyOCR Reader ---
    # Inisialisasi ini bisa memakan waktu beberapa saat saat pertama kali dijalankan
    # karena perlu mengunduh model bahasa.
    # Kita akan mendeteksi bahasa Indonesia ('id') dan Inggris ('en').
    use_gpu = torch.cuda.is_available()
    print(f"-> Menggunakan device: {'GPU' if use_gpu else 'CPU'}")
    reader = easyocr.Reader(['id', 'en'], gpu=use_gpu)

    # --- Proses Setiap File Gambar ---
    for i, file_path in enumerate(image_files_to_process):
        filename = os.path.basename(file_path)
        
        try:
            # Tampilkan progres saat ini
            print(f"Memproses file {i + 1}/{total_files}: {filename}")
            
            file_names.append(filename)

            # EasyOCR dapat menerima path file secara langsung
            ocr_result = reader.readtext(file_path, detail=1, paragraph=False)
            
            # Cek apakah EasyOCR menemukan teks
            if ocr_result:
                # Gabungkan semua potongan teks yang ditemukan menjadi satu baris,
                # dipisahkan oleh spasi.
                extracted_text = ' '.join([res[1] for res in ocr_result])
                ocr_texts.append(extracted_text.strip())
            else:
                # Jika tidak ada teks yang terdeteksi, tambahkan string kosong
                ocr_texts.append("")

            # Tambahkan label kelas
            classes.append(label)
            
        except Exception as e:
            print(f"!! ERROR saat memproses {filename}: {e}")
            # Jika terjadi error, catat pesan error agar mudah dilacak
            ocr_texts.append(f"ERROR: {e}")
            classes.append(label)
            continue

    return pd.DataFrame({'File Name': file_names, 'Extracted Text': ocr_texts, 'Class': classes})

In [4]:
# ===================================================================
# LANGKAH 4: EKSEKUSI UTAMA
# ===================================================================

# Tentukan path ke direktori gambar Anda
judi_dir = '/kaggle/input/gamblingdet-id/situsjudiid-full/judi'
normal_dir = '/kaggle/input/gamblingdet-id/situsjudiid-full/non-judi'

# Proses kedua direktori menggunakan fungsi baru
print("\n--- Memproses Direktori Judi ---")
judi_df = process_directory_easyocr(judi_dir, 'judi')

print("\n--- Memproses Direktori Non-Judi ---")
normal_df = process_directory_easyocr(normal_dir, 'non-judi')

# Gabungkan kedua DataFrame menjadi satu
final_df = pd.concat([judi_df, normal_df], ignore_index=True)

# Tampilkan beberapa baris pertama dan terakhir dari hasil gabungan
print("\n--- Hasil Gabungan (Contoh) ---")
if not final_df.empty:
    print(final_df.head().to_string())
    print("...")
    print(final_df.tail().to_string())
else:
    print("DataFrame akhir kosong.")

print(f"\nTotal baris data yang dihasilkan: {len(final_df)}")

# Simpan DataFrame akhir ke file CSV
output_path = '/kaggle/working/text_ocr_easyocr.csv'
final_df.to_csv(output_path, index=False, encoding='utf-8')

print(f"\nProses selesai. Hasil telah disimpan ke: {output_path}")



--- Memproses Direktori Judi ---
Mencari file gambar di: /kaggle/input/gamblingdet-id/situsjudiid-full/judi
-> Total gambar yang akan diproses: 1950
-> Menggunakan device: GPU
Memproses file 1/1950: ahoyamigo.com_home.png
Memproses file 2/1950: asgard789.bet.png
Memproses file 3/1950: bola442.monster.png
Memproses file 4/1950: maniac-ihokibet.com_page1.png
Memproses file 5/1950: ketohour.com_page4.png
Memproses file 6/1950: agenclassicgames.webflow.io.png
Memproses file 7/1950: bolakoh.com.png
Memproses file 8/1950: manjabet1.life_page4.png
Memproses file 9/1950: aksessingawin.us_home.png
Memproses file 10/1950: air-pegunungan.pro_page5.png
Memproses file 11/1950: aguaonline.net_home.png
Memproses file 12/1950: nikeshopjapan.com.png
Memproses file 13/1950: boladoki.com.png
Memproses file 14/1950: angkosimba4d.com_home.png
Memproses file 15/1950: bolaidp88.site.png
Memproses file 16/1950: judetabet89.lol_home.png
Memproses file 17/1950: lulu88casino.com_page5.png
Memproses file 18/1950

In [5]:
final_df.head()

Unnamed: 0,File Name,Extracted Text,Class
0,ahoyamigo.com_home.png,KASKUSTOTO LOGIN KASKUSTOTO oiocom KASKUSTOTO ...,judi
1,asgard789.bet.png,SUCKBET kunusn Tusluju Unajv ]acipısı aüns I...,judi
2,bola442.monster.png,AsF User Name Password Code 6636 LOGIN Lela442...,judi
3,maniac-ihokibet.com_page1.png,Download APK Live Chat Hubungı Kamı Lupa Kata ...,judi
4,ketohour.com_page4.png,ketohour Beranda 7 @ Q Subscribe Beranda Situs...,judi
