In [5]:
import joblib
import requests
from io import BytesIO

def load_joblib_from_github(url, timeout=30):
    """Download a joblib-serialized object from a URL and deserialize it.

    Args:
        url: Direct ("raw") download URL of the serialized file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        - The deserialized object on success.
        - The string ``"ERROR_LFS"`` when the response body is a Git LFS
          pointer file instead of the real artifact.
        - ``None`` when the download or deserialization fails for any other
          reason (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Fail early on 4xx/5xx responses

        content = response.content

        # Detect a Git LFS pointer: for LFS-tracked files the raw URL may serve
        # a small text stub instead of the real binary, and unpickling that
        # stub would only produce a confusing error.
        if content.startswith(b"version https://git-lfs.github.com"):
            return "ERROR_LFS"

        # SECURITY: joblib.load unpickles the payload, which can execute
        # arbitrary code. Only call this with URLs whose content you trust.
        return joblib.load(BytesIO(content))

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # decide what to do with a missing component.
        print(f"Gagal memuat file dari: {url}\nError: {e}")
        return None

# ===== URLs (make sure these are "raw" download URLs) =====
base_url = "https://raw.githubusercontent.com/asepsr37/data-and-machine-learning/main/sentiment-analysis/lets-get-rich/model-sentiment/"

url_model = base_url + "model_sentiment_lgr.pkl"
url_tfidf = base_url + "tfidf_vectorizer_lgr.pkl"
url_le    = base_url + "label_encoder_lgr.pkl"

# ===== LOAD =====
model = load_joblib_from_github(url_model)
tfidf = load_joblib_from_github(url_tfidf)
label_encoder = load_joblib_from_github(url_le)

# Inspect every component, not just the model: any of the three downloads can
# come back as the "ERROR_LFS" sentinel or as None, and success must only be
# reported when all of them are real objects.
_components = {"model": model, "tfidf": tfidf, "label_encoder": label_encoder}
_lfs_components = [
    name for name, obj in _components.items()
    if isinstance(obj, str) and obj == "ERROR_LFS"
]
_failed_components = [name for name, obj in _components.items() if obj is None]

if _lfs_components:
    print("⚠️ WARNING: File di GitHub terdeteksi sebagai Git LFS pointer.")
    print("GitHub Raw tidak bisa mendownload file LFS yang besar secara langsung.")
    print("Solusi: Upload file ke Google Drive atau hosting lain yang mendukung direct download.")
    print(f"Komponen terdampak: {', '.join(_lfs_components)}")

if _failed_components:
    print(f"❌ Gagal: Komponen berikut tidak berhasil dimuat: {', '.join(_failed_components)}")

if not _lfs_components and not _failed_components:
    print("✅ Berhasil: Semua komponen siap digunakan!")

✅ Berhasil: Semua komponen siap digunakan!


Alur Prediksi Data Baru
- Input: Teks mentah dari user.

1. tfidf.pkl (The Translator): Teks yang sudah dibersihkan diubah menjadi deretan angka (vektor TF-IDF). Tanpa ini, model "buta" karena ia hanya bisa memproses angka, bukan kata-kata.

2. model.pkl (The Brain): Deretan angka tadi diproses oleh model Logistic Regression. Keluarannya adalah indeks kelas (misal: 0).

3. le.pkl (The Spokesperson): Angka 0 tadi diterjemahkan kembali menjadi kata-kata yang kita mengerti (misal: "negative").

In [6]:
import re
import string

# 1. Cleaning function (must stay identical to the one used at training time)
def cleaningtext(text):
    """Normalize a raw review string before it is vectorized.

    The removal steps run in a fixed order: mentions, hashtags, the "RT"
    marker, links, digits, then any remaining non-word/non-space characters.
    Newlines then become spaces, ASCII punctuation (including the underscore,
    which the ``\\w`` class lets through) is dropped, and the result is
    stripped and lower-cased.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Order matters: each pattern is applied to the output of the previous one.
    removal_patterns = (
        r"@[A-Za-z0-9]+",   # mentions
        r"#[A-Za-z0-9]+",   # hashtags
        r"RT[\s]",          # retweet marker
        r"http\S+",         # links
        r"[0-9]+",          # digits
        r"[^\w\s]",         # symbols / emoji
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    # Flatten line breaks, then strip whatever ASCII punctuation is left.
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = cleaned.replace("\n", " ").translate(punctuation_table)

    # Case folding
    return cleaned.strip().lower()

# 2. Main prediction function
def predict_lgr_sentiment(user_review):
    """Predict the sentiment label of a single raw review.

    Relies on the module-level ``model``, ``tfidf`` and ``label_encoder``
    objects and on ``cleaningtext``.

    Args:
        user_review: Raw review text.

    Returns:
        On success, a tuple ``(label, prob)``: the decoded label and the
        largest value returned by ``model.predict_proba`` for this input.
        When a component is not loaded, the single string
        ``"Gagal: Model belum dimuat sempurna."`` is returned instead, so
        callers should check the result type before unpacking it.
    """
    # A component counts as "not loaded" when it is None or a string sentinel
    # such as "ERROR_LFS" from the loader. Without the string check, a
    # sentinel would slip through and crash on .transform() / .predict().
    components = (model, tfidf, label_encoder)
    if any(c is None or isinstance(c, str) for c in components):
        return "Gagal: Model belum dimuat sempurna."

    # A. Preprocessing
    clean_text = cleaningtext(user_review)

    # B. TF-IDF transformation
    # The text is wrapped in a list because the vectorizer expects an
    # iterable of documents, not a bare string.
    vectorized_text = tfidf.transform([clean_text])

    # C. Predict the class index
    prediction_idx = model.predict(vectorized_text)

    # D. Decode the index back into a text label
    label = label_encoder.inverse_transform(prediction_idx)[0]

    # E. (Optional) Take the highest class probability as a confidence score
    prob = model.predict_proba(vectorized_text).max()

    return label, prob

In [10]:
# ===== TRY THE SYSTEM =====
print("-" * 30)
input_user = "This Games is so bad, i dont want play again"
result = predict_lgr_sentiment(input_user)

print(f"Ulasan: {input_user}")
if isinstance(result, str):
    # predict_lgr_sentiment returns a single failure message (not a tuple)
    # when the components are not loaded; unpacking it into two names would
    # raise ValueError, so show the message instead.
    print(result)
else:
    hasil, skor = result
    print(f"Hasil Analisis: {hasil.upper()}")
    print(f"Tingkat Keyakinan Model: {skor*100:.2f}%")
print("-" * 30)

------------------------------
Ulasan: This Games is so bad, i dont want play again
Hasil Analisis: NEGATIVE
Tingkat Keyakinan Model: 82.41%
------------------------------
