# **1. Import Library**

In [1]:
import pandas as pd
import numpy as np
import joblib
import pickle
import re
import warnings
warnings.filterwarnings('ignore')

# Preprocessing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Deep Learning
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

# XGBoost
from xgboost import XGBClassifier


# **2. Load Models & Komponen**

In [2]:
# Load every artifact the prediction functions rely on. All paths are relative
# to the notebook's working directory.
#
# NOTE(security): joblib.load and pickle.load can execute arbitrary code while
# deserializing. Only load artifacts from a trusted source.
try:
    # Model 1: Random Forest (the TF-IDF vectorizer is also used by predict_xgb)
    vectorizer = joblib.load('saved_models/tfidf_vectorizer.pkl')
    rf_model = joblib.load('saved_models/random_forest_model.pkl')

    # Model 2: LSTM, with the Word2Vec vocabulary, label encoder and config it needs
    w2v_model = Word2Vec.load('saved_models/word2vec_model.bin')
    lstm_model = load_model('saved_models/lstm_model.h5')
    label_encoder = joblib.load('saved_models/label_encoder.pkl')
    with open('saved_models/lstm_config.pkl', 'rb') as f:
        lstm_config = pickle.load(f)

    # Model 3: XGBoost, with its own label encoder
    xgb_model = XGBClassifier()
    xgb_model.load_model('saved_models/xgboost_model.json')
    le_xgb = joblib.load('saved_models/label_encoder_xgb.pkl')

except Exception as e:
    print(f"❌ Error loading models: {e}")
    # Re-raise instead of calling exit(1): exit() is an interactive-shell helper
    # and is unreliable inside a notebook kernel. Re-raising stops "Run All"
    # here with a full traceback, and still ends a plain script with a
    # non-zero exit status.
    raise

print()






# **3. Preprocessing Functions**

In [3]:
# Compiled once when the cell runs instead of on every call.
# NOTE(review): the range U+24C2..U+1F251 is very wide and also covers many
# non-emoji blocks (e.g. CJK ideographs); kept as-is to preserve behaviour.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U0001FA70-\U0001FAFF"
    "\U00002600-\U000026FF"
    "\U00002700-\U000027BF"
    "]+", flags=re.UNICODE
)

# Text emoticons: eyes [:;=], optional nose [oO-], then a mouth character.
# The mouth class lists the uppercase letters D, O and P, so the pattern must
# be applied to text that has NOT been lowercased yet.
_EMOTICON_PATTERN = re.compile(r'[:;=][oO\-]?[D\)\]\(\]/\\OpP]')


def preprocess_text(text):
    """Normalize raw review text into lowercase, space-separated words.

    Steps, in order: strip text emoticons, lowercase, strip emoji, URLs,
    @mentions, #hashtags, digits and punctuation, then collapse whitespace.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).

    Note:
        Emoticons are removed *before* lowercasing. Previously the text was
        lowercased first, so emoticons such as ``:D``, ``=D`` or ``:-O`` never
        matched and left stray letters behind ("bagus :D" -> "bagus d").
        NOTE(review): if the training pipeline used the old order, mirror this
        fix there so training and inference preprocessing stay identical.
    """
    # Remove emoticons while the original letter case is still available.
    text = _EMOTICON_PATTERN.sub('', str(text))
    text = text.lower()

    # Remove emoji
    text = _EMOJI_PATTERN.sub(r'', text)
    text = re.sub(r'http\S+|www\S+', '', text)   # URLs
    text = re.sub(r'@\w+', '', text)             # @mentions
    text = re.sub(r'#\w+', '', text)             # #hashtags
    text = re.sub(r'\d+', '', text)              # digits
    text = re.sub(r'[^\w\s]', '', text)          # punctuation (underscore is \w, so it stays)
    text = re.sub(r'\s+', ' ', text).strip()     # collapse runs of whitespace
    return text

# Build the Sastrawi helpers once; full_preprocess reuses them on every call.
stemmer = StemmerFactory().create_stemmer()
stopword_remover = StopWordRemoverFactory().create_stop_word_remover()


def full_preprocess(text):
    """Run the complete text pipeline used before every model.

    The text is cleaned with ``preprocess_text``, passed through the Sastrawi
    stop-word remover and finally through the Sastrawi stemmer.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, stop-word-filtered and stemmed string.
    """
    cleaned = preprocess_text(text)
    without_stopwords = stopword_remover.remove(cleaned)
    return stemmer.stem(without_stopwords)

def text_to_sequence(text, w2v_model, max_length):
    """Turn whitespace-separated text into one padded row of vocabulary indices.

    Args:
        text: Already-preprocessed text; it is split on whitespace.
        w2v_model: Object exposing ``.wv`` with membership testing and a
            ``key_to_index`` mapping (a gensim Word2Vec model in this notebook).
        max_length: Target length handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Whatever ``pad_sequences`` returns for a single sequence, padded with
        ``padding='post'``.
    """
    keyed_vectors = w2v_model.wv
    # Words missing from the vocabulary are dropped rather than mapped to a
    # placeholder index.
    # NOTE(review): index 0 is a real vocabulary entry and presumably also the
    # padding value - confirm this matches how the LSTM was trained.
    indices = [
        keyed_vectors.key_to_index[token]
        for token in text.split()
        if token in keyed_vectors
    ]
    return pad_sequences([indices], maxlen=max_length, padding='post')



# **4. Fungsi Prediksi**

In [4]:
def predict_rf(text):
    """Predict with the Random Forest model.

    Args:
        text: Raw input text; it is run through ``full_preprocess`` and the
            TF-IDF ``vectorizer`` before prediction.

    Returns:
        Tuple ``(prediction, proba)``: the first element of
        ``rf_model.predict`` and the first row of ``rf_model.predict_proba``.
    """
    features = vectorizer.transform([full_preprocess(text)])
    label = rf_model.predict(features)[0]
    class_probabilities = rf_model.predict_proba(features)[0]
    return label, class_probabilities

def predict_lstm(text):
    """Predict with the LSTM model.

    Args:
        text: Raw input text; it is run through ``full_preprocess`` and
            converted to a padded index sequence of length
            ``lstm_config['max_length']``.

    Returns:
        Tuple ``(prediction, proba)``: the ``label_encoder`` class at the
        arg-max position of the model output, and the first row of that output.
    """
    padded = text_to_sequence(
        full_preprocess(text), w2v_model, lstm_config['max_length']
    )
    class_probabilities = lstm_model.predict(padded, verbose=0)[0]
    # The highest-scoring position is looked up in the label encoder's classes.
    best_index = np.argmax(class_probabilities)
    return label_encoder.classes_[best_index], class_probabilities

def predict_xgb(text):
    """Predict with the XGBoost model.

    Args:
        text: Raw input text; it is run through ``full_preprocess`` and the
            TF-IDF ``vectorizer`` before prediction.

    Returns:
        Tuple ``(prediction, proba)``: the model's encoded prediction mapped
        back through ``le_xgb.inverse_transform``, and the first row of
        ``xgb_model.predict_proba``.
    """
    features = vectorizer.transform([full_preprocess(text)])
    encoded_label = xgb_model.predict(features)[0]
    # The model emits an encoded class; le_xgb maps it back to the label.
    label = le_xgb.inverse_transform([encoded_label])[0]
    class_probabilities = xgb_model.predict_proba(features)[0]
    return label, class_probabilities

def _print_model_result(model_name, prediction, proba):
    """Print one model's prediction, top confidence and class probabilities."""
    confidence = max(proba)
    print(f"\n{model_name}:")
    print(f"Prediction: {prediction}")
    print(f"Confidence: {confidence:.4f} ({confidence*100:.2f}%)")
    # Labels are positional - assumes every model orders its classes as
    # negatif, netral, positif. TODO confirm against the fitted encoders.
    print(f"Probabilities: negatif={proba[0]:.4f}, netral={proba[1]:.4f}, positif={proba[2]:.4f}")


def _majority_vote(votes):
    """Return the winning label from ``(label, confidence)`` pairs.

    The label with the most votes wins. Ties are broken by the highest single
    confidence among the tied labels, then by first appearance in ``votes``.
    """
    tally = {}
    for label, confidence in votes:
        count, best_confidence = tally.get(label, (0, float("-inf")))
        tally[label] = (count + 1, max(best_confidence, confidence))
    # Tuples compare by vote count first, then by best confidence. max() keeps
    # the first maximal key, and dicts preserve insertion order.
    return max(tally, key=lambda label: tally[label])


def predict_all(text):
    """Predict with all three models, print a report and return a summary.

    Args:
        text: Raw input text.

    Returns:
        Dict with keys ``'rf'``, ``'lstm'`` and ``'xgb'`` (each a dict with
        ``'prediction'`` and ``'confidence'``, the maximum class probability)
        and ``'voting'``, the majority label.

    Note:
        The vote used to be ``max(set(predictions), key=predictions.count)``.
        When all three models disagreed, the winner depended on set iteration
        order, which changes between interpreter runs for strings. The tie is
        now resolved deterministically in favour of the most confident model.
    """
    print(f"\nInput Text: '{text}'")
    print(f"Clean Text: '{full_preprocess(text)}'")
    print("\n" + "─" * 80)

    # Random Forest
    rf_pred, rf_proba = predict_rf(text)
    _print_model_result("Random Forest", rf_pred, rf_proba)

    # LSTM
    lstm_pred, lstm_proba = predict_lstm(text)
    _print_model_result("LSTM", lstm_pred, lstm_proba)

    # XGBoost
    xgb_pred, xgb_proba = predict_xgb(text)
    _print_model_result("XGBoost", xgb_pred, xgb_proba)

    rf_conf = max(rf_proba)
    lstm_conf = max(lstm_proba)
    xgb_conf = max(xgb_proba)

    # Voting
    final_prediction = _majority_vote([
        (rf_pred, rf_conf),
        (lstm_pred, lstm_conf),
        (xgb_pred, xgb_conf),
    ])

    print("\n" + "─" * 80)
    print(f"\nVOTING RESULT: {final_prediction}")
    print(f"(RF: {rf_pred}, LSTM: {lstm_pred}, XGBoost: {xgb_pred})")
    print()

    return {
        'rf': {'prediction': rf_pred, 'confidence': rf_conf},
        'lstm': {'prediction': lstm_pred, 'confidence': lstm_conf},
        'xgb': {'prediction': xgb_pred, 'confidence': xgb_conf},
        'voting': final_prediction
    }


# **5. Testing**

In [5]:
if __name__ == "__main__":
    banner = "=" * 80
    print(banner)
    print("TESTING INFERENCE")
    print(banner)

    # Sample reviews expected to cover positive, negative and neutral tones
    test_texts = [
        "Aplikasi gojek sangat bagus dan mudah digunakan, driver ramah sekali!",
        "Jelek banget aplikasinya sering error dan driver lama",
        "Biasa aja sih, tidak ada yang istimewa",
        "Pelayanan cepat dan harga terjangkau, recommended!",
        "Kecewa dengan layanannya, pesanan sering dibatalkan"
    ]
    total = len(test_texts)

    print(f"\nMelakukan prediksi untuk {total} sampel:\n")

    all_results = []

    for i, text in enumerate(test_texts, 1):
        print(f"\n{banner}")
        print(f"SAMPLE {i}/{total}")
        print(banner)
        result = predict_all(text)

        # Pull out each model's label and confidence
        rf_pred, rf_conf = result['rf']['prediction'], result['rf']['confidence']
        lstm_pred, lstm_conf = result['lstm']['prediction'], result['lstm']['confidence']
        xgb_pred, xgb_conf = result['xgb']['prediction'], result['xgb']['confidence']
        voting = result['voting']

        # Long texts are cut to 60 characters plus an ellipsis for the table
        if len(text) > 60:
            display_text = text[:60] + '...'
        else:
            display_text = text

        all_results.append({
            'Text': display_text,
            'RF_Pred': rf_pred,
            'RF_Conf': rf_conf,
            'LSTM_Pred': lstm_pred,
            'LSTM_Conf': lstm_conf,
            'XGB_Pred': xgb_pred,
            'XGB_Conf': xgb_conf,
            'Voting': voting
        })

    # Summary table with one row per sample
    df_results = pd.DataFrame(all_results)


TESTING INFERENCE

Melakukan prediksi untuk 5 sampel:


SAMPLE 1/5

Input Text: 'Aplikasi gojek sangat bagus dan mudah digunakan, driver ramah sekali!'
Clean Text: 'aplikasi gojek sangat bagus mudah guna driver ramah sekali'

────────────────────────────────────────────────────────────────────────────────

Random Forest:
Prediction: positif
Confidence: 0.6344 (63.44%)
Probabilities: negatif=0.1663, netral=0.1994, positif=0.6344

LSTM:
Prediction: positif
Confidence: 0.9931 (99.31%)
Probabilities: negatif=0.0069, netral=0.0000, positif=0.9931

XGBoost:
Prediction: positif
Confidence: 0.9999 (99.99%)
Probabilities: negatif=0.0000, netral=0.0000, positif=0.9999

────────────────────────────────────────────────────────────────────────────────

VOTING RESULT: positif
(RF: positif, LSTM: positif, XGBoost: positif)


SAMPLE 2/5

Input Text: 'Jelek banget aplikasinya sering error dan driver lama'
Clean Text: 'jelek banget aplikasi sering error driver lama'

────────────────────────────────────

In [6]:
# Show the first rows of the per-sample summary built in the testing cell.
df_results.head()

Unnamed: 0,Text,RF_Pred,RF_Conf,LSTM_Pred,LSTM_Conf,XGB_Pred,XGB_Conf,Voting
0,Aplikasi gojek sangat bagus dan mudah digunaka...,positif,0.634364,positif,0.99307,positif,0.999949,positif
1,Jelek banget aplikasinya sering error dan driv...,negatif,0.555778,negatif,0.999615,negatif,0.999961,negatif
2,"Biasa aja sih, tidak ada yang istimewa",netral,0.397864,negatif,0.51681,netral,0.956067,netral
3,"Pelayanan cepat dan harga terjangkau, recommen...",positif,0.459256,positif,0.998546,positif,0.949297,positif
4,"Kecewa dengan layanannya, pesanan sering dibat...",negatif,0.493346,negatif,0.931966,negatif,0.999019,negatif
