# Training model tanpa menggunakan Spark

In [1]:
import ast
import json

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from Util.prep_data import process_slang, prep

In [2]:
# Load the chatbot definitions from the JSON cache
with open('./Cache/chatbot.json', encoding='utf-8') as fh:
    data = json.load(fh)

# Load the already-preprocessed chatbot dataset
df_chatbot = pd.read_csv('Data/prep-data-chatbot.csv')

In [3]:
def _tokens_to_text(tokens):
    """Join a token list into one space-separated string.

    Cells of the 'prep' column read back from CSV arrive as the string form
    of a Python list (e.g. "['a', 'b']"). They are parsed with
    ``ast.literal_eval`` rather than ``eval`` so that file content can never
    execute arbitrary code. Values that are not strings are joined as-is.
    """
    if isinstance(tokens, str):
        tokens = ast.literal_eval(tokens)
    return ' '.join(tokens)


# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(min_df=1)

# Convert the stringified token lists in the 'prep' column to plain strings
df_chatbot['prep_text'] = df_chatbot['prep'].apply(_tokens_to_text)

# Fit on the preprocessed text and build the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(df_chatbot['prep_text'])

# Vocabulary terms, in the column order of the matrix
feature_names = vectorizer.get_feature_names_out()

# Dense DataFrame view of the TF-IDF matrix, for inspection only
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df['tag'] = df_chatbot['tag']

In [4]:
# # Persist the fitted TfidfVectorizer and the TF-IDF matrix (currently disabled)
# import pickle

# # Save the vectorizer
# with open('Util/tfidf_vectorizer.pkl', 'wb') as f:
#    pickle.dump(vectorizer, f)

# # Save the TF-IDF matrix
# with open('Util/tfidf_matrix.pkl', 'wb') as f:
#    pickle.dump(tfidf_matrix, f)

In [5]:
# Features are the TF-IDF matrix; labels are the intent tags.
# NOTE(review): the vectorizer was fitted on the full dataset before this
# split, so the IDF statistics also include the held-out rows.
X, y = tfidf_matrix, df_chatbot['tag']

# Hold out 30% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [6]:
# Build a default-configured Logistic Regression and fit it on the training split
lr_model = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out split and score the predictions
y_pred_lr = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Report overall accuracy, then the per-class breakdown
print("\nAkurasi Logistic Regression:", accuracy_lr)
print("\nLaporan Klasifikasi Logistic Regression:")
print(classification_report(y_test, y_pred_lr))


Akurasi Logistic Regression: 0.9835390946502057

Laporan Klasifikasi Logistic Regression:
                      precision    recall  f1-score   support

           about_car       0.97      0.98      0.98        62
       about_chatbot       1.00      0.97      0.99        34
  car_recommendation       0.95      0.97      0.96        39
             goodbye       1.00      1.00      1.00        26
info_manual_otomatic       1.00      1.00      1.00        44
       rekomen_mobil       1.00      0.97      0.99        38

            accuracy                           0.98       243
           macro avg       0.99      0.98      0.98       243
        weighted avg       0.98      0.98      0.98       243



In [7]:
# Try the trained model on new user input
def get_response(text, threshold=0.8):
    """Return the chatbot reply for a raw user message.

    The message is passed through ``process_slang`` and ``prep``, the
    resulting tokens are joined with spaces and vectorised with
    ``vectorizer``, and ``lr_model`` predicts the intent tag. The highest
    class probability is printed on every call.

    Args:
        text: Raw user message.
        threshold: Minimum top-class probability required to answer with an
            intent response. Defaults to 0.8.

    Returns:
        The first entry of ``responses`` of the first intent in
        ``data['intents']`` whose ``tag`` equals the predicted class, when
        the top probability is at least ``threshold``. In every other case
        (low confidence, no intent with that tag, or an empty response
        list) the fallback message is returned.
    """
    fallback = "maksud anda tidak saya mengerti, bisa dijelaskan lebih detail?"

    # Apply the same preprocessing that produced the training text
    text_slang = process_slang(text)
    text_prep = prep(text_slang)

    # Vectorise the new input with the fitted TF-IDF vectorizer
    text_vector = vectorizer.transform([' '.join(text_prep)])

    # Predicted tag and the probability of the most likely class
    pred_class = lr_model.predict(text_vector)[0]
    max_prob = max(lr_model.predict_proba(text_vector)[0])
    print("Probabilitas prediksi: " + str(max_prob))

    # Not confident enough: ask the user to rephrase
    if max_prob < threshold:
        return fallback

    for intent in data['intents']:
        if intent['tag'] == pred_class:
            responses = intent['responses']
            if responses:
                return responses[0]
            break

    # Confident prediction but nothing usable to answer with. Previously this
    # path reached `return output` with `output` unbound (UnboundLocalError).
    return fallback


In [8]:
# Manual check: send one sample message through the chatbot and show the reply
test_text = "rekomendasi mobil"
response = get_response(test_text)
print("jawaban:", response)

Probabilitas prediksi: 0.8879394117429875
jawaban: Haai sobat CarBot, kamu mau rekomendasi mobil?. Aku bakal ngasih kamu 10 rekomendasi mobil nih, tapi sebelumnya kamu jawab pertanyaan ini ya: 
1. Jenis mobil apa yang kamu mau? 
2. Berapa kilometer pemakaian mobil yang kamu inginkan? 
3. Terakhir berapa range harga yang kamu mau? 
 untuk aturan penginputan seperti berikut ya “#rekomendasi Honda 5000 500000000”. Untuk pertanyaat dijawab secara berurutan ya sobat.


In [9]:
# from joblib import dump
# dump(lr_model, 'Util/lr_nonspark.joblib')

['Util/lr_nonspark.joblib']