# Prepare

In [1]:
# Mount Google Drive at /content/drive so files stored there can be copied into the runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the saved model (.h5) and the pickled tokenizer from Drive into /content/.
!cp '/content/drive/My Drive/Machine Learning/Quora/bi_quora_question_pairs.h5' '/content/' #Fetch the model.h5 produced by the previous step
!cp '/content/drive/My Drive/Machine Learning/Quora/tokenizer.pkl' '/content/' #Fetch the tokenizer produced by the previous step

# Text

## Import

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize
from keras.models import load_model
import pickle
import time
import numpy as np
import nltk
# Download the NLTK resources used below (tokenizer data for word_tokenize, stopword lists).
nltk.download('popular')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Indonesian stopword set. NOTE(review): not referenced by any other cell visible in this notebook.
stop_words = set(stopwords.words('indonesian'))

## Run

In [4]:
# Load the tokenizer object pickled by the earlier step (copied into the working directory).
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
with open('tokenizer.pkl', 'rb') as handle:
	tokenizer = pickle.load(handle)

def clean_question(question):
	"""Return `question` reduced to its purely alphabetic tokens.

	The text is split with NLTK's word_tokenize; tokens for which
	str.isalpha() is false (punctuation, numbers, mixed tokens) are dropped
	and the remaining tokens are joined with single spaces.
	"""
	alphabetic = (tok for tok in word_tokenize(question) if tok.isalpha())
	return ' '.join(alphabetic)

def process_question(question):
	"""Clean every item of an iterable of questions.

	Each item is converted with str() and passed through clean_question.
	Returns a new list of cleaned strings in the same order.
	"""
	return [clean_question(str(item)) for item in question]

def make_prediction(model_path, data):
	"""Load the model stored at `model_path` and return its prediction for `data`.

	The model is loaded from disk on every call.
	"""
	return load_model(model_path).predict(data)


def tokenize_pad_questions(question_1, question_2, max_len=30):
	"""Convert two batches of questions into padded integer sequences.

	Args:
		question_1: iterable of first questions (one entry per pair).
		question_2: iterable of second questions, aligned with `question_1`.
		max_len: length every sequence is padded to. Defaults to 30, the
			value this notebook previously hard-coded; it should match the
			sequence length the saved model expects.

	Returns:
		Tuple (q1_data, q2_data) of arrays produced by pad_sequences with
		padding='post'.
	"""
	def _encode(questions):
		# Clean -> map words to tokenizer indices -> pad to a fixed length.
		sequences = tokenizer.texts_to_sequences(process_question(questions))
		return pad_sequences(sequences, maxlen = max_len, padding = 'post')

	return _encode(question_1), _encode(question_2)


def clean_results(result):
	"""Translate a model score into a human-readable verdict.

	Returns 'Pertanyaan Duplikat' when the score rounds (np.round) to 1,
	otherwise 'Pertanyaan Berbeda'. Intended for a single score: the
	comparison result is used as a boolean.
	"""
	is_duplicate = np.round(result) == 1
	return 'Pertanyaan Duplikat' if is_duplicate else 'Pertanyaan Berbeda'

# Example from Table 4.2: the two questions to be compared.
question_1 = "Apa yang harus diperhatikan sebelum membeli keyboard komputer"
question_2 = "Apa saja perlu dipertimbangkan sebelum membeli keyboard komputer"

# The helpers operate on batches, so wrap each question in a one-element list.
first_question = [question_1]
second_question = [question_2]

# Tokenize and pad, score the pair with the saved model, then map the score to a verdict.
q1_data, q2_data = tokenize_pad_questions(first_question, second_question)
y_pred = make_prediction('bi_quora_question_pairs.h5', [q1_data, q2_data])
y_pred_clean = clean_results(y_pred)



In [5]:
# Scale to percent first, then round: rounding to 2 decimals and multiplying by 100
# afterwards can print float noise such as 56.99999999999999.
print(f"Dua pertanyaan tersebut hasilnya: *{y_pred_clean}* dengan {np.round(y_pred[0][0].astype(float) * 100)}% probability")

Dua pertanyaan tersebut hasilnya: *Pertanyaan Duplikat* dengan 98.0% probability


In [None]:
# Display the raw array returned by model.predict for the example pair.
y_pred

array([[0.9783343]], dtype=float32)

# Streamlit

## Script

In [None]:
%%writefile config.py

# Directory holding the model and tokenizer files; the file names are joined onto it.
FILE_DIR = '/content/'
MODEL = 'bi_quora_question_pairs.h5'
TOKENIZER = 'tokenizer.pkl'

# Length every question is padded to before prediction. Set to 30 to agree with
# the notebook's own prediction cell, which pads to 30 for the same saved model;
# the previous value of 20 gave the app inputs of a different shape.
MAX_SEQUENCE_LENGTH = 30


In [None]:
%%writefile process.py
import numpy as np
import nltk
# Fetch the NLTK resources when this module is imported (tokenizer data for word_tokenize, stopword lists).
nltk.download('popular')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Indonesian stopword set. NOTE(review): `stop` is not used by any function in this module as shown.
stop=set(stopwords.words('indonesian'))

from nltk import word_tokenize
from keras.models import load_model


def clean_question(question):
	"""Keep only the alphabetic tokens of `question`.

	Tokenizes with NLTK's word_tokenize, discards every token for which
	str.isalpha() is false, and returns the survivors joined by single spaces.
	"""
	kept = []
	for token in word_tokenize(question):
		if token.isalpha():
			kept.append(token)
	return ' '.join(kept)



def process_question(question):
	"""Return a list with each item of `question` stringified and cleaned.

	Items are converted with str() and then passed to clean_question;
	input order is preserved.
	"""
	return [clean_question(str(entry)) for entry in question]



def make_prediction(model_path, data):
	"""Load the model at `model_path` and return model.predict(data).

	Note: the model is read from disk each time this function is called.
	"""
	loaded = load_model(model_path)
	return loaded.predict(data)
  

In [None]:
%%writefile app.py
import streamlit as st
from keras.preprocessing.sequence import pad_sequences
import pickle
import time
import numpy as np

import config
from process import process_question, clean_question, make_prediction




# Load the pickled tokenizer once at import time from the path configured in config.py.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
with open(config.FILE_DIR + config.TOKENIZER, 'rb') as handle:
	tokenizer = pickle.load(handle)


def tokenize_pad_questions(question_1, question_2):
	"""Turn two batches of questions into padded integer sequences.

	Each batch is cleaned with process_question, mapped to index sequences by
	the module-level tokenizer, and padded (padding='post') to
	config.MAX_SEQUENCE_LENGTH. Returns the tuple (q1_data, q2_data).
	"""
	cleaned_first = process_question(question_1)
	cleaned_second = process_question(question_2)

	padded = [
		pad_sequences(tokenizer.texts_to_sequences(texts), maxlen = config.MAX_SEQUENCE_LENGTH, padding = 'post')
		for texts in (cleaned_first, cleaned_second)
	]
	return padded[0], padded[1]


def clean_results(result):
	"""Map a model score to the verdict string shown to the user.

	A score that np.round turns into 1 yields 'Pertanyaan Duplikat';
	anything else yields 'Pertanyaan Berbeda'.
	"""
	verdicts = {True: 'Pertanyaan Duplikat', False: 'Pertanyaan Berbeda'}
	return verdicts[bool(np.round(result) == 1)]



def run():
	"""Render the Streamlit page and report whether two questions are duplicates.

	Draws the title, description and two text inputs. When the 'Predict'
	button is pressed and both questions are non-blank, the questions are
	tokenized, padded and scored by the saved model, and the verdict with its
	probability (as a whole-number percent) is shown. If either question is
	blank, an info message is shown and no result is reported.
	"""
	st.title('Identifikasi Duplikat Pertanyaan')
	st.text('')
	st.subheader('Deskripsi')
	st.markdown('Dengan lebih dari 100 juta orang mengunjungi Quora setiap bulan, banyak orang mengajukan pertanyaan serupa. Menggunakan GloVe + BiLSTM untuk mengidentifikasi pertanyaan-pertanyaan ini secara akurat akan membantu pengguna menemukan jawaban dengan lebih efektif dan efisien.')
	st.text('')

	question_1 = st.text_input('Apa pertanyaan pertama Anda?')
	question_2 = st.text_input('Apa pertanyaan kedua Anda?')

	# The helpers operate on batches, so wrap each question in a one-element list.
	first_question = [question_1]
	second_question = [question_2]

	if st.button('Predict'):
		with st.spinner('Sedang mengidentifikasi pertanyaan...'):
			# Compare by value rather than identity (`is not ''`), and treat
			# whitespace-only input as empty.
			if question_1.strip() and question_2.strip():
				q1_data, q2_data = tokenize_pad_questions(first_question, second_question)
				y_pred = make_prediction(config.FILE_DIR + config.MODEL, [q1_data, q2_data])
				y_pred_clean = clean_results(y_pred)

				# Scale to percent before rounding to avoid float noise such as
				# 56.99999999999999 in the displayed value.
				probability_pct = np.round(y_pred[0][0].astype(float) * 100)
				# Report only when a prediction exists; doing this after the
				# else-branch referenced names that were never assigned.
				st.success(f"Dua pertanyaan tersebut hasilnya: **{y_pred_clean}** dengan **{probability_pct}%** probability")
			else:
				st.write('[INFO] Tidak ada pertanyaan.. Silahkan tulis pertanyaan')



# Script entry point: build the page when this file is executed directly.
if __name__ == '__main__':

	run()





## Run

In [None]:
# Install Streamlit and the `ngrok` pip package into the runtime.
# NOTE(review): versions are unpinned. The ngrok binary invoked later (./ngrok) is
# downloaded separately with wget, so confirm the pip package is actually needed.
!pip install streamlit
!pip install ngrok

In [1]:
# Start the Streamlit app in the background and discard its output so the cell returns immediately.
!streamlit run app.py &>/dev/null&

In [None]:
# Download the ngrok Linux (amd64) archive and unpack it quietly into the working directory.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -qq ngrok-stable-linux-amd64.zip

In [None]:
# Launch an ngrok HTTP tunnel to port 8501 in the background, then query ngrok's
# local API on port 4040 and print the first tunnel's public URL.
# NOTE(review): presumably 8501 is the port the Streamlit app serves on — confirm.
# curl runs right after ngrok is launched, so the tunnel list may still be empty;
# re-run the cell if the lookup fails.
get_ipython().system_raw('./ngrok http 8501 &')
! curl -s http://localhost:4040/api/tunnels | python3 -c \
  "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

In [None]:
# Run the Streamlit app in the foreground (no trailing &), so this cell keeps running while the server is up.
# NOTE(review): an earlier cell already starts a background instance; confirm both are needed.
!streamlit run app.py