<a href="https://colab.research.google.com/github/bryanbayup/phising-detection/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!unzip allinone.zip

Archive:  allinone.zip
   creating: app/
   creating: app/models/
  inflating: app/models/model_intent.keras  
  inflating: app/models/model_ner.keras  
   creating: app/data/
  inflating: app/data/vectorizer.pickle  
  inflating: app/data/stopword_list_tala.txt  
  inflating: app/data/dataaa.json    
   creating: app/encoders/
  inflating: app/encoders/tokenizer.pickle  
  inflating: app/encoders/ner_label_encoder.pickle  
  inflating: app/encoders/label_encoder.pickle  


In [3]:
import json
import os
import re
import pickle
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

from transformers import (
    AutoTokenizer,
    TFAutoModelForCausalLM,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import nltk

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
os.chdir('/content/app')
os.makedirs('data', exist_ok=True)
data_path = 'data/data2.json'

with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Ekstrak user turns untuk intent & NER
utterances = []
intents = []
entities_all = []

for conv in data:
    for turn in conv['turns']:
        if turn['speaker'] == 'user':
            utt = turn['utterance']
            intent = turn['intent']
            ents = turn.get('entities', [])
            utterances.append(utt)
            intents.append(intent)
            entities_all.append(ents)

df = pd.DataFrame({'utterance': utterances, 'intent': intents, 'entities': entities_all})

print("Contoh data intent & NER:")
print(df.head())

Contoh data intent & NER:
                                           utterance  \
0    Saya melihat seekor kucing sakit di depan toko.   
1           Kucing terlihat demam dan bersin-bersin.   
2                  Ada anjing terluka di jalan raya.   
3              Ya, anjing muntah dan terlihat lemas.   
4  Saya menemukan seekor kucing dengan luka di ka...   

                       intent  \
0  Melaporkan Hewan Terlantar   
1         Mendiagnosis Gejala   
2  Melaporkan Hewan Terlantar   
3         Mendiagnosis Gejala   
4  Melaporkan Hewan Terlantar   

                                            entities  
0  [{'entity': 'animal', 'value': 'kucing', 'star...  
1  [{'entity': 'animal', 'value': 'kucing', 'star...  
2  [{'entity': 'animal', 'value': 'anjing', 'star...  
3  [{'entity': 'animal', 'value': 'anjing', 'star...  
4  [{'entity': 'animal', 'value': 'kucing', 'star...  


In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Jika perlu stopwords dan stemming (disesuaikan dengan kebutuhan)
# Contoh tanpa stopwords dan stemming untuk kesederhanaan
df['utterance_clean'] = df['utterance'].apply(clean_text)

In [19]:
!apt install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 0s (121 kB/s)
Selecting previously unselected package tree.
(Reading database ... 123632 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [22]:
label_encoder = LabelEncoder()
df['intent_label'] = label_encoder.fit_transform(df['intent'])
num_classes = len(label_encoder.classes_)

# Create the 'encoders' directory first
os.makedirs('encoders', exist_ok=True)

# Then, open a file *within* the directory for writing
with open('encoders/label_encoder_new.pickle', 'wb') as f:  # Specify a filename within the directory
    pickle.dump(label_encoder, f)

texts = df['utterance_clean'].tolist()
labels = df['intent_label'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Tokenizer untuk intent
tokenizer_intent = tf.keras.preprocessing.text.Tokenizer(oov_token='')
tokenizer_intent.fit_on_texts(train_texts)
word_index = tokenizer_intent.word_index
vocab_size = len(word_index)+1

train_seq = tokenizer_intent.texts_to_sequences(train_texts)
val_seq = tokenizer_intent.texts_to_sequences(val_texts)

max_seq_length = max(max(len(s) for s in train_seq), max(len(s) for s in val_seq))
train_padded = pad_sequences(train_seq, maxlen=max_seq_length, padding='post')
val_padded = pad_sequences(val_seq, maxlen=max_seq_length, padding='post')

train_labels_cat = to_categorical(train_labels, num_classes=num_classes)
val_labels_cat = to_categorical(val_labels, num_classes=num_classes)

# Embedding Sederhana (Random), Anda bisa load FastText jika ada
embedding_dim = 300
embedding_matrix = np.random.normal(scale=0.6, size=(vocab_size, embedding_dim))

with open('encoders/tokenizer_intent_new.pickle', 'wb') as f:
    pickle.dump(tokenizer_intent, f)

In [23]:
def build_intent_model(embedding_matrix, max_seq_length, num_classes, l2_reg=0.001):
    inputs = Input(shape=(max_seq_length,))
    embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=True
    )(inputs)
    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
    pool = GlobalMaxPooling1D()(conv)
    dense = Dense(64, activation='relu', kernel_regularizer=l2(l2_reg))(pool)
    drop = Dropout(0.5)(dense)
    outputs = Dense(num_classes, activation='softmax')(drop)
    model = Model(inputs, outputs)
    return model

model_intent = build_intent_model(embedding_matrix, max_seq_length, num_classes)
model_intent.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_intent.fit(
    train_padded, train_labels_cat,
    validation_data=(val_padded, val_labels_cat),
    epochs=5,
    batch_size=16
)

os.makedirs('models', exist_ok=True)
model_intent.save('models/model_intent_new.keras')



Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 393ms/step - accuracy: 0.2323 - loss: 2.1279 - val_accuracy: 0.5000 - val_loss: 1.0290
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6472 - loss: 0.9639 - val_accuracy: 0.7000 - val_loss: 0.7364
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8151 - loss: 0.5949 - val_accuracy: 0.7667 - val_loss: 0.5978
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9163 - loss: 0.4270 - val_accuracy: 0.8333 - val_loss: 0.4976
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9633 - loss: 0.2958 - val_accuracy: 0.8333 - val_loss: 0.4418


In [24]:
# Buat label NER
all_labels = set(['O'])
for ents in df['entities']:
    for ent in ents:
        ent_type = ent['entity']
        all_labels.add('B-' + ent_type)
        all_labels.add('I-' + ent_type)

ner_label_encoder = {label: idx for idx, label in enumerate(sorted(all_labels))}
ner_label_decoder = {idx: label for label, idx in ner_label_encoder.items()}

with open('encoders/ner_label_encoder_new.pickle', 'wb') as f:
    pickle.dump(ner_label_encoder, f)

def create_ner_data(texts, entities_list, tokenizer, max_len):
    X = []
    Y = []
    for text, ents in zip(texts, entities_list):
        words = text.split()
        seq = tokenizer.texts_to_sequences([text])[0]
        label_seq = ['O'] * len(seq)

        # Pemetaan sederhana entitas -> token (berdasarkan substring match)
        # Jika sistem tokenisasi lebih rumit, perbaiki logika ini.
        for ent in ents:
            ent_text = clean_text(ent['value'])
            ent_tokens = ent_text.split()
            for i in range(len(words)-len(ent_tokens)+1):
                if words[i:i+len(ent_tokens)] == ent_tokens:
                    label_seq[i] = 'B-' + ent['entity']
                    for j in range(1, len(ent_tokens)):
                        label_seq[i+j] = 'I-' + ent['entity']
                    break

        # Padding
        if len(seq) < max_len:
            seq += [0]*(max_len-len(seq))
            label_seq += ['O']*(max_len-len(label_seq))
        else:
            seq = seq[:max_len]
            label_seq = label_seq[:max_len]

        X.append(seq)
        Y.append([ner_label_encoder[l] for l in label_seq])

    X = np.array(X)
    Y = np.array(Y)
    Y = to_categorical(Y, num_classes=len(ner_label_encoder))
    return X, Y

X_ner, Y_ner = create_ner_data(df['utterance_clean'].tolist(), df['entities'].tolist(), tokenizer_intent, max_seq_length)
X_ner_train, X_ner_val, Y_ner_train, Y_ner_val = train_test_split(X_ner, Y_ner, test_size=0.2, random_state=42)

def build_ner_model(embedding_matrix, max_seq_length, num_entities):
    inputs = Input(shape=(max_seq_length,))
    embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=True
    )(inputs)
    lstm = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    drop = Dropout(0.5)(lstm)
    outputs = TimeDistributed(Dense(num_entities, activation='softmax'))(drop)
    model = Model(inputs, outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_ner = build_ner_model(embedding_matrix, max_seq_length, len(ner_label_encoder))
model_ner.fit(
    X_ner_train, Y_ner_train,
    validation_data=(X_ner_val, Y_ner_val),
    epochs=5,
    batch_size=16
)

model_ner.save('models/model_ner_new.keras')

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 73ms/step - accuracy: 0.5959 - loss: 1.4359 - val_accuracy: 0.8146 - val_loss: 0.7039
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8608 - loss: 0.5364 - val_accuracy: 0.8333 - val_loss: 0.5623
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8846 - loss: 0.4165 - val_accuracy: 0.8771 - val_loss: 0.4630
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9122 - loss: 0.3296 - val_accuracy: 0.8792 - val_loss: 0.3932
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9207 - loss: 0.2726 - val_accuracy: 0.8833 - val_loss: 0.3407


In [26]:
!pip install datasets
from datasets import Dataset

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [29]:
# Buat corpus untuk Language Modeling dari dataset percakapan
lm_texts = []
for conv in data:
    for turn in conv['turns']:
        prefix = "User:" if turn['speaker'] == 'user' else "Bot:"
        lm_texts.append(f"{prefix} {turn['utterance']}")
    lm_texts.append("")  # Pisahkan tiap percakapan dengan newline

lm_corpus = "\n".join(lm_texts)
with open('data/lm_corpus.txt', 'w', encoding='utf-8') as f:
    f.write(lm_corpus)

gpt_model_name = "cahya/gpt2-small-indonesian-522M"
gpt_tokenizer = AutoTokenizer.from_pretrained(gpt_model_name)
gpt_model = AutoModelForCausalLM.from_pretrained(gpt_model_name)

# Add a padding token to the tokenizer
if gpt_tokenizer.pad_token is None:
    gpt_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    gpt_model.resize_token_embeddings(len(gpt_tokenizer)) # Important: Resize model embeddings

# Load dataset untuk huggingface
lines = lm_corpus.splitlines()
dataset = Dataset.from_dict({'text': lines})

def tokenize_function(examples):
    return gpt_tokenizer(examples["text"], truncation=True, max_length=128, padding='max_length')

tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt_tokenizer,
    mlm=False,
    return_tensors="pt"
)

training_args = TrainingArguments(
    output_dir="./gpt_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    logging_dir='./logs',
    do_train=True,
    do_eval=False
)

# Kita hanya punya training dataset, bagi sedikit untuk eval jika perlu
# Misalnya 90/10 split
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['test']

trainer = Trainer(
    model=gpt_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

trainer.train()

trainer.save_model("./gpt_finetuned")

print("Proses updating model selesai! Model intents, NER dan GPT telah diperbarui sesuai dataset baru.")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/387 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 4


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
100,3.1506


Proses updating model selesai! Model intents, NER dan GPT telah diperbarui sesuai dataset baru.


In [33]:
# Load the label encoder for intent prediction
with open('encoders/label_encoder.pickle', 'rb') as f:
    label_encoder_intent = pickle.load(f)

In [35]:
def predict_intent(utterance):
    seq = tokenizer_intent.texts_to_sequences([utterance])
    padded_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=model_intent.input_shape[1], padding='post')
    predictions = model_intent.predict(padded_seq)
    intent = label_encoder_intent.inverse_transform([predictions.argmax(axis=1)[0]])  # Use loaded label_encoder_intent
    return intent[0]
def predict_ner(utterance):
    words = utterance.split()
    seq = tokenizer_intent.texts_to_sequences([utterance])[0]
    padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=model_ner.input_shape[1], padding='post')
    predictions = model_ner.predict(padded_seq).argmax(axis=-1)[0]
    labels = [ner_label_decoder[p] for p in predictions[:len(words)]]
    return list(zip(words, labels))
def generate_response(context, max_length=50):
    input_ids = gpt_tokenizer(context, return_tensors='pt').input_ids
    # Move input_ids to the same device as the model
    input_ids = input_ids.to(gpt_model.device)
    gen_tokens = gpt_model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.9)
    gen_text = gpt_tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return gen_text
utterance = "Saya melihat seekor kucing sakit di depan toko."
intent = predict_intent(utterance)
print(f"Intent: {intent}")
ner_results = predict_ner(utterance)
print("Entities:")
for word, label in ner_results:
    print(f"  {word}: {label}")
context = """User: Saya melihat seekor kucing sakit di depan toko.
Bot: Terima kasih atas laporannya. Apakah kucing tersebut menunjukkan gejala seperti demam atau muntah?
User: Kucing terlihat muntah dan lemas."""

response = generate_response(context)
print(f"Bot Response: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Intent: Melaporkan Hewan Terlantar
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Entities:
  Saya: O
  melihat: O
  seekor: O
  kucing: B-animal
  sakit: O
  di: O
  depan: O
  toko.: O
Bot Response: User: Saya melihat seekor kucing sakit di depan toko.
Bot: Terima kasih atas laporannya. Apakah kucing tersebut menunjukkan gejala seperti demam atau muntah?
User: Kucing terlihat muntah dan lemas. Apa yang harus dilakukan???
