Import Library

In [10]:
from google.colab import files
import random
import json
import nltk

!pip install nltk googletrans
from nltk.corpus import wordnet
from googletrans import Translator
import pandas as pd                          # Untuk load dan manipulasi dataset
import numpy as np                           # Operasi numerik
from sklearn.model_selection import train_test_split   # Split data train-test
from sklearn.preprocessing import LabelEncoder         # Encode label kategori

import tensorflow as tf                      # Framework deep learning
from tensorflow.keras.preprocessing.text import Tokenizer       # Tokenisasi teks
from tensorflow.keras.preprocessing.sequence import pad_sequences # Padding sequence
from tensorflow.keras.models import Sequential     # Model Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional



## Data Collection

### Tujuan: Mengimpor / Load data dari Google Form.

In [9]:
# Upload file manual
uploaded = files.upload()

# Ambil nama file dari upload (biasanya hanya satu file)
file_name = list(uploaded.keys())[0]

# Baca file
df = pd.read_csv(file_name)

# Tampilkan 5 baris awal
df

Saving Formulir Minat & Rekomendasi Program Studi  (Jawaban) - Dataset (1).csv to Formulir Minat & Rekomendasi Program Studi  (Jawaban) - Dataset (1) (1).csv


Unnamed: 0,Deskripsi,Label
0,Saya tertarik mempelajari obat-obatan dan baga...,Farmasi
1,Sejak kecil saya ingin menjadi apoteker dan me...,Farmasi
2,"Saya suka biologi dan kimia, terutama tentang ...",Farmasi
3,Saya ingin membuat obat herbal modern dari bah...,Farmasi
4,Saya ingin mendalami cara meracik dan memformu...,Farmasi
...,...,...
2845,Saya berniat membantu orang hidup lebih sehat ...,Ilmu Gizi
2846,Saya tertarik dengan hubungan antara makanan d...,Ilmu Gizi
2847,Diriku ingin menjadi ahli gizi di rumah sakit ...,Ilmu Gizi
2848,Saya ingin mengedukasi masyarakat tentang pent...,Ilmu Gizi


## Data Understanding

### Tujuan: Memahami struktur data dan mengecek isi kolom.

In [11]:
# Lihat semua nama kolom yang tersedia
print(df.columns.tolist())

['Deskripsi', 'Label']


In [12]:
# Informasi ringkas tentang data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2850 entries, 0 to 2849
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Deskripsi  2850 non-null   object
 1   Label      2850 non-null   object
dtypes: object(2)
memory usage: 44.7+ KB


In [13]:
print('Jumlah Prodi : ', len(df['Label'].unique()))

Jumlah Prodi :  18


In [14]:
print('Jumlah Deskripsi : ', len(df['Deskripsi']))

Jumlah Deskripsi :  2850


In [15]:
print('Jumlah Deskripsi Per Label', df.groupby('Label')['Deskripsi'].count())

Jumlah Deskripsi Per Label Label
Akuntansi                250
Arsitektur               250
Ekonomi Pembangunan      100
Farmasi                  100
Hukum                    100
Ilmu Gizi                250
Ilmu Komunikasi          100
Ilmu Politik             100
Kedokteran               100
Manajemen                250
Pendidikan Guru          250
Psikologi                100
Sistem Informasi         100
Statistika/Matematika    100
Teknik Elektro           100
Teknik Informatika       100
Teknik Mesin             250
Teknik Sipil             250
Name: Deskripsi, dtype: int64


## Data Preprocessing

### Tujuan: Membersihkan dan menyiapkan data.

In [16]:
# Cek missing value
print("\nCek Missing Values:")
print(df.isnull().sum())


Cek Missing Values:
Deskripsi    0
Label        0
dtype: int64


In [17]:
df['cleaned_text'] = df['Deskripsi'].str.lower()

In [18]:
df

Unnamed: 0,Deskripsi,Label,cleaned_text
0,Saya tertarik mempelajari obat-obatan dan baga...,Farmasi,saya tertarik mempelajari obat-obatan dan baga...
1,Sejak kecil saya ingin menjadi apoteker dan me...,Farmasi,sejak kecil saya ingin menjadi apoteker dan me...
2,"Saya suka biologi dan kimia, terutama tentang ...",Farmasi,"saya suka biologi dan kimia, terutama tentang ..."
3,Saya ingin membuat obat herbal modern dari bah...,Farmasi,saya ingin membuat obat herbal modern dari bah...
4,Saya ingin mendalami cara meracik dan memformu...,Farmasi,saya ingin mendalami cara meracik dan memformu...
...,...,...,...
2845,Saya berniat membantu orang hidup lebih sehat ...,Ilmu Gizi,saya berniat membantu orang hidup lebih sehat ...
2846,Saya tertarik dengan hubungan antara makanan d...,Ilmu Gizi,saya tertarik dengan hubungan antara makanan d...
2847,Diriku ingin menjadi ahli gizi di rumah sakit ...,Ilmu Gizi,diriku ingin menjadi ahli gizi di rumah sakit ...
2848,Saya ingin mengedukasi masyarakat tentang pent...,Ilmu Gizi,saya ingin mengedukasi masyarakat tentang pent...


Augmentasi data teks menggunakan dua teknik:
1. Synonym Replacement
2. Back Translation


In [19]:
# Download resource
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Inisialisasi translator
translator = Translator()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


 FUNGSI 1: Synonym Replacement

In [20]:
def synonym_replacement(text, n=2):
    words = nltk.word_tokenize(text)
    new_words = words.copy()
    random.shuffle(words)

    num_replaced = 0
    for word in words:
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name().lower() != word.lower():
                    synonyms.add(lemma.name().replace("_", " "))
        if synonyms:
            new_words = [w if w != word else random.choice(list(synonyms)) for w in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)


FUNGSI 2: Back Translation


In [21]:
def back_translation(text, lang='en'):
    try:
        translated = translator.translate(text, dest=lang).text
        back_translated = translator.translate(translated, dest='id').text
        return back_translated
    except Exception as e:
        print("Error in back translation:", e)
        return text


In [22]:
# Buat Data Tambahan Otomatis untuk Label yang Kurang dari 250
target_count = 250
current_counts = df['Label'].value_counts()
labels_to_expand = current_counts[current_counts < target_count]

# Define the missing function
def generate_descriptions(label, count):
    generated_list = []
    # Example placeholder: Simply create dummy descriptions
    # You should replace this with a proper text generation method
    for i in range(count):
        generated_list.append(f"Ini adalah deskripsi tambahan untuk {label} {i+1}")
    return generated_list


additional_data = []
for label, current_count in labels_to_expand.items():
    missing_count = target_count - current_count
    # Call the newly defined function
    new_descriptions = generate_descriptions(label, missing_count)
    for desc in new_descriptions:
        additional_data.append({"Deskripsi": desc, "Label": label})

# Gabungkan ke DataFrame Asli
df_extra = pd.DataFrame(additional_data)
df_augmented = pd.concat([df, df_extra], ignore_index=True)

# Cek Distribusi Terbaru
print("Distribusi label setelah augmentasi:")
print(df_augmented['Label'].value_counts().sort_index())

Distribusi label setelah augmentasi:
Label
Akuntansi                250
Arsitektur               250
Ekonomi Pembangunan      250
Farmasi                  250
Hukum                    250
Ilmu Gizi                250
Ilmu Komunikasi          250
Ilmu Politik             250
Kedokteran               250
Manajemen                250
Pendidikan Guru          250
Psikologi                250
Sistem Informasi         250
Statistika/Matematika    250
Teknik Elektro           250
Teknik Informatika       250
Teknik Mesin             250
Teknik Sipil             250
Name: count, dtype: int64


In [23]:
# Encode label string ke angka
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["Label"])

# Simpan mapping label ke nama jurusan
label_map = {i: label for i, label in enumerate(label_encoder.classes_)}

# Simpan ke file JSON
with open("label_map.json", "w") as f:
    json.dump(label_map, f)

## Modelling

### Tujuan: Membangun sistem rekomendasi.

In [24]:
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

num_classes = df["label_id"].nunique()

# Simpan word_index ke file
with open('word_index.json', 'w') as f:
    json.dump(tokenizer.word_index, f)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, df['label_id'], test_size=0.2, random_state=42)

In [26]:
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64, input_length=100),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(df["label_id"].nunique(), activation='softmax')  # num_classes
])

model2.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)



In [27]:
model2.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 199ms/step - accuracy: 0.0773 - loss: 2.8552
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 113ms/step - accuracy: 0.2291 - loss: 2.3708
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 101ms/step - accuracy: 0.6400 - loss: 1.1779
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 84ms/step - accuracy: 0.7713 - loss: 0.7256
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 85ms/step - accuracy: 0.8762 - loss: 0.4094
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 101ms/step - accuracy: 0.9255 - loss: 0.2459
Epoch 7/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 97ms/step - accuracy: 0.9557 - loss: 0.1672
Epoch 8/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 85ms/step - accuracy: 0.9771 - loss: 0.0948
Epoch 9/10
[1m72/72[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7cc1afb06e10>

In [28]:
model3 = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=False)),  # tambah Bidirectional
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model3.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [29]:
history = model3.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 125ms/step - accuracy: 0.0955 - loss: 2.8455
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 142ms/step - accuracy: 0.3380 - loss: 2.0450
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 108ms/step - accuracy: 0.6598 - loss: 1.0224
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 108ms/step - accuracy: 0.7440 - loss: 0.7235
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 113ms/step - accuracy: 0.8450 - loss: 0.4595
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 124ms/step - accuracy: 0.8990 - loss: 0.3460
Epoch 7/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 125ms/step - accuracy: 0.9431 - loss: 0.1973
Epoch 8/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 108ms/step - accuracy: 0.9502 - loss: 0.1629
Epoch 9/10
[1m72/72[0m [32m━━━━━

## Evaluasi

### Tujuan: Fungsi untuk memberikan rekomendasi jurusan.

In [30]:
def predict(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model2.predict(padded_sequence)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    return predicted_label[0]

print(predict("saya suka kimia dan membuat obat obatan"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373ms/step
Farmasi


# Save Model

In [31]:
model2.save('model.h5')



In [32]:
# Install tensorflowjs
!pip install tensorflowjs

Collecting tensorflowjs
  Downloading tensorflowjs-4.22.0-py3-none-any.whl.metadata (3.2 kB)
Collecting packaging~=23.1 (from tensorflowjs)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading tensorflowjs-4.22.0-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, tensorflowjs
  Attempting uninstall: packaging
    Found existing installation: packaging 24.2
    Uninstalling packaging-24.2:
      Successfully uninstalled packaging-24.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio-client 1.10.1 requires httpx>=0.24.1

In [33]:
!tensorflowjs_converter --input_format=keras model.h5 tfjs_model_keras

2025-06-10 18:55:44.378420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749581744.409210    3942 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749581744.417378    3942 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[32m🌲 Try [0m[34mhttps://ydf.readthedocs.io[0m[32m, the successor of TensorFlow Decision Forests with more features and faster training![0m
failed to lookup keras version from the file,
    this is likely a weight only file


In [34]:
!zip  -r tfjs_model_fix.zip tfjs_model_keras


  adding: tfjs_model_keras/ (stored 0%)
  adding: tfjs_model_keras/model.json (deflated 83%)
  adding: tfjs_model_keras/group1-shard1of1.bin (deflated 8%)


In [35]:
files.download('tfjs_model_fix.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
from tensorflow import keras

try:
    model = keras.models.load_model("model.h5")
    print("✅ Model berhasil dimuat.")
except Exception as e:
    print("❌ Gagal memuat model:", e)



✅ Model berhasil dimuat.
