In [78]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
from nltk import TweetTokenizer
import re
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns

### Loading Dataset

In [79]:
dataset = pd.read_csv('dataset/resesi-class.csv')

In [80]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,full_text,sentiment
3385,3385,Survei Sebut Potensi Indonesia Resesi Ekonomi ...,positive
3386,3386,Indonesia hanya berptensi sebesar tiga persen ...,positive
3387,3387,Moga resesi tak terjadi di Indonesia. #Wujudka...,positive
3388,3388,Indonesia hanya berpotensi sebesar 3% untuk re...,positive
3389,3389,Indonesia hanya memiliki potensi sebesar tiga ...,positive


### Cleaning Dataset

In [81]:
# Buat Semua Kata Menjadi Huruf Kecil
dataset['full_text'] = dataset['full_text'].str.lower()

# Fungsi Untuk Mengubah Angka Menjadi Teks
def angka_ke_teks(match):
    angka = match.group(0)
    terjemahan = {
        '0': 'nol',
        '1': 'satu',
        '2': 'dua',
        '3': 'tiga',
        '4': 'empat',
        '5': 'lima',
        '6': 'enam',
        '7': 'tujuh',
        '8': 'delapan',
        '9': 'sembilan'
    }
    return terjemahan.get(angka, angka)

for i in range(len(dataset)):
    # Hapus Karakter Yang Tidak Ada Pada ASCII
    dataset['full_text'][i] = re.sub(r'[^\x00-\x7F]+', ' ', dataset['full_text'][i])
    
    # Hapus Web URL dan RT
    dataset['full_text'][i] = re.sub(r'(?:\rt @|http?\://|https?\://|www)\S+', '', dataset['full_text'][i])
    
    # Hapus Mention
    dataset['full_text'][i] = re.sub(r'@[^\s]+', '', dataset['full_text'][i])
    
    # Hapus Newline
    dataset['full_text'][i] = re.sub(r'\\n', ' ', dataset['full_text'][i])
    
    # Hapus Tanda Baca
    dataset['full_text'][i] = re.sub(r'[^\w\s#%]', '', dataset['full_text'][i])
    
    # Ubah Angka Menjadi Teks
    dataset['full_text'][i] = re.sub(r'\d', angka_ke_teks, dataset['full_text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['full_text'][i] = re.sub(r'[^\x00-\x7F]+', ' ', dataset['full_text'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['full_text'][i] = re.sub(r'(?:\rt @|http?\://|https?\://|www)\S+', '', dataset['full_text'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['full_text'][i] = re.sub(r'@[^\s]+', '', dataset['full_text'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pand

In [82]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,full_text,sentiment
3385,3385,survei sebut potensi indonesia resesi ekonomi ...,positive
3386,3386,indonesia hanya berptensi sebesar tiga persen ...,positive
3387,3387,moga resesi tak terjadi di indonesia #wujudkan...,positive
3388,3388,indonesia hanya berpotensi sebesar tiga% untuk...,positive
3389,3389,indonesia hanya memiliki potensi sebesar tiga ...,positive


In [83]:
dataset.to_csv('dataset/resesi-class-clean.csv', index = False)

### Split Datasets, Tokenize, Padding

In [84]:
#Parameters
vocab_size = 1000
embedding_dim = 16
max_length = 50
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .9

In [85]:
# Split Datasets
x_train, x_test = train_test_split(dataset['full_text'], train_size = training_portion, shuffle = False)
y_train, y_test = train_test_split(dataset['sentiment'], train_size = training_portion, shuffle = False)

In [86]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3051,)
(339,)
(3051,)
(339,)


In [87]:
#Tokenize and Pad Sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

train_sequences = tokenizer.texts_to_sequences(x_train)
train_padding = pad_sequences(train_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

test_sequences = tokenizer.texts_to_sequences(x_test)
test_padding = pad_sequences(test_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [88]:
#Tokenize and Pad Labels
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(dataset['sentiment'])

train_label_seq = label_tokenizer.texts_to_sequences(y_train)
test_label_seq = label_tokenizer.texts_to_sequences(y_test)

In [89]:
train_label_seq = np.array(train_label_seq)
test_label_seq = np.array(test_label_seq)

In [90]:
print(x_train.shape)
print(x_test.shape)
print(train_label_seq.shape)
print(test_label_seq.shape)

(3051,)
(339,)
(3051, 1)
(339, 1)


In [91]:
#Model Architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences = True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation = 'softmax')
])

In [93]:
#Model Train
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

mode.fit(
    train_padding,
    train_label_seq,
    epochs = 30,
    validation_data = (test_padding, test_label_seq),
    verbose = 2
)

Epoch 1/50


ValueError: in user code:

    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\engine\training.py", line 1024, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\engine\training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\losses.py", line 284, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\losses.py", line 2004, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "D:\Project Pribadi\Model ML 1\venv\lib\site-packages\keras\backend.py", line 5532, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 3) are incompatible


In [None]:
# Get the index-word dictionary
reverse_word_index = tokenizer.index_word

# Get the embedding layer from the model (i.e. first layer)
embedding_layer = model.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

In [None]:
import io

# Open writeable files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Initialize the loop. Start counting at `1` because `0` is just for the padding
for word_num in range(1, vocab_size):

  # Get the word associated at the current index
  word_name = reverse_word_index[word_num]

  # Get the embedding weights associated with the current index
  word_embedding = embedding_weights[word_num]

  # Write the word name
  out_m.write(word_name + "\n")

  # Write the word embedding
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

# Close the files
out_v.close()
out_m.close()