In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
import tensorflow as tf
import pandas as pd
import numpy as np

### Cleaning Datasets

In [4]:
dataset = pd.read_csv('dataset/resesi-class.csv')

In [5]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,full_text,sentiment
3385,3385,Survei Sebut Potensi Indonesia Resesi Ekonomi ...,positive
3386,3386,Indonesia hanya berptensi sebesar tiga persen ...,positive
3387,3387,Moga resesi tak terjadi di Indonesia. #Wujudka...,positive
3388,3388,Indonesia hanya berpotensi sebesar 3% untuk re...,positive
3389,3389,Indonesia hanya memiliki potensi sebesar tiga ...,positive


In [6]:
# Cleaning The Datasets
dataset['full_text'] = dataset['full_text'].str.encode('ascii', 'replace').str.decode('ascii')
dataset['full_text'] = dataset['full_text'].str.replace(r'[?]+', ' ').str.replace(r'RT @', ' ').str.replace(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ').str.replace(r'@[^\s]+', '').str.replace(r'\\n', ' ').str.replace(r'[\s]+', ' ').str.replace('\s+', ' ', regex=True).str.replace(r'#([^\s]+)', r'')

  dataset['full_text'] = dataset['full_text'].str.replace(r'[?]+', ' ').str.replace(r'RT @', ' ').str.replace(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ').str.replace(r'@[^\s]+', '').str.replace(r'\\n', ' ').str.replace(r'[\s]+', ' ').str.replace('\s+', ' ', regex=True).str.replace(r'#([^\s]+)', r'')


In [7]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,full_text,sentiment
3385,3385,Survei Sebut Potensi Indonesia Resesi Ekonomi ...,positive
3386,3386,Indonesia hanya berptensi sebesar tiga persen ...,positive
3387,3387,Moga resesi tak terjadi di Indonesia.,positive
3388,3388,Indonesia hanya berpotensi sebesar 3% untuk re...,positive
3389,3389,Indonesia hanya memiliki potensi sebesar tiga ...,positive


In [8]:
dataset.to_csv('resesi-class-clean.csv', index = False)

### Split Datasets, Tokenize, Padding

In [9]:
#Parameters
vocab_size = 1000
embedding_dim = 16
max_length = 50
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .9

In [10]:
# Split Datasets
x_train, x_test = train_test_split(dataset['full_text'], train_size = training_portion, shuffle = False)
y_train, y_test = train_test_split(dataset['sentiment'], train_size = training_portion, shuffle = False)

In [11]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3051,)
(339,)
(3051,)
(339,)


In [12]:
#Tokenize and Pad Sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

train_sequences = tokenizer.texts_to_sequences(x_train)
train_padding = pad_sequences(train_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

test_sequences = tokenizer.texts_to_sequences(x_test)
test_padding = pad_sequences(test_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [13]:
#Tokenize and Pad Labels
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(dataset['sentiment'])

train_label_seq = label_tokenizer.texts_to_sequences(y_train)
test_label_seq = label_tokenizer.texts_to_sequences(y_test)

In [14]:
train_label_seq = np.array(train_label_seq)
test_label_seq = np.array(test_label_seq)

In [15]:
print(x_train.shape)
print(x_test.shape)
print(train_label_seq.shape)
print(test_label_seq.shape)

(3051,)
(339,)
(3051, 1)
(339, 1)


In [16]:
#Model Architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

In [17]:
#Model Train
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

model.fit(
    train_padding,
    train_label_seq,
    epochs = 50,
    validation_data = (test_padding, test_label_seq),
    verbose = 1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
22/96 [=====>........................] - ETA: 0s - loss: nan - accuracy: 0.0000e+00

KeyboardInterrupt: 

In [None]:
# Get the index-word dictionary
reverse_word_index = tokenizer.index_word

# Get the embedding layer from the model (i.e. first layer)
embedding_layer = model.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

In [None]:
import io

# Open writeable files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Initialize the loop. Start counting at `1` because `0` is just for the padding
for word_num in range(1, vocab_size):

  # Get the word associated at the current index
  word_name = reverse_word_index[word_num]

  # Get the embedding weights associated with the current index
  word_embedding = embedding_weights[word_num]

  # Write the word name
  out_m.write(word_name + "\n")

  # Write the word embedding
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

# Close the files
out_v.close()
out_m.close()