In [1]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
import tensorflow as tf
import pandas as pd
import numpy as np

### Cleaning Datasets

In [2]:
dataset = pd.read_csv('resesi-class.csv')

In [3]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,full_text,sentiment
3385,3385,Survei Sebut Potensi Indonesia Resesi Ekonomi ...,positive
3386,3386,Indonesia hanya berptensi sebesar tiga persen ...,positive
3387,3387,Moga resesi tak terjadi di Indonesia. #Wujudka...,positive
3388,3388,Indonesia hanya berpotensi sebesar 3% untuk re...,positive
3389,3389,Indonesia hanya memiliki potensi sebesar tiga ...,positive


In [4]:
# Cleaning The Datasets
dataset['full_text'] = dataset['full_text'].str.encode('ascii', 'replace').str.decode('ascii')
dataset['full_text'] = dataset['full_text'].str.replace(r'[?]+', ' ').str.replace(r'RT @', ' ').str.replace(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ').str.replace(r'@[^\s]+', '').str.replace(r'\\n', ' ').str.replace(r'[\s]+', ' ').str.replace('\s+', ' ', regex=True).str.replace(r'#([^\s]+)', r'')

  dataset['full_text'] = dataset['full_text'].str.replace(r'[?]+', ' ').str.replace(r'RT @', ' ').str.replace(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ').str.replace(r'@[^\s]+', '').str.replace(r'\\n', ' ').str.replace(r'[\s]+', ' ').str.replace('\s+', ' ', regex=True).str.replace(r'#([^\s]+)', r'')


In [5]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,full_text,sentiment
3385,3385,Survei Sebut Potensi Indonesia Resesi Ekonomi ...,positive
3386,3386,Indonesia hanya berptensi sebesar tiga persen ...,positive
3387,3387,Moga resesi tak terjadi di Indonesia.,positive
3388,3388,Indonesia hanya berpotensi sebesar 3% untuk re...,positive
3389,3389,Indonesia hanya memiliki potensi sebesar tiga ...,positive


In [6]:
dataset.to_csv('resesi-class-clean.csv', index = False)

### Split Datasets, Tokenize, Padding

In [7]:
#Parameters
vocab_size = 1000
embedding_dim = 16
max_length = 50
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .9

In [8]:
# Split Datasets
x_train, x_test = train_test_split(dataset['full_text'], train_size = training_portion, shuffle = False)
y_train, y_test = train_test_split(dataset['sentiment'], train_size = training_portion, shuffle = False)

In [17]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3051,)
(339,)
(3051,)
(339,)


In [9]:
#Tokenize and Pad Sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

train_sequences = tokenizer.texts_to_sequences(x_train)
train_padding = pad_sequences(train_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

test_sequences = tokenizer.texts_to_sequences(x_test)
test_padding = pad_sequences(test_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [10]:
#Tokenize and Pad Labels
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(dataset['sentiment'])

train_label_seq = label_tokenizer.texts_to_sequences(y_train)
test_label_seq = label_tokenizer.texts_to_sequences(y_test)

In [11]:
train_label_seq = np.array(train_label_seq)
test_label_seq = np.array(test_label_seq)

In [18]:
print(x_train.shape)
print(x_test.shape)
print(train_label_seq.shape)
print(test_label_seq.shape)

(3051,)
(339,)
(3051, 1)
(339, 1)


In [12]:
#Model Architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

In [13]:
#Model Train
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

model.fit(
    train_padding,
    train_label_seq,
    epochs = 50,
    validation_data = (test_padding, test_label_seq),
    verbose = 1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x21399bb16d0>

In [14]:
# Get the index-word dictionary
reverse_word_index = tokenizer.index_word

# Get the embedding layer from the model (i.e. first layer)
embedding_layer = model.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

(1000, 16)


In [15]:
import io

# Open writeable files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Initialize the loop. Start counting at `1` because `0` is just for the padding
for word_num in range(1, vocab_size):

  # Get the word associated at the current index
  word_name = reverse_word_index[word_num]

  # Get the embedding weights associated with the current index
  word_embedding = embedding_weights[word_num]

  # Write the word name
  out_m.write(word_name + "\n")

  # Write the word embedding
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

# Close the files
out_v.close()
out_m.close()