In [10]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization
import os
import io



In [None]:
# Download the two class corpora from the NewB repository.
# cache_dir='.' together with an empty cache_subdir is requested so the files
# sit next to the notebook; later cells open them by bare filename.
dataset_conservative = tf.keras.utils.get_file(
    fname="conservative.txt",
    origin="https://github.com/JerryWei03/NewB/raw/master/conservative.txt",
    cache_dir='.',
    cache_subdir='',
)

dataset_liberal = tf.keras.utils.get_file(
    fname="liberal.txt",
    origin="https://github.com/JerryWei03/NewB/raw/master/liberal.txt",
    cache_dir='.',
    cache_subdir='',
)




In [5]:
# Download the NewB test split, using the same cache settings as the
# class corpora (cache_dir='.' and an empty cache_subdir).
dataset_test = tf.keras.utils.get_file(
    fname="test.txt",
    origin="https://github.com/JerryWei03/NewB/raw/master/test.txt",
    cache_dir='.',
    cache_subdir='',
)

Downloading data from https://github.com/JerryWei03/NewB/raw/master/test.txt


In [22]:
# Split conservative.txt into one file per line under dataset/conservative/,
# i.e. one sub-directory per class, which is the layout the dataset-loading
# cell reads from "dataset".
conservative_dir = os.path.join("dataset", "conservative")
# open(..., 'w') does not create missing parent directories, so make sure the
# target exists; exist_ok keeps the cell re-runnable.
os.makedirs(conservative_dir, exist_ok=True)

with open("conservative.txt", 'r', encoding="utf-8") as f:
    counter = 0
    # Iterate the file lazily instead of loading every line with readlines().
    for s in f:
        # The first whitespace-delimited token is dropped (presumably a label
        # or id column -- confirm against the NewB file format).
        parts = s.split(maxsplit=1)
        if len(parts) < 2:
            # Blank line, or a leading token with no text after it: there is
            # nothing to write, and indexing [1] would raise IndexError.
            continue
        name = os.path.join(conservative_dir, str(counter) + ".txt")
        with open(name, 'w', encoding='utf-8') as d:
            d.write(parts[1])
        counter += 1



In [1]:
# Split liberal.txt into one file per line under dataset/liberal/,
# i.e. one sub-directory per class, which is the layout the dataset-loading
# cell reads from "dataset".
liberal_dir = os.path.join("dataset", "liberal")
# open(..., 'w') does not create missing parent directories, so make sure the
# target exists; exist_ok keeps the cell re-runnable.
os.makedirs(liberal_dir, exist_ok=True)

with open("liberal.txt", 'r', encoding="utf-8") as f:
    counter = 0
    # Iterate the file lazily instead of loading every line with readlines().
    for s in f:
        # The first whitespace-delimited token is dropped (presumably a label
        # or id column -- confirm against the NewB file format).
        parts = s.split(maxsplit=1)
        if len(parts) < 2:
            # Blank line, or a leading token with no text after it: there is
            # nothing to write, and indexing [1] would raise IndexError.
            continue
        name = os.path.join(liberal_dir, str(counter) + ".txt")
        with open(name, 'w', encoding='utf-8') as d:
            d.write(parts[1])
        counter += 1


In [1]:

# Mean number of whitespace-separated tokens per line across both corpora.
# NOTE(review): every token on a line is counted, including the leading token
# that the file-splitting cells discard, so the text-only average may be about
# one lower than the value printed here -- confirm.
total_words = conservative_lines = liberal_lines = 0

with open("conservative.txt", 'r', encoding="utf-8") as conservative_file:
    conservative_rows = conservative_file.readlines()
    conservative_lines = len(conservative_rows)
    total_words += sum(len(row.split()) for row in conservative_rows)

with open("liberal.txt", 'r', encoding="utf-8") as liberal_file:
    liberal_rows = liberal_file.readlines()
    liberal_lines = len(liberal_rows)
    total_words += sum(len(row.split()) for row in liberal_rows)

line_count = conservative_lines + liberal_lines
print("Average line length: " + str(total_words / line_count))

Average line length: 24.01202808720905


Create the training and validation datasets. Class labels follow the alphabetical order of the class directories, so conservative is labeled 0 and liberal is labeled 1.

In [5]:
# Tunable settings for loading and vectorizing the text.
batch_size = 1024
seed = 39
# Presumably chosen from the average line length printed earlier -- confirm.
sequence_length = 24
vocab_size = 10000
AUTOTUNE = tf.data.AUTOTUNE

# Both subsets are requested with the same seed and validation_split; only the
# `subset` argument differs between the two calls.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "dataset", subset='training', **split_options
)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "dataset", subset='validation', **split_options
)

# Cache the loaded batches and prefetch ahead of the consumer.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Maps raw strings to fixed-length integer token sequences.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only (labels are dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)
    

Found 230710 files belonging to 2 classes.
Using 184568 files for training.
Found 230710 files belonging to 2 classes.
Using 46142 files for validation.


In [6]:
embedding_dim = 16

# Text classifier: vectorize -> embed -> average over the sequence -> dense.
model = Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name='embedding'),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1)  # no activation: the output is a single raw logit
])

# Write training logs under ./logs for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# The loss consumes raw logits (from_logits=True), so the decision boundary is
# logit == 0. The string metric 'accuracy' would threshold predictions at 0.5,
# which is only right for sigmoid probabilities; threshold=0.0 matches the
# logit output. name='accuracy' keeps the metric key used in logs/history.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

    




Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x161ef9e64f0>

In [8]:
# Take the first weight array of the layer named 'embedding', and the
# vocabulary list held by the vectorization layer.
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [11]:
# Export the embeddings as two parallel TSV files: vectors.tsv holds one
# tab-separated vector per line, metadata.tsv holds the matching word per line.
# The context manager closes both files even if a write raises part-way.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")