In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras.layers import TextVectorization, Embedding
from tensorflow.keras import Sequential

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def label_data(x):
    article = tf.strings.split(x, maxsplit=1)[0]
    if article < 5:
        label = 1
    elif article > 5:
        label = 0
    else:
        label = 0.5
    
    return tf.strings.split(x, maxsplit=1)[1], label


batch_size = 1024
seed = 39
sequence_length = 25
vocab_size = 100000
embedding_dim = 128

conservative_ds = tf.data.TextLineDataset('conservative.txt')
liberal_ds = tf.data.TextLineDataset('liberal.txt')

total_lines = 0

with open('conservative.txt', 'r') as c:
    total_lines +=  len(c.readlines())
with open('liberal.txt') as c:
    total_lines +=  len(c.readlines())


dataset = tf.data.Dataset.sample_from_datasets([conservative_ds, liberal_ds])

dataset = text_ds.map(label_data).batch(batch_size)

def get_dataset_partitions_tf(ds, ds_size, train_split=0.9, val_split=0.1, test_split=0, shuffle=True, shuffle_size=10000):

    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

train_ds, val_ds, _ = get_dataset_partitions_tf(dataset, total_lines)

AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size =  AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size =  AUTOTUNE)


vectorize_layer = TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int', 
    output_sequence_length = sequence_length)

text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

Found 230710 files belonging to 2 classes.
Using 184568 files for training.
Found 230710 files belonging to 2 classes.
Using 46142 files for validation.


In [None]:




class EmbeddingInitializer(tf.keras.initializers.Initializer):
    def __call__(self, shape=None, dtype=None, **kwargs):
        weights = tf.convert_to_tensor(np.load('w2vVectors.npy'))
        return weights

embedding_layer = Embedding(vocab_size,
                            embedding_dim,
                            embeddings_initializer = EmbeddingInitializer(),
                            mask_zero= True)




model = Sequential([
    vectorize_layer,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="classifier_logs")

model.fit(train_ds, epochs=25,
                    validation_data=val_ds,
                    validation_steps=30, 
                    callbacks = tensorboard_callback)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f2c26fa2410>

In [None]:

!cd checkpoints/
!mkdir one 
model.save_weights("./checkpoints/one/mycheckpoint")
with open("checkpoints/one/METADATA.txt", 'w') as file:
    file.write("Batch size = " + str(batch_size))
    file.write("\nSeed = " + str(seed))
    file.write("\nSequence length = " + str(sequence_length))
    file.write("\nVocab size = " + str(vocab_size))
    file.write("\nEmbedding dimensions = " + str(embedding_dim))
    

mkdir: cannot create directory ‘checkpoints’: File exists


In [None]:
sentence = 'rasekhi left for iran to visit her family the day after trump was elected president her roommate agatha lyczek said ashdkfahsd asdfkasdhfkja asdhfakhsdfj dakjhfaksjdf adhfjasd sahdfkjasd '
guess = model.predict([sentence])[0][0]
if guess > 0.5:
    label = 'liberal'
else:
    label = 'conservative'

print('The sentence ' + "'" + sentence + "'" + ' is ' + label)

The sentence 'rasekhi left for iran to visit her family the day after trump was elected president her roommate agatha lyczek said ashdkfahsd asdfkasdhfkja asdhfakhsdfj dakjhfaksjdf adhfjasd sahdfkjasd ' is conservative
