In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install -q tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub
print("Version: ", tf.__version__)

# Preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Embedding, Dropout
from tensorflow.keras.models import Sequential

# Loading
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json

# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

os.chdir("../")
from lib.utils import load_data
os.chdir("notebooks/")

Version:  2.1.0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\quort\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\quort\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\quort\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
df = load_data()
df.head()

Unnamed: 0,Body,Bias,cleaned_body
0,Abortion rights advocates have asked the U.S. ...,1.67,abortion right advocate asked u supreme court ...
1,A federal appeals court rejected the most dire...,0.67,federal appeal court rejected direct constitut...
2,As part of the Trump administration's effort t...,-2.75,part trump administration effort slow migrant ...
3,"President Donald Trump and ""the Trump of the T...",-4.33,president donald trump trump tropic brazilian ...
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0,u senator elizabeth warren competing democrati...


## DATA 

In [5]:
df.loc[df['Bias'] <= 0, 'Bias'] = 0
df.loc[df['Bias'] > 0, 'Bias'] = 1

In [6]:
df = df.drop(columns=['Body'])

In [7]:
train_set, testval_set = train_test_split(df)
val_set, test_set = train_test_split(testval_set, train_size=0.5)

X_train = train_set['cleaned_body']
X_val = val_set['cleaned_body']
X_test = test_set['cleaned_body']

y_train = train_set['Bias'].values
y_val = val_set['Bias'].values
y_test = test_set['Bias'].values

In [8]:
# Tokenizing
MAX_VOCAB_SIZE = 20000

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_val)
sequences_test = tokenizer.texts_to_sequences(X_test)

In [9]:
# Padding sequences
#X_train_pad = pad_sequences(sequences_train)

#MAX_LEN = X_train_pad.shape[1]
MAX_LEN = 2000

X_train_pad = pad_sequences(sequences_train, maxlen=MAX_LEN)
X_val_pad = pad_sequences(sequences_val, maxlen=MAX_LEN)
X_test_pad = pad_sequences(sequences_test, maxlen=MAX_LEN)

print('Training shape: ', X_train_pad.shape)
print('Validation shape: ', X_val_pad.shape)
print('Testing shape: ', X_test_pad.shape)

Training shape:  (1256, 2000)
Validation shape:  (209, 2000)
Testing shape:  (210, 2000)


In [10]:
def trim_dataset(mat, batch_size):
    """
    trims dataset to a size that's divisible by the batch size
    """

    no_of_rows_drop = mat.shape[0] % batch_size
    if(no_of_rows_drop > 0):
        return mat[:-no_of_rows_drop]
    else:
        return mat

In [11]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))

In [12]:
testdataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

## MODEL

In [53]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(X_train)

<tf.Tensor: shape=(1256, 20), dtype=float32, numpy=
array([[-6.7498541e-01, -2.9683700e+00,  9.1933184e+00, ...,
        -3.7393546e+00, -3.6545901e+00,  2.2885122e-03],
       [ 4.5606480e+00,  3.1452041e+00,  2.3387189e+00, ...,
        -5.3740139e+00, -4.0858994e+00, -4.7504001e+00],
       [ 3.3300530e-02, -2.1278386e+00,  3.6390228e+00, ...,
        -5.1505122e+00, -9.9419880e-01, -1.3399523e+00],
       ...,
       [ 1.3918315e+00, -2.3847456e+00,  1.4652982e+01, ...,
        -8.8711891e+00, -6.9092298e+00,  1.6106667e-01],
       [ 1.7348050e+00, -2.4183793e+00,  4.2662964e+00, ...,
        -2.1665649e+01,  4.1643882e+00, -8.3186274e+00],
       [ 5.4037565e-01, -3.6619914e+00,  4.0706768e+00, ...,
        -4.8490100e+00, -3.5005054e+00, -5.4608405e-01]], dtype=float32)>

In [54]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_4 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense_8 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [55]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [56]:
train_dataset = dataset.shuffle(len(df)).batch(1)

In [None]:
model.fit(train_dataset, epochs = 5)

Train for 1256 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
model.evaluate(X_test, y_test)

In [None]:
y_pred = model.predict_classes(X_test)
y_pred = [i for sublist in y_pred for i in sublist]

In [None]:
roc_auc_score(y_test, y_pred)