In [1]:
# Mount Google Drive at /content/drive; the dataset and model paths used by
# this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Google Drive folders holding the preprocessed datasets and the saved models.
DATASET_PATH = "/content/drive/My Drive/ire-proj/processedData"
MODEL_PATH = "/content/drive/My Drive/ire-proj/models"

# List both folders (IPython shell escapes) so missing files are noticed early.
!ls "$DATASET_PATH"

print()

!ls "$MODEL_PATH"

articles-training-byarticle.csv        articles-validation-bypublisher.csv
articles-training-bypublisher.csv      glove.6B.300d.txt
articles-training-bypublisher-old.csv

lstm_binary.h5		partial_lstm_multiclass.h5
partial_lstm_binary.h5	partial_lstm_multitask3.h5


In [0]:
# Path of fully trained models
# NOTE(review): every checkpoint referenced below is named "partial_*", and the
# models folder listing also contains "lstm_binary.h5" — confirm these are the
# intended (fully trained) weights.
BINARY_MODEL = f"{MODEL_PATH}/partial_lstm_binary.h5"
MULTICLASS_MODEL = f"{MODEL_PATH}/partial_lstm_multiclass.h5"

MULTITASK_MODEL = f"{MODEL_PATH}/partial_lstm_multitask3.h5"

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, \
                    GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder 
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import classification_report

from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from numpy.testing import assert_allclose

Using TensorFlow backend.


In [0]:
# Row caps handed to pd.read_csv(nrows=...); None means "read every row".
N_TRAINING_SAMPLES = None

# When a training cap is set, the test cap is half of it.
if N_TRAINING_SAMPLES is None:
    N_TEST_SAMPLES = None
else:
    N_TEST_SAMPLES = N_TRAINING_SAMPLES // 2

# Preparing text data
Format text samples and labels into tensors that can be fed into a neural network.
- keras.preprocessing.text.Tokenizer
- keras.preprocessing.sequence.pad_sequences

In [0]:
# Source: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

MAX_NUM_WORDS = 40000    # dictionary size (passed to Tokenizer as num_words)
MAX_SEQUENCE_LENGTH = 1500  # number of tokens per article (passed to pad_sequences as maxlen)
EMBEDDING_DIM = 300 # dimensionality of the embedding vector (50, 100, 200, 300)

# File the fitted tokenizer is pickled to by tokenize_trainingdata and read
# back from by tokenize_testdata.
TOKENIZER_DUMP_FILE='tokenizer.p'

# Raw string: the filter set contains a backslash, and `\]` in a normal string
# literal is an invalid escape sequence (a warning today, an error in future
# Python versions). The resulting characters are unchanged.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,
                      filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

def tokenize_trainingdata(X):
    """Fit the module-level tokenizer on ``X`` and return padded id sequences.

    The fitted tokenizer is pickled to ``TOKENIZER_DUMP_FILE`` so that
    ``tokenize_testdata`` can apply the same vocabulary later.

    Args:
        X: iterable of raw text documents.

    Returns:
        Tuple ``(padded, word_index)``: the output of ``pad_sequences`` with
        ``maxlen=MAX_SEQUENCE_LENGTH`` and the tokenizer's word -> index map.
    """
    tokenizer.fit_on_texts(X)
    token_ids = tokenizer.texts_to_sequences(X)

    vocabulary = tokenizer.word_index
    print(f'Found {len(vocabulary)} unique tokens.')

    padded = pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)

    # Persist the fitted tokenizer for reuse on the test data.
    with open(TOKENIZER_DUMP_FILE, 'wb') as dump_file:
        pickle.dump(tokenizer, dump_file)

    return padded, vocabulary

def tokenize_testdata(X):
    """Convert ``X`` to padded id sequences using the pickled training tokenizer.

    Args:
        X: iterable of raw text documents.

    Returns:
        The output of ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # TOKENIZER_DUMP_FILE that this notebook wrote itself.
    with open(TOKENIZER_DUMP_FILE, 'rb') as dump_file:
        fitted_tokenizer = pickle.load(dump_file)

    print(f'Found {len(fitted_tokenizer.word_index)} unique tokens.')

    token_ids = fitted_tokenizer.texts_to_sequences(X)
    return pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
def reverse_to_categorical(y, limit=5):
    """Convert one-hot (or probability) rows back to integer class indices.

    Args:
        y: 2-D array-like with one row per sample and one column per class.
        limit: number of leading rows to decode. Defaults to 5, which keeps the
            original preview behaviour; pass ``None`` to decode every row.

    Returns:
        1-D numpy array holding the argmax column index of each decoded row.
    """
    # y[:None] is the full array, so a single slice covers both cases.
    return np.argmax(y[:limit], axis=1)

# Preparing the embedding layer

In [0]:
def load_embeddings(word_index, GLOVE_FILE_PATH):
    """Build a non-trainable Keras ``Embedding`` layer from a GloVe text file.

    Args:
        word_index: mapping of word -> integer row index in the embedding matrix.
        GLOVE_FILE_PATH: path to a UTF-8 text file in which every line is a
            word followed by the whitespace-separated vector components.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows of width
        ``EMBEDDING_DIM``, initialised from the file and frozen
        (``trainable=False``). Words absent from the file keep an all-zero row.
    """
    # Load glove word embeddings
    embeddings_index = {}
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(GLOVE_FILE_PATH, 'r', encoding='utf8') as glove_file:
        for line in glove_file:
            # each line starts with a word; rest of the line is the vector
            values = line.split()
            if not values:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f'Found {len(embeddings_index)} word vectors in glove file.')

    # Now use embedding_index dictionary and our word_index
    # to compute our embedding matrix
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    print("embedding_matrix shape:", np.shape(embedding_matrix))

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer

# Load datasets

In [9]:
# Load the by-publisher training split. Column names are supplied explicitly
# via `names`, and the title column is forced to str.
train_csv_path = DATASET_PATH + '/articles-training-bypublisher.csv'
column_names = ['article_id', 'title', 'articleContent', 'bias', 'hyperpartisan']
df = pd.read_csv(train_csv_path,
                 names=column_names,
                 dtype={'title': str},
                 nrows=N_TRAINING_SAMPLES)

# Missing titles become a single space so the string cleaning step never sees NaN.
df['title'] = df['title'].fillna(' ')
df.count()

article_id        600000
title             600000
articleContent    600000
bias              600000
hyperpartisan     600000
dtype: int64

In [0]:
def perform_cleaning(text):
    """Lower-case ``text`` and reduce it to ASCII letters/digits separated by spaces.

    Steps: lower-case and trim, collapse whitespace runs to one space, then
    replace every run of characters outside ``[A-Za-z0-9]`` with one space.
    Because trimming happens first, punctuation at either end of the input
    leaves a single leading/trailing space in the result.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    normalised = text.lower().strip()
    single_spaced = ' '.join(normalised.split())
    return re.sub('[^A-Za-z0-9]+', ' ', single_spaced)

# Clean both text columns of the training frame.
for text_column in ('title', 'articleContent'):
    df[text_column] = df[text_column].map(perform_cleaning)

In [11]:
# Preview the first rows of the cleaned training frame.
df.head()

Unnamed: 0,article_id,title,articleContent,bias,hyperpartisan
0,1,after devos announced plans to reexamine title...,when explaining her decision to reevaluate tit...,right,True
1,2,university to award trayvon martin with posthu...,a florida university will honor trayvon martin...,right,True
2,8,texas state university suspends greek life aft...,nov 15 upi texas state university has suspende...,right-center,False
3,12,jewish organization s huge day of unity on tue...,against the backdrop of an increasingly polari...,right,True
4,15,breaking trump reaches agreement to keep 1 000...,president elect donald trump has reached an ag...,right,True


In [12]:
# Load the by-publisher validation split, which this notebook uses as its test set.
# dtype={'title': str} matches the training-set load, so titles are always
# parsed as text and perform_cleaning's .lower() never receives a number.
df_test = pd.read_csv(filepath_or_buffer=DATASET_PATH + '/articles-validation-bypublisher.csv',
                 names=['article_id', 'title', 'articleContent', 'bias', 'hyperpartisan'],
                 dtype={'title': str},
                 nrows=N_TEST_SAMPLES
                 )
# Missing titles become a single space so the string cleaning step never sees NaN.
df_test['title'] = df_test['title'].fillna(value=' ')
df_test.count()

article_id        150000
title             150000
articleContent    150000
bias              150000
hyperpartisan     150000
dtype: int64

In [13]:
# Apply the same text cleaning to the test frame, then preview its last rows.
for text_column in ('title', 'articleContent'):
    df_test[text_column] = df_test[text_column].map(perform_cleaning)
df_test.tail(5)

Unnamed: 0,article_id,title,articleContent,bias,hyperpartisan
149995,1494825,,by andrew osborn moscow reuters russia is quie...,left,True
149996,1494857,i now pronounce you spouse and spouse,in keeping with its reputation of pioneering s...,right,True
149997,1494877,it s now clear that only a democrat can stop d...,donald trump s ongoing evisceration of the rep...,left,True
149998,1494883,the liberal redneck my proudest moment as a de...,lr the liberal redneck here coming to you afte...,left,True
149999,1494893,obama s victory fourth global press roundup,from watching america com here s another delug...,least,False


In [14]:
# Class balance of the binary label in the training frame, then the test frame.
for frame in (df, df_test):
    print(frame['hyperpartisan'].value_counts())

True     300000
False    300000
Name: hyperpartisan, dtype: int64
True     75000
False    75000
Name: hyperpartisan, dtype: int64


# Binary classifier (Biased / Unbiased)

## Separate labels from features

In [0]:
# Training features and the two label sets (binary flag, kind of bias).
X = df['articleContent'].values
y_bias = df['hyperpartisan'].values
y_bias_kind = df['bias'].values

# The same three arrays for the test frame.
X_test = df_test['articleContent'].values
y_test_bias = df_test['hyperpartisan'].values
y_test_bias_kind = df_test['bias'].values

# Class counts are derived from the distinct training labels.
NUM_CLASSES_BIAS = len(np.unique(y_bias))
NUM_CLASSES_BIAS_KIND = len(np.unique(y_bias_kind))

In [16]:
# Peek at the first five labels of each kind.
for labels in (y_bias, y_bias_kind):
    print(labels[:5])

[ True  True False  True  True]
['right' 'right' 'right-center' 'right' 'right']


## Tokenize data

In [17]:
# Tokenize the training texts first (this fits and pickles the tokenizer),
# then the test texts, which reuse the pickled tokenizer.
X, word_index = tokenize_trainingdata(X)
X_test = tokenize_testdata(X_test)

# One-hot encode the binary labels for both splits.
y_bias = to_categorical(y_bias, num_classes=NUM_CLASSES_BIAS)
y_test_bias = to_categorical(y_test_bias, num_classes=NUM_CLASSES_BIAS)

Found 700689 unique tokens.
Found 700689 unique tokens.


In [18]:
# Show the first five one-hot rows alongside their decoded class indices.
first_rows = y_bias[:5]
print(first_rows)
print(reverse_to_categorical(first_rows))

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]
[1 1 0 1 1]


In [0]:
# Hold out 20% of the training data for validation; the seed is fixed so the
# split is repeatable.
X_train, X_validate, y_train_bias, y_validate_bias = train_test_split(
    X,
    y_bias,
    test_size=0.2,
    random_state=12,
)

# Predict Using LSTM Binary

In [20]:
# Load the saved binary (biased / unbiased) LSTM from BINARY_MODEL.
model = load_model(BINARY_MODEL)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




## On validation set

In [0]:
# Score the binary model on the validation split; argmax turns the one-hot
# targets and the predicted scores into class indices.
y_pred_bias_validate = model.predict(X_validate)
true_idx = np.argmax(y_validate_bias, axis=1)
pred_idx = np.argmax(y_pred_bias_validate, axis=1)
print(classification_report(true_idx, pred_idx, target_names=['unbiased','biased']))

```
              precision    recall  f1-score   support

    unbiased       0.94      0.91      0.93     59919
      biased       0.91      0.95      0.93     60081

    accuracy                           0.93    120000
   macro avg       0.93      0.93      0.93    120000
weighted avg       0.93      0.93      0.93    120000

```

In [0]:
# Confusion matrix for the binary model on the validation split.
# The ground truth is y_validate_bias. The previous version compared the
# validation predictions with y_pred_bias (the test-set predictions, which are
# only computed in a later cell) and passed the predictions as the first
# argument, contradicting the axis labels below.
ax = plt.subplot()
cm = confusion_matrix(np.argmax(y_validate_bias, axis=1),
                      np.argmax(y_pred_bias_validate, axis=1))
sns.heatmap(cm, annot=True, ax=ax, fmt='g')
# confusion_matrix puts true labels on rows (y axis) and predictions on columns (x axis).
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

## On test set

In [0]:
# Predict on the test set and compare the first five targets with the first
# five predicted score rows.
y_pred_bias = model.predict(X_test)
for rows in (y_test_bias, y_pred_bias):
    print(rows[:5])

In [0]:
# Per-class precision/recall of the binary model on the test set.
true_idx = np.argmax(y_test_bias, axis=1)
pred_idx = np.argmax(y_pred_bias, axis=1)
print(classification_report(true_idx, pred_idx, target_names=['unbiased','biased']))

```
              precision    recall  f1-score   support

    unbiased       0.65      0.31      0.42     75000
      biased       0.55      0.84      0.66     75000

    accuracy                           0.57    150000
   macro avg       0.60      0.57      0.54    150000
weighted avg       0.60      0.57      0.54    150000

```

In [0]:
# Confusion matrix of the binary model on the test set (true labels first,
# predictions second).
true_idx = np.argmax(y_test_bias, axis=1)
pred_idx = np.argmax(y_pred_bias, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# Predict Using LSTM multiclass (Kind of bias classifier)

In [0]:
# First five kind-of-bias labels of the training and test sets.
for labels in (y_bias_kind, y_test_bias_kind):
    print(labels[:5])

## Encode labels

In [0]:
# Fit the encoder on the distinct training labels (fit returns the encoder
# itself), then display the learned classes.
labelEncoder = LabelEncoder().fit(np.unique(y_bias_kind))
labelEncoder.classes_

In [0]:
# Replace the string labels of both splits with their integer codes and show
# the first five of each.
y_bias_kind = labelEncoder.transform(y_bias_kind)
print(y_bias_kind[:5])

y_test_bias_kind = labelEncoder.transform(y_test_bias_kind)
print(y_test_bias_kind[:5])

In [0]:
# Inverse transform: map the first five encoded labels back to their string names
labelEncoder.inverse_transform(y_bias_kind[:5])

In [0]:
# One-hot encode the integer kind-of-bias labels of both splits.
y_bias_kind, y_test_bias_kind = (
    to_categorical(encoded, num_classes=NUM_CLASSES_BIAS_KIND)
    for encoded in (y_bias_kind, y_test_bias_kind)
)

In [0]:
# First five one-hot rows of the kind-of-bias training labels.
y_bias_kind[:5]

In [0]:
# Recover integer class indices from the one-hot matrix (the reverse of
# to_categorical); reverse_to_categorical decodes only the first five rows.
print(reverse_to_categorical(y_bias_kind))

## Split into train and validate sets

In [0]:
# Split again for the kind-of-bias labels, using the same test_size and
# random_state as the binary-label split.
X_train, X_validate, y_train_bias_kind, y_validate_bias_kind = train_test_split(
    X,
    y_bias_kind,
    test_size=0.2,
    random_state=12,
)

## Load Model

In [0]:
# Load the saved multiclass (kind of bias) LSTM from MULTICLASS_MODEL; this rebinds `model`.
model = load_model(MULTICLASS_MODEL)

## On validation set

In [0]:
# Score the multiclass model on the validation split; class names come from
# the fitted label encoder.
y_pred_bias_kind_validate = model.predict(X_validate)
true_idx = np.argmax(y_validate_bias_kind, axis=1)
pred_idx = np.argmax(y_pred_bias_kind_validate, axis=1)
print(classification_report(true_idx, pred_idx, target_names=labelEncoder.classes_))

```
              precision    recall  f1-score   support

       least       0.98      0.96      0.97     37297
        left       0.84      0.86      0.85     30104
 left-center       0.81      0.82      0.81     14154
       right       0.89      0.91      0.90     29977
right-center       0.89      0.76      0.82      8468

    accuracy                           0.89    120000
   macro avg       0.88      0.86      0.87    120000
weighted avg       0.89      0.89      0.89    120000
```

In [0]:
# Confusion matrix of the multiclass model on the validation split (true
# labels first, predictions second).
true_idx = np.argmax(y_validate_bias_kind, axis=1)
pred_idx = np.argmax(y_pred_bias_kind_validate, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

## On test set

In [0]:
# Run the currently loaded model on the test set and keep its kind-of-bias predictions.
y_pred_bias_kind = model.predict(X_test)

In [0]:
# First five one-hot test targets, for a visual sanity check.
y_test_bias_kind[:5]

In [0]:
# First five rows of model output for the test set.
y_pred_bias_kind[:5]

In [0]:
# Confusion matrix of the multiclass model on the test set (true labels first,
# predictions second).
true_idx = np.argmax(y_test_bias_kind, axis=1)
pred_idx = np.argmax(y_pred_bias_kind, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [0]:
# Per-class precision/recall of the multiclass model on the test set.
true_idx = np.argmax(y_test_bias_kind, axis=1)
pred_idx = np.argmax(y_pred_bias_kind, axis=1)
print(classification_report(true_idx, pred_idx, target_names=labelEncoder.classes_))

```
              precision    recall  f1-score   support

       least       0.23      0.04      0.07     38296
        left       0.24      0.54      0.34     37500
 left-center       0.27      0.20      0.23     23473
       right       0.45      0.46      0.46     37500
right-center       0.03      0.01      0.01     13231

    accuracy                           0.29    150000
   macro avg       0.25      0.25      0.22    150000
weighted avg       0.28      0.29      0.25    150000
```

# Predict using Multitask model
 - task 1: biased/unbiased (binary)
 - task 2: kind of bias (multiclass)

## Load model

In [0]:
# Load the saved multitask model from MULTITASK_MODEL; this rebinds `model`.
model = load_model(MULTITASK_MODEL)

## Prediction on validation set

In [0]:
# predict() output is unpacked into two arrays: the binary head first, the
# kind-of-bias head second.
y_pred_bias_validate, y_pred_bias_kind_validate = model.predict(X_validate)

# One classification report per task, binary first.
validation_tasks = (
    (y_validate_bias, y_pred_bias_validate, ['unbiased','biased']),
    (y_validate_bias_kind, y_pred_bias_kind_validate, labelEncoder.classes_),
)
for task_true, task_pred, class_names in validation_tasks:
    print(classification_report(np.argmax(task_true, axis=1),
                                np.argmax(task_pred, axis=1),
                                target_names=class_names))

In [0]:
# Multitask model, binary task: confusion matrix on the validation split
# (true labels first, predictions second).
true_idx = np.argmax(y_validate_bias, axis=1)
pred_idx = np.argmax(y_pred_bias_validate, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [0]:
# Multitask model, kind-of-bias task: confusion matrix on the validation split
# (true labels first, predictions second).
true_idx = np.argmax(y_validate_bias_kind, axis=1)
pred_idx = np.argmax(y_pred_bias_kind_validate, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

## Prediction on test set

In [0]:
# Predict on the test set; the result is unpacked into the binary predictions
# and the kind-of-bias predictions, overwriting the earlier variables of the same names.
y_pred_bias, y_pred_bias_kind = model.predict(X_test)

In [0]:
# One classification report per task on the test set, binary first.
test_tasks = (
    (y_test_bias, y_pred_bias, ['unbiased','biased']),
    (y_test_bias_kind, y_pred_bias_kind, labelEncoder.classes_),
)
for task_true, task_pred, class_names in test_tasks:
    print(classification_report(np.argmax(task_true, axis=1),
                                np.argmax(task_pred, axis=1),
                                target_names=class_names))

In [0]:
# Multitask model, binary task: confusion matrix on the test set
# (true labels first, predictions second).
true_idx = np.argmax(y_test_bias, axis=1)
pred_idx = np.argmax(y_pred_bias, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

In [0]:
# Multitask model, kind-of-bias task: confusion matrix on the test set
# (true labels first, predictions second).
true_idx = np.argmax(y_test_bias_kind, axis=1)
pred_idx = np.argmax(y_pred_bias_kind, axis=1)
cm = confusion_matrix(true_idx, pred_idx)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')