In [0]:
# Mount Google Drive inside the Colab VM so the dataset directory is reachable as a
# regular path under /content/drive.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [0]:
# Directory on the mounted Drive holding the article CSVs and the GloVe vectors read later on.
DATASET_PATH = "/content/drive/My Drive/ire-proj/processedData"
# List the directory as a quick check that the mount and the path are valid.
!ls "$DATASET_PATH"

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, \
                    GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder 
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import classification_report

In [0]:
# Row cap for the training CSV (passed as ``nrows`` when reading it); None reads every row.
N_TRAINING_SAMPLES = None

# The held-out CSV is capped at half the training cap, and left uncapped when no cap is set.
if N_TRAINING_SAMPLES is None:
    N_TEST_SAMPLES = None
else:
    N_TEST_SAMPLES = N_TRAINING_SAMPLES // 2

In [0]:
def load_embeddings(word_index, embeddingsfile):
    """Build a non-trainable Keras ``Embedding`` layer from a text file of word vectors.

    Every non-empty line of ``embeddingsfile`` is parsed as a word followed by its
    space-separated vector components.

    Args:
        word_index: Mapping of word -> integer index; row ``i`` of the weight matrix
            receives the vector of the word whose index is ``i``.
        embeddingsfile: Path of the UTF-8 encoded vectors file.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows of width
        ``EMBEDDING_DIM`` and input length ``MAX_SEQUENCE_LENGTH`` (both module-level
        constants), initialised from the file and frozen (``trainable=False``).
    """
    embeddings_index = {}
    # The context manager closes the file even when a malformed line raises mid-parse.
    with open(embeddingsfile, 'r', encoding='utf8') as f:
        for line in f:
            # Drop the trailing newline before splitting so the last component is clean.
            values = line.rstrip().split(' ')
            if len(values) < 2:
                # Blank line or a word without a vector: nothing to store.
                continue
            word = values[0]  # each line starts with the word
            coefs = np.asarray(values[1:], dtype='float32')  # the rest of the line is the vector
            embeddings_index[word] = coefs

    print(f'Found {len(embeddings_index)} word vectors.')

    # Row 0 is never assigned here; rows of words missing from the file also stay all-zeros.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer

In [0]:
MAX_NB_WORDS = 50000    # vocabulary cap handed to the Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 1500  # number of tokens each article is padded to by pad_sequences
EMBEDDING_DIM = 300 #dimensionality of the embedding vector (50, 100, 200, 300)
# The backslash is spelled ``\\`` so the literal contains a real backslash without relying
# on the unrecognized escape ``\]``, which newer Python versions warn about. The runtime
# value of the filter string is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

# File the fitted tokenizer is pickled to by tokenize_trainingdata and reloaded from by
# tokenize_testdata.
TOKENIZER_DUMP_FILE='tokenizer.p'

def tokenize_trainingdata(X):
    """Fit the module-level tokenizer on ``X`` and encode ``X`` as padded id sequences.

    The fitted tokenizer is also pickled to ``TOKENIZER_DUMP_FILE``.

    Args:
        X: Collection of text documents.

    Returns:
        Tuple ``(padded, word_index)``: the sequences padded to ``MAX_SEQUENCE_LENGTH``
        and the tokenizer's word -> index mapping.
    """
    tokenizer.fit_on_texts(X)

    # Persist the fitted vocabulary so it can be reloaded for other data later.
    with open(TOKENIZER_DUMP_FILE, 'wb') as dump_file:
        pickle.dump(tokenizer, dump_file)

    encoded = tokenizer.texts_to_sequences(X)
    vocabulary = tokenizer.word_index
    print(f'Found {len(vocabulary)} unique tokens.')

    return pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH), vocabulary

def tokenize_testdata(X):
    """Encode ``X`` as padded id sequences using the tokenizer stored in ``TOKENIZER_DUMP_FILE``.

    Args:
        X: Collection of text documents.

    Returns:
        The encoded sequences padded to ``MAX_SEQUENCE_LENGTH``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load a dump this notebook wrote.
    with open(TOKENIZER_DUMP_FILE, 'rb') as dump_file:
        fitted_tokenizer = pickle.load(dump_file)

    print(f'Found {len(fitted_tokenizer.word_index)} unique tokens.')

    return pad_sequences(fitted_tokenizer.texts_to_sequences(X),
                         maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
def reverse_to_categorical(y, limit=5):
    """Convert one-hot (or per-class score) rows back to integer class ids.

    Args:
        y: 2-D array with one row per sample and one column per class.
        limit: Number of leading rows to decode. The default of 5 keeps the original
            preview-sized behaviour; pass ``None`` to decode every row.

    Returns:
        1-D ``numpy`` array with the argmax column index of each decoded row.
    """
    # ``y[:None]`` is the whole array, so one slice serves both the preview and the full case.
    return np.argmax(y[:limit], axis=1)

# Load datasets

In [0]:
# Read the training split; the CSV is given explicit column names, and titles are forced to
# text.
df = pd.read_csv(
    f'{DATASET_PATH}/articles-training-bypublisher.csv',
    names=['article_id', 'title', 'articleContent', 'bias', 'hyperpartisan'],
    dtype={'title': str},
    nrows=N_TRAINING_SAMPLES,
)

# Missing titles become a single space so the cleaning step always receives a string.
df['title'] = df['title'].fillna(' ')
df.count()

In [0]:
def perform_cleaning(text):
    """Normalise a document for tokenization.

    Lower-cases and trims ``text``, collapses whitespace runs to single spaces, then
    replaces every run of characters outside ``[A-Za-z0-9]`` with one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (it may start or end with a space where punctuation was removed).
    """
    lowered = text.lower().strip()
    single_spaced = ' '.join(lowered.split())
    return re.sub('[^A-Za-z0-9]+', ' ', single_spaced)

# Clean both text columns of the training frame in place (overwrites the raw text).
df['title'] = df['title'].map(perform_cleaning)
df['articleContent'] = df['articleContent'].map(perform_cleaning)

In [0]:
# Preview the first rows of the cleaned training frame.
df.head()
# df.tail(5)

In [0]:
# Load the held-out split with the same column names as the training frame.
df_test = pd.read_csv(filepath_or_buffer=DATASET_PATH + '/articles-validation-bypublisher.csv',
                 names=['article_id', 'title', 'articleContent', 'bias', 'hyperpartisan'],
                 # Parse titles as text, matching the training read: without this, titles that
                 # look numeric can be inferred as numbers and break the string cleaning step.
                 dtype={'title': str},
                 nrows=N_TEST_SAMPLES
                 )
# Missing titles become a single space so the cleaning step always receives a string.
df_test['title'] = df_test['title'].fillna(value=' ')
df_test.count()

In [0]:
# Apply the same cleaning to the held-out frame as to the training frame, then preview
# its last rows.
df_test['title'] = df_test['title'].map(perform_cleaning)
df_test['articleContent'] = df_test['articleContent'].map(perform_cleaning)
df_test.tail(5)

In [0]:
# Class balance of the binary label in the training frame and in the held-out frame.
print(df['hyperpartisan'].value_counts())
print(df_test['hyperpartisan'].value_counts())

# Define CNN model

In [0]:
# def baseline_model(sequence_input, embedded_sequences, classes=2):
#     x = Conv1D(64, 5, activation='relu')(embedded_sequences)
#     x = MaxPooling1D(5)(x)
#     x = Conv1D(128, 3, activation='relu')(x)
#     x = MaxPooling1D(5)(x)
#     x = Conv1D(256, 2, activation='relu')(x)
#     x = GlobalAveragePooling1D()(x)
#     x = Dense(2048, activation='relu')(x)
#     x = Dropout(0.5)(x)
#     x = Dense(512, activation='relu')(x)
#     x = Dropout(0.5)(x)
#     preds = Dense(classes, activation='softmax')(x)

#     model = Model(sequence_input, preds)
#     return model

In [0]:
# #put embedding layer into input of the model
# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer(sequence_input)

# model = baseline_model(sequence_input, embedded_sequences, classes=2)

# model.compile(loss='categorical_crossentropy',
#               optimizer='adamax',
#               metrics=['acc'])

# print(model.summary())

In [0]:
# model.fit(X_train, y_train,
#           validation_data=(X_validate, y_validate),
#           epochs=25, batch_size=64)

In [0]:
# y_pred = model.predict(X_test)

In [0]:
# y_pred = np.array([[1,0] if unbiased > biased else [0,1] for unbiased, biased in y_pred], dtype='float32')
# y_pred[:5]

In [0]:
# accuracy_score(y_pred, y_test)

In [0]:
# model.save('cnn.h5')

# Define LSTM model

In [0]:
def LSTM_model(sequence_input, embedded_sequences, classes=2):
    """Build the stacked-LSTM classifier on top of an embedded input.

    Three ``CuDNNLSTM`` layers (32, 64, 128 units) are followed by two ReLU dense layers
    (4096, 1024 units) and a softmax output layer.

    Args:
        sequence_input: Input tensor the returned model is built from.
        embedded_sequences: Tensor fed into the first recurrent layer.
        classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``Model`` mapping ``sequence_input`` to the softmax output.
    """
    # Recurrent trunk: the first two layers return full sequences for the next LSTM to consume.
    hidden = CuDNNLSTM(32, return_sequences=True)(embedded_sequences)
    hidden = CuDNNLSTM(64, return_sequences=True)(hidden)
    hidden = CuDNNLSTM(128)(hidden)

    # Fully connected head.
    for units in (4096, 1024):
        hidden = Dense(units, activation='relu')(hidden)

    return Model(sequence_input, Dense(classes, activation='softmax')(hidden))

# Binary classifier (Biased / Unbiased)

## Separate labels from features

In [0]:
# Training split: article text plus both label columns.
X = df['articleContent'].values
y_bias = df['hyperpartisan'].values
y_bias_kind = df['bias'].values

# Held-out split, same three columns.
X_test = df_test['articleContent'].values
y_test_bias = df_test['hyperpartisan'].values
y_test_bias_kind = df_test['bias'].values

# Class counts are derived from the distinct label values seen in the training split.
NUM_CLASSES_BIAS = np.unique(y_bias).size
NUM_CLASSES_BIAS_KIND = np.unique(y_bias_kind).size

In [0]:
# Peek at the first raw labels of both tasks.
print(y_bias[:5])
print(y_bias_kind[:5])

## Tokenize data

In [0]:
# Fit the tokenizer on the training articles and replace X with padded integer sequences.
# NOTE: X and y_bias are overwritten in place, so this cell must not be re-run without first
# re-running the cell that extracts them from the data frames.
X, word_index = tokenize_trainingdata(X)
# One-hot encode the binary labels.
y_bias = to_categorical(y_bias, num_classes=NUM_CLASSES_BIAS)

# The test articles are encoded with the tokenizer that tokenize_trainingdata pickled.
X_test = tokenize_testdata(X_test)
y_test_bias = to_categorical(y_test_bias, num_classes=NUM_CLASSES_BIAS)

In [0]:
# Show the first one-hot rows and the class ids recovered from them.
print(y_bias[:5])
print(reverse_to_categorical(y_bias[:5]))

In [0]:
# Hold out 20% of the training data for validation. The fixed random_state makes the split
# repeatable; the multiclass split later in the notebook uses the same X and random_state.
X_train, X_validate, y_train_bias, y_validate_bias = train_test_split(X, y_bias,
                                                            test_size=0.2,
                                                            random_state=12)

In [0]:
# Build the frozen embedding layer from the GloVe file whose dimensionality matches
# EMBEDDING_DIM.
embedding_layer = load_embeddings(word_index, 
                                  f'{DATASET_PATH}/glove.6B.{EMBEDDING_DIM}d.txt')

## Compile model

In [0]:
# Feed integer token sequences through the shared embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Binary (biased / unbiased) classifier.
model = LSTM_model(sequence_input, embedded_sequences, classes=NUM_CLASSES_BIAS)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in print() only
# appended a stray "None" line to the output.
model.summary()

## Fit model

In [0]:
# Train the binary classifier, passing the validation split as validation_data.
model.fit(X_train, y_train_bias,
          validation_data=(X_validate, y_validate_bias),
          epochs=25,
          batch_size=250)

# Per-class report on the validation split; argmax turns one-hot rows and softmax outputs
# back into class ids.
# NOTE(review): assumes class id 0 is "unbiased" and 1 is "biased" - confirm against the CSV.
y_pred_bias_validate = model.predict(X_validate)
print(classification_report(np.argmax(y_validate_bias, axis=1),
                            np.argmax(y_pred_bias_validate, axis=1),
                            target_names=['unbiased','biased']))

## Predict

In [0]:
# Predict on the held-out test set and compare the first true rows with the first predictions.
y_pred_bias = model.predict(X_test)
print(y_test_bias[:5])
print(y_pred_bias[:5])

In [0]:
# Per-class report of the binary classifier on the test set.
# NOTE(review): assumes class id 0 is "unbiased" and 1 is "biased" - confirm against the CSV.
print(classification_report(np.argmax(y_test_bias, axis=1),
                            np.argmax(y_pred_bias, axis=1),
                            target_names=['unbiased','biased']))

In [0]:
# Confusion matrix of the binary classifier on the test set, drawn as an annotated heatmap
# (true labels are passed first, predictions second).
ax= plt.subplot()
cm = confusion_matrix(np.argmax(y_test_bias, axis=1), np.argmax(y_pred_bias, axis=1))
sns.heatmap(cm, annot=True, ax=ax, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

## Save model

In [0]:
# Persist the trained binary classifier to the working directory.
model.save('lstm_binary.h5')

# Multiclass classifier (Kind of bias classifier)

## Separate labels from features

In [0]:
# Peek at the first raw bias-kind labels of the training and test splits.
print(y_bias_kind[:5])
print(y_test_bias_kind[:5])

## Encode labels

In [0]:
# Fit the encoder on the distinct bias-kind labels of the training split, then display the
# classes it learned.
labelEncoder = LabelEncoder()
labelEncoder.fit(np.unique(y_bias_kind))
labelEncoder.classes_

In [0]:
# Replace the bias-kind labels of both splits with their integer codes.
# NOTE: the arrays are overwritten in place, so this cell is not safe to run twice.
y_bias_kind=labelEncoder.transform(y_bias_kind)
y_test_bias_kind=labelEncoder.transform(y_test_bias_kind)

print(y_bias_kind[:5])
print(y_test_bias_kind[:5])

In [0]:
# Inverse transform: map the integer codes back to the original labels.
labelEncoder.inverse_transform(y_bias_kind)

In [0]:
# One-hot encode the integer bias-kind codes of both splits (overwrites the integer arrays).
y_bias_kind = to_categorical(y_bias_kind, num_classes=NUM_CLASSES_BIAS_KIND)
y_test_bias_kind = to_categorical(y_test_bias_kind, num_classes=NUM_CLASSES_BIAS_KIND)

In [0]:
# First one-hot encoded bias-kind rows.
y_bias_kind[:5]

In [0]:
# Recover integer class ids from the one-hot rows (the inverse of to_categorical).
print(reverse_to_categorical(y_bias_kind))

## Split into train and validate sets

In [0]:
# Same X, test_size and random_state as the binary split, this time paired with the
# bias-kind labels.
X_train, X_validate, y_train_bias_kind, y_validate_bias_kind = train_test_split(X,
                                                            y_bias_kind,
                                                            test_size=0.2,
                                                            random_state=12)

## Compile model

In [0]:
# Feed integer token sequences through the shared embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Multiclass (kind of bias) classifier.
model = LSTM_model(sequence_input, embedded_sequences, classes=NUM_CLASSES_BIAS_KIND)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in print() only
# appended a stray "None" line to the output.
model.summary()

## Fit model

In [0]:
# Train the multiclass classifier, passing the validation split as validation_data.
model.fit(X_train, y_train_bias_kind,
          validation_data=(X_validate, y_validate_bias_kind),
          epochs=25,
          batch_size=64)

In [0]:
# Score the multiclass model on the validation split.
y_pred_bias_kind_validate = model.predict(X_validate)
# Class id k corresponds to labelEncoder.classes_[k], so the encoder's class list is the
# index-ordered set of names the report needs. Decoding the labels of the first few training
# samples instead yields names in arbitrary order, possibly with repeats.
print(classification_report(np.argmax(y_validate_bias_kind, axis=1),
                            np.argmax(y_pred_bias_kind_validate, axis=1),
                            target_names=list(labelEncoder.classes_)))

## Predict

In [0]:
# Predict bias kinds for the held-out test set.
y_pred_bias_kind = model.predict(X_test)

In [0]:
# First true one-hot rows of the test set.
y_test_bias_kind[:5]

In [0]:
# First predicted rows for the test set, for comparison with the true rows.
y_pred_bias_kind[:5]

In [0]:
# Confusion matrix of the multiclass classifier on the test set, drawn as an annotated
# heatmap (true labels are passed first, predictions second).
ax= plt.subplot()
cm = confusion_matrix(np.argmax(y_test_bias_kind, axis=1), np.argmax(y_pred_bias_kind, axis=1))
sns.heatmap(cm, annot=True, ax=ax, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True') 

In [0]:
# Per-class report of the multiclass classifier on the test set.
# Class id k corresponds to labelEncoder.classes_[k], so the encoder's class list gives the
# report names in the right order; decoding the first few training labels does not.
print(classification_report(np.argmax(y_test_bias_kind, axis=1),
                            np.argmax(y_pred_bias_kind, axis=1),
                            target_names=list(labelEncoder.classes_)))

## Save model

In [0]:
# Persist the trained multiclass classifier to the working directory.
model.save('lstm_multiclass.h5')

# Multitask learning 
 - task 1: biased/unbiased (binary)
 - task 2: kind of bias (multiclass)

In [0]:
# Shared trunk: frozen embedding -> three stacked CuDNN LSTMs -> two dense layers.
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
embed_layer = embedding_layer(input_layer)

x = CuDNNLSTM(32, return_sequences=True)(embed_layer)
x = CuDNNLSTM(64, return_sequences=True)(x)
x = CuDNNLSTM(128)(x)
x = Dense(4096, activation='relu')(x)
x = Dense(1024, activation='relu')(x)

# Head widths come from the class counts derived from the data, matching how the
# single-task models are built, instead of hard-coded 2 and 5.
# task 1: biased / unbiased
output_bias = Dense(NUM_CLASSES_BIAS, activation='softmax')(x)

# task 2: kind of bias
output_bias_kind = Dense(NUM_CLASSES_BIAS_KIND, activation='softmax')(x)

model = Model(input_layer, [output_bias, output_bias_kind])

model.compile(loss='categorical_crossentropy', 
              optimizer='adamax', 
              metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in print() only
# appended a stray "None" line to the output.
model.summary()

## Fit model

In [0]:
# Train both heads jointly. y_train_bias and y_train_bias_kind come from two separate
# train_test_split calls; they line up row-for-row with X_train only because both calls used
# the same X, test_size and random_state.
model.fit(X_train, [y_train_bias, y_train_bias_kind] ,
          validation_data=(X_validate, [y_validate_bias, y_validate_bias_kind]),
          epochs=50,
          batch_size=64)

In [0]:
# The multitask model returns one prediction array per head, in the order of its outputs.
y_pred_bias_validate, y_pred_bias_kind_validate = model.predict(X_validate)

# Task 1 report on the validation split.
# NOTE(review): assumes class id 0 is "unbiased" and 1 is "biased" - confirm against the CSV.
print(classification_report(np.argmax(y_validate_bias, axis=1),
                            np.argmax(y_pred_bias_validate, axis=1),
                            target_names=['unbiased','biased']))

# Task 2 report. Class id k corresponds to labelEncoder.classes_[k], so the encoder's class
# list gives the names in the right order; decoding the first few training labels does not.
print(classification_report(np.argmax(y_validate_bias_kind, axis=1),
                            np.argmax(y_pred_bias_kind_validate, axis=1),
                            target_names=list(labelEncoder.classes_)))

## Predict

In [0]:
# Predict both tasks for the held-out test set (one array per output head).
y_pred_bias, y_pred_bias_kind = model.predict(X_test)

In [0]:
# Task 1 report on the test set.
# NOTE(review): assumes class id 0 is "unbiased" and 1 is "biased" - confirm against the CSV.
print(classification_report(np.argmax(y_test_bias, axis=1),
                            np.argmax(y_pred_bias, axis=1),
                            target_names=['unbiased','biased']))

# Task 2 report. Class id k corresponds to labelEncoder.classes_[k], so the encoder's class
# list gives the names in the right order; decoding the first few training labels does not.
print(classification_report(np.argmax(y_test_bias_kind, axis=1),
                            np.argmax(y_pred_bias_kind, axis=1),
                            target_names=list(labelEncoder.classes_)))

In [0]:
# Confusion matrix of the multitask model's binary head on the test set, drawn as an
# annotated heatmap (true labels are passed first, predictions second).
ax= plt.subplot()
cm = confusion_matrix(np.argmax(y_test_bias, axis=1), np.argmax(y_pred_bias, axis=1))
sns.heatmap(cm, annot=True, ax=ax, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True') 


In [0]:
# Confusion matrix of the multitask model's bias-kind head on the test set, drawn as an
# annotated heatmap (true labels are passed first, predictions second).
ax= plt.subplot()
cm = confusion_matrix(np.argmax(y_test_bias_kind, axis=1), np.argmax(y_pred_bias_kind, axis=1))
sns.heatmap(cm, annot=True, ax=ax, fmt='g')
ax.set_xlabel('Predicted')
ax.set_ylabel('True') 

## Save model

In [0]:
# Persist the trained multitask model to the working directory.
model.save('lstm_multitask.h5')