In [2]:
# Mount Google Drive inside the Colab VM so DATASET_PATH (under "My Drive") is readable.
# force_remount=True asks for a fresh mount even when one is already present.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that holds the preprocessed article CSVs and the GloVe vectors.
DATASET_PATH = "/content/drive/My Drive/ire-proj/processedData"
# List the folder to confirm the expected files are present before loading anything.
!ls "$DATASET_PATH"

articles-training-byarticle.csv    glove.6B.300d.txt
articles-training-bypublisher.csv


In [0]:
import pandas as pd
import numpy as np

In [5]:
# Load the by-article training set; the CSV carries no header row, so the
# column names are supplied explicitly.
df = pd.read_csv(
    DATASET_PATH + '/articles-training-byarticle.csv',
    names=['article_id', 'title', 'articleContent', 'hyperpartisan'],
)
df.head()

Unnamed: 0,article_id,title,articleContent,hyperpartisan
0,0,Kucinich: Reclaiming the money power,From flickr.com: Money {MID-161793} Money ( I...,True
1,1,Trump Just Woke Up & Viciously Attacked Puerto...,Donald Trump ran on many braggadocios and lar...,True
2,2,"Liberals wailing about gun control, but what a...",Photo By Justin Sullivan/Getty Images In resp...,True
3,3,Laremy Tunsil joins NFL players in kneeling du...,After Colin Kaepernick rightly chose to kneel ...,True
4,4,It's 1968 All Over Again,"Almost a half-century ago, in 1968, the United...",False


In [6]:
# Peek at the last rows to confirm the file was read through to the end.
df.tail()

Unnamed: 0,article_id,title,articleContent,hyperpartisan
640,640,Trump Turns his Back on American Workers,Donald Trump. Photo from whitehouse.gov. MADI...,True
641,641,"Cummins: Rescinding DACA ‘discriminatory, harm...",President Donald Trump on Tuesday began disman...,False
642,642,"Trump travel ban can be enforced, says US Supr...",The US Supreme Court has ruled that Donald Tru...,False
643,643,VIDEO- AG SESSIONS: Comey Went Rogue In Hillar...,"Ex-FBI Director James Comey went rogue, accord...",False
644,644,Hollywood Actors Who Condemn Trump but Were Si...,Ashley Judd is the absolute worst. I want to l...,True


In [7]:
# Class balance as a tuple: (hyperpartisan articles, non-hyperpartisan articles).
len(df[df.hyperpartisan==True]), len(df[df.hyperpartisan==False])

(238, 407)

In [8]:
# Model input is the raw article text; the target is the hyperpartisan flag.
X = df['articleContent'].values
y = df['hyperpartisan'].values
# Show the first article as a sanity check on the text column.
X[:1]

array([' From flickr.com: Money {MID-161793} Money ( Image by 401(K) 2013 ) Permission Details DMCA No Pill Can Stop Tinnitus, But This 1 Weird Trick Can The walls are closing in on Congress. Terrifying walls of water from Hurricanes Harvey and Irma, which, when the damage is totaled, could rise to a half trillion dollars. The Walls of War: The multi-trillion dollar ongoing cost of Afghanistan, Iraq and other interventions. The crumbling walls of the U.S. infrastructure, which need at least $3 trillion to be repaired or replaced. A wall of 11 million undocumented immigrants, whose deportation could easily cost $200 billion. The planned wall at the Mexican border, which some estimates place at $67 billion. Then there is the Wall of All, the $20 trillion national debt. The walls of debt are closing in. At moments of crisis in our nation, in addition to invoking the assistance of Higher powers, we can call upon the Constitution for guidance. Article I, Section 8, of the U.S. Constitution 

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
import pickle

MAX_NB_WORDS = 50000  # dictionary size
MAX_SEQUENCE_LENGTH = 1500  # max word length of each individual article
EMBEDDING_DIM = 300  # dimensionality of the embedding vector (50, 100, 200, 300)
# Punctuation characters to strip before tokenising. The backslash is escaped
# explicitly: the previous '\]' was an invalid escape sequence (a warning on
# newer Python versions). The runtime value of the string is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    """Fit the module-level ``tokenizer`` on ``texts`` and encode texts and labels.

    Side effects: updates the global ``tokenizer`` in place, pickles it to
    ``tokenizer.p`` in the current working directory, and prints the
    vocabulary size.

    Args:
        texts: Collection of raw article strings.
        labels: 1-D collection of class labels; the number of distinct values
            determines the width of the one-hot encoding.

    Returns:
        A ``(data, labels, word_index)`` tuple: the texts as integer sequences
        passed through ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``,
        the labels one-hot encoded by ``to_categorical``, and the tokenizer's
        word -> index mapping.
    """
    tokenizer.fit_on_texts(texts)
    # Persist the fitted tokenizer; the context manager guarantees the file is
    # flushed and closed (the bare open() used before leaked the handle).
    with open('tokenizer.p', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens.')

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(labels, num_classes=len(set(labels)))

    return data, labels, word_index

# Read the raw columns straight from `df` instead of from X / y: this cell
# rebinds X and y to the encoded arrays, so using them as inputs made a second
# run feed padded integer sequences back into the tokenizer.
X, y, word_index = tokenize_trainingdata(df.articleContent.values,
                                         df.hyperpartisan.values)


Using TensorFlow backend.


Found 22423 unique tokens.


In [0]:
# Sequential (unshuffled) split in file order: first 90% train, next 5% test,
# final 5% validation.
n_samples = len(X)
train_end = int(n_samples * 0.9)
test_end = int(n_samples * 0.95)

X_train, y_train = X[:train_end], y[:train_end]
X_test, y_test = X[train_end:test_end], y[train_end:test_end]
X_validate, y_validate = X[test_end:], y[test_end:]

In [11]:
# One-hot label layout: [0, 1] => True (hyperpartisan / biased); [1, 0] => False (unbiased).
y[:5]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [12]:
def load_embeddings(word_index, embeddingsfile):
    """Build a frozen Keras ``Embedding`` layer from a text file of word vectors.

    Each non-empty line of ``embeddingsfile`` is parsed as a word followed by
    its space-separated float components.

    Args:
        word_index: Mapping of word -> integer row index.
        embeddingsfile: Path to the UTF-8 encoded vector file.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows and
        ``EMBEDDING_DIM`` columns, initialised from the file, with
        ``input_length=MAX_SEQUENCE_LENGTH`` and ``trainable=False``.
    """
    embeddings_index = {}
    # The context manager closes the file even if a malformed line raises
    # part-way through parsing.
    with open(embeddingsfile, 'r', encoding='utf8') as f:
        for line in f:
            # Drop the trailing newline so it does not end up inside the last
            # component, then split "word v1 v2 ... vN" on single spaces.
            values = line.rstrip().split(' ')
            if len(values) < 2:
                # Blank or vector-less line: nothing to store.
                continue
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    print(f'Found {len(embeddings_index)} word vectors.')

    # One extra row beyond the vocabulary size; presumably the word indices
    # start at 1, leaving row 0 for padding -- TODO confirm against the tokenizer.
    # Rows for words missing from the embeddings file stay all-zeros.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer

# Build the embedding layer from the GloVe file whose dimensionality matches EMBEDDING_DIM.
embedding_layer = load_embeddings(
    word_index=word_index,
    embeddingsfile=f'{DATASET_PATH}/glove.6B.{EMBEDDING_DIM}d.txt',
)

Found 400000 word vectors.



In [0]:
from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, \
                    GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

def baseline_model(sequence_input, embedded_sequences, classes=2):
    """Assemble the 1-D convolutional baseline classifier.

    Three Conv1D layers (the first two each followed by max pooling) feed a
    global average pool, then two dropout-regularised dense layers and a
    softmax output.

    Args:
        sequence_input: Keras input tensor the returned model starts from.
        embedded_sequences: Output of the embedding layer applied to
            ``sequence_input``.
        classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``Model`` mapping ``sequence_input`` to class probabilities.
    """
    # Layers are instantiated in the order they are applied.
    hidden_layers = [
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(5),
        Conv1D(256, 2, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(2048, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
    ]

    hidden = embedded_sequences
    for layer in hidden_layers:
        hidden = layer(hidden)

    preds = Dense(classes, activation='softmax')(hidden)
    return Model(sequence_input, preds)


def LSTM_model(sequence_input, embedded_sequences, classes=2):
    """Assemble the stacked-LSTM classifier.

    Three CuDNNLSTM layers (32, 64 and 128 units; only the last one collapses
    the sequence) are followed by two ReLU dense layers and a softmax output.
    NOTE(review): CuDNNLSTM is the cuDNN-backed layer, so this presumably needs
    a GPU runtime -- confirm before running on CPU.

    Args:
        sequence_input: Keras input tensor the returned model starts from.
        embedded_sequences: Output of the embedding layer applied to
            ``sequence_input``.
        classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``Model`` mapping ``sequence_input`` to class probabilities.
    """
    recurrent_stack = (
        CuDNNLSTM(32, return_sequences=True),
        CuDNNLSTM(64, return_sequences=True),
        CuDNNLSTM(128),
    )
    dense_stack = (
        Dense(4096, activation='relu'),
        Dense(1024, activation='relu'),
    )

    hidden = embedded_sequences
    for layer in recurrent_stack + dense_stack:
        hidden = layer(hidden)

    preds = Dense(classes, activation='softmax')(hidden)
    return Model(sequence_input, preds)

In [14]:
# Feed integer word-index sequences through the frozen pre-trained embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Convolutional baseline classifier.
model = baseline_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

model.fit(X_train, y_train,
          validation_data=(X_validate, y_validate),
          epochs=25, batch_size=64)











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1500, 300)         6727200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1496, 64)          96064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 299, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 297, 128)          24704     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 59, 128)           0        

<keras.callbacks.History at 0x7f161e91c3c8>

In [15]:
# Feed integer word-index sequences through the frozen pre-trained embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Stacked-LSTM classifier; note this rebinds `model`, replacing any model
# assigned to that name earlier.
model = LSTM_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adamax',
              metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

model.fit(X_train, y_train,
          validation_data=(X_validate, y_validate),
          epochs=25, batch_size=64)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1500, 300)         6727200   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 1500, 32)          42752     
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 1500, 64)          25088     
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 128)               99328     
_________________________________________________________________
dense_4 (Dense)              (None, 4096)              528384    
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              4195

<keras.callbacks.History at 0x7f161e8fd128>

In [0]:
# Predict on the held-out test split. `model` is whichever model was assigned
# last; run top to bottom, that is the LSTM model, so the convolutional
# baseline is not evaluated here.
y_pred = model.predict(X_test)