# Define Imports

In [1]:
import sys
import csv
import os
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.layers import Concatenate, Input, Dropout, Dense
from keras.models import Model

# Define Constants

In [2]:
# Sequence/vocabulary limits; not referenced by the cells visible in this
# notebook (presumably kept from a sequence-model variant -- TODO confirm).
MAX_SENT_LEN = 150
MAX_VOCAB_SIZE = 30000

# Training hyper-parameters.
BATCH_SIZE = 200        # mini-batch size passed to model.fit / model.evaluate
HIDDEN_DIM = 100        # units in the single hidden Dense layer
N_EPOCHS = 10           # passes over the training data
TEST_SPLIT_SIZE = 0.2   # fraction of the training data held out by train_test_split
FEATURE_LIMIT = 5000    # TF-IDF vocabulary size; also the width of each model input

# Stance label -> class index, numbered in the (alphabetical) order listed here.
stance_map = {
    label: index
    for index, label in enumerate(('agree', 'disagree', 'discuss', 'unrelated'))
}
# Class index -> stance label, derived so the two maps cannot drift apart.
stance_map_inv = {index: label for label, index in stance_map.items()}

# Define Methods to Extract Data

In [3]:
# Reads _bodies.csv file and creates a dictionary from Body ID -> Body Text
def get_body_dict(data_dir):
    """Load a bodies CSV into a ``{body_id: body_text}`` dictionary.

    Args:
        data_dir: Path of the CSV file to read (a file path, despite the name).
            The first row is treated as a header and skipped. In every other
            row, column 0 is the integer body ID and column 1 the body text.

    Returns:
        dict mapping ``int`` body ID to the body text string. If an ID occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: if a body ID cannot be parsed as an integer.
        IndexError: if a non-empty row has fewer than two columns.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for quoted fields that contain line breaks.
    with open(data_dir, encoding='utf_8', newline='') as tb:
        rows = list(csv.reader(tb))

    bodies_by_id = {}
    # rows[0] is the header. The rows are kept as a list so tqdm knows the total.
    for row in tqdm(rows[1:]):
        if not row:
            # csv.reader yields [] for a blank line; there is nothing to index.
            continue
        bodies_by_id[int(row[0])] = row[1]

    return bodies_by_id

# Reads _stances.csv file and returns stance, headline, body data
def get_article_data(data_dir, train_bodies_dict):
    """Load a stances CSV and pair each row with its article body text.

    Args:
        data_dir: Path of the CSV file to read (a file path, despite the name).
            The first row is treated as a header and skipped. In every other
            row, column 0 is the headline, column 1 the body ID and column 2
            the stance label.
        train_bodies_dict: Mapping of ``int`` body ID -> body text used to
            look up the body for each row.

    Returns:
        Tuple ``(stances, headlines, bodies)`` of three parallel lists, in
        that order. Stances and headlines are whitespace-stripped; bodies are
        returned exactly as stored in ``train_bodies_dict``.

    Raises:
        KeyError: if a row references a body ID missing from ``train_bodies_dict``.
        ValueError: if a body ID cannot be parsed as an integer.
        IndexError: if a non-empty row has fewer than three columns.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for quoted fields that contain line breaks.
    with open(data_dir, encoding='utf_8', newline='') as ts:
        rows = list(csv.reader(ts))

    headlines, bodies, stances = [], [], []

    # rows[0] is the header. The rows are kept as a list so tqdm knows the total.
    for row in tqdm(rows[1:]):
        if not row:
            # csv.reader yields [] for a blank line; there is nothing to index.
            continue
        body_id = int(row[1].strip())

        stances.append(row[2].strip())
        headlines.append(row[0].strip())
        bodies.append(train_bodies_dict[body_id])

    return stances, headlines, bodies

# Read in CSV Data

In [5]:
print('Reading in CSV data...')

# Training set: body lookup first, then the stance rows that reference it.
train_bodies_dict = get_body_dict(data_dir="train_bodies.csv")
(train_stances,
 train_headlines,
 train_bodies) = get_article_data(data_dir="train_stances.csv",
                                  train_bodies_dict=train_bodies_dict)

# Competition test set, loaded the same way with its own body lookup.
competition_bodies_dict = get_body_dict(data_dir="competition_test_bodies.csv")
(test_stances,
 test_headlines,
 test_bodies) = get_article_data(data_dir="competition_test_stances.csv",
                                 train_bodies_dict=competition_bodies_dict)

Reading in CSV data...


100%|██████████| 1684/1684 [00:00<00:00, 496325.48it/s]
100%|██████████| 49973/49973 [00:00<00:00, 665523.46it/s]
100%|██████████| 905/905 [00:00<00:00, 560189.66it/s]
100%|██████████| 25414/25414 [00:00<00:00, 565521.63it/s]


# Build TFIDF Vectorizer and Vectorize Documents

In [6]:
print('Initializing TFIDF Vectorizer...')
# One TF-IDF vocabulary (capped at FEATURE_LIMIT terms) is learned from the
# training headlines and bodies together; fit() returns the fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=FEATURE_LIMIT).fit(train_headlines + train_bodies)

print('Vectorizing Data...')
# Project each document collection onto that vocabulary and densify it.
# The generator is consumed left to right, so the transforms run in the
# order train headlines, train bodies, test headlines, test bodies.
x_train_headlines, x_train_bodies, x_test_headlines, x_test_bodies = (
    vectorizer.transform(documents).toarray()
    for documents in (train_headlines, train_bodies, test_headlines, test_bodies)
)

Initializing TFIDF Vectorizer...
Vectorizing Data...


# Encode Stances and Create Train/Validation Split

In [7]:
print('Encoding Stances...')
# Fit a single encoder on the training labels and reuse it for the test labels.
# Fitting a second encoder on the test labels would derive the label -> index
# mapping from whatever classes occur in the test file, so the indices would
# only match the training ones by coincidence.
label_encoder = LabelEncoder()
encoded_train_stances = label_encoder.fit_transform(train_stances)
# Transform labels to binary class matrix (one column per stance class)
y_train = np_utils.to_categorical(encoded_train_stances, num_classes=len(stance_map))
# transform() raises ValueError on a label that was never seen during fit.
encoded_test_stances = label_encoder.transform(test_stances)
y_test = np_utils.to_categorical(encoded_test_stances, num_classes=len(stance_map))

print('Creating train/test splits...')
# Hold out TEST_SPLIT_SIZE of the training data for validation. The fixed
# random_state makes the split itself repeatable between runs.
# NOTE: this overwrites x_train_headlines / x_train_bodies with their training
# portions, so re-run the vectorizing cell before re-running this one;
# otherwise the array lengths no longer match the freshly built y_train.
x_train_headlines, x_val_headlines, x_train_bodies, x_val_bodies, y_train, y_val = train_test_split(
  x_train_headlines, x_train_bodies, y_train, test_size=TEST_SPLIT_SIZE, random_state=42)

Encoding Stances...
Creating train/test splits...


# Initialize Model I/O

In [8]:
print('Building Model I/O...')
# Define model input for headlines (one TF-IDF vector of width FEATURE_LIMIT)
input_headlines = Input(shape=(FEATURE_LIMIT,), name='input_headlines')
# Define model input for bodies (same width, same vocabulary)
input_bodies = Input(shape=(FEATURE_LIMIT,), name='input_bodies')
# Concatenate the two inputs into a single feature vector
concatenated_input = Concatenate()([input_headlines, input_bodies])

# Add hidden layer
hidden = Dense(HIDDEN_DIM, activation='sigmoid', name='dense_layer')(concatenated_input)
# Add dropout layer
hidden = Dropout(rate=0.6, name='dropout_layer')(hidden)
# Add output layer: one softmax probability per stance class
out = Dense(4, activation='softmax', name='output_layer')(hidden)

model = Model(inputs=[input_headlines, input_bodies], outputs=out)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Building Model I/O...
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_headlines (InputLayer)   [(None, 5000)]       0           []                               
                                                                                                  
 input_bodies (InputLayer)      [(None, 5000)]       0           []                               
                                                                                                  
 concatenate (Concatenate)      (None, 10000)        0           ['input_headlines[0][0]',        
                                                                  'input_bodies[0][0]']           
                                                                                                  
 dense_layer (Dense)            (None, 100)          1000100     ['conca

# Compile and Fit Model

In [9]:
print('Compiling Model...')
# Multi-class setup: softmax output scored with categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
print('Fitting Model...')
# Inputs are passed in the same order as the model's input list
# (headlines first, then bodies). The held-out split is scored after each epoch.
model.fit(
    x=[x_train_headlines, x_train_bodies],
    y=y_train,
    validation_data=([x_val_headlines, x_val_bodies], y_val),
    epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
)

Compiling Model...
Fitting Model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fada8acd810>

# Evaluate Model

In [10]:
print('Evaluating Model...')
# Score the competition test set. The displayed list is [loss, accuracy],
# matching the loss and the single metric given to model.compile.
model.evaluate(
    x=[x_test_headlines, x_test_bodies],
    y=y_test,
    batch_size=BATCH_SIZE,
)

Evaluating Model...


[0.8236998915672302, 0.7229371070861816]