In [3]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

from pathlib import Path


In [49]:
def load_data_file(data_file, is_test_file = False):
    """Load headlines (and labels, when available) from a newsframing TSV file.

    Parameters
    ----------
    data_file : str or pathlib.Path
        Tab-separated file containing a ``news_title`` column and,
        optionally, a ``Q3 Theme1`` label column.
    is_test_file : bool, optional
        When True, labels are never returned, even if the label
        column is present in the file.

    Returns
    -------
    tuple
        ``(X, y)``. ``X`` is the list of headlines. ``y`` is the list
        of labels (one per headline), or None when `is_test_file` is
        True or the file has no ``Q3 Theme1`` column.

    Raises
    ------
    KeyError
        If the file has no ``news_title`` column.
    """
    # Normalise first: `.name` below only exists on Path objects, so a plain
    # string path used to fail with AttributeError.
    data_file = Path(data_file)
    print("Loading from {} ...".format(data_file.name), end="")
    text_col = "news_title"
    theme1_col = "Q3 Theme1"

    with open(data_file) as f:
        df = pd.read_csv(f, sep="\t")

    X = df[text_col].tolist()

    # Labels are only exposed for non-test files that actually carry them.
    y = None
    if not is_test_file and theme1_col in df.columns:
        y = df[theme1_col].tolist()

    # Completes the progress line started above (it was printed with end="").
    print(
        "loaded {} lines {} labels ... done".format(
            len(X), "with" if y is not None else "without"
        )
    )

    return (X, y)

def load_data_file_2(data_file, is_test_file = False):
    """Load newsframing data as a two-column DataFrame.

    Reads the same columns as ``load_data_file`` but returns them as a
    DataFrame instead of a tuple of lists.

    Parameters
    ----------
    data_file : str or pathlib.Path
        Tab-separated file containing a ``news_title`` column and,
        optionally, a ``Q3 Theme1`` label column.
    is_test_file : bool, optional
        When True, labels are ignored even if the label column is
        present in the file.

    Returns
    -------
    pandas.DataFrame
        Column ``x`` holds the headlines and column ``y`` the labels.
        ``y`` is None in every row when `is_test_file` is True or the
        file has no ``Q3 Theme1`` column.

    Raises
    ------
    KeyError
        If the file has no ``news_title`` column.
    """
    # Normalise first: `.name` below only exists on Path objects, so a plain
    # string path used to fail with AttributeError.
    data_file = Path(data_file)
    print("Loading from {} ...".format(data_file.name), end="")
    text_col = "news_title"
    theme1_col = "Q3 Theme1"

    with open(data_file) as f:
        raw = pd.read_csv(f, sep="\t")

    X = raw[text_col].tolist()

    # Labels are only kept for non-test files that actually carry them.
    y = None
    if not is_test_file and theme1_col in raw.columns:
        y = raw[theme1_col].tolist()

    # Terminate the progress line started above; it was previously left
    # dangling because the opening print uses end="".
    print(
        "loaded {} lines {} labels ... done".format(
            len(X), "with" if y is not None else "without"
        )
    )

    # A scalar None for 'y' is broadcast to every row by the DataFrame constructor.
    return pd.DataFrame({'x': X, 'y': y})

In [68]:
# Data locations for the GunViolence newsframing splits.
TRAIN_FILE = Path("raw_data/GunViolence/train.tsv")
BAL_TRAIN_FILE = Path("raw_data/GunViolence/train_balanced.tsv")
DEV_FILE = Path("raw_data/GunViolence/dev.tsv")
TEST_FILE = Path("raw_data/GunViolence/test.tsv")

x_train, y_train = load_data_file(TRAIN_FILE)
x_dev, y_dev = load_data_file(DEV_FILE)
# The test split is loaded without labels. Keep its headlines under their own
# name so they are not lost when `x_test` is rebound further down.
x_test_raw, _ = load_data_file(TEST_FILE, is_test_file=True)

# Vocabulary size cap for the tokenizer; also the width of the model input.
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(x_train) # only fit on train

# Turn headlines into fixed-width matrices (one row per headline).
x_train = tokenize.texts_to_matrix(x_train)
x_dev_matrix = tokenize.texts_to_matrix(x_dev)

# Map label values to integer class ids; the encoder is fit on train only.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_dev_ids = encoder.transform(y_dev)

# One class per distinct training label (same value as max class id + 1).
num_classes = len(encoder.classes_)
print("num_classes", num_classes)
y_train = utils.to_categorical(y_train, num_classes)
y_dev_onehot = utils.to_categorical(y_dev_ids, num_classes)

# NOTE: `x_test` / `y_test` hold the *dev* split, not test.tsv (which has no
# labels here). The names are kept because the evaluation cell refers to them.
x_test = x_dev_matrix
y_test = y_dev_onehot

batch_size = 32
epochs = 15

# Build the model: one hidden ReLU layer with dropout, softmax over classes.
model = Sequential()
model.add(Dense(1024, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Not used in this cell; kept so `optimizers` stays bound for any other cell
# that relies on it.
from keras import optimizers

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# 5% of the training rows are held out by Keras for per-epoch validation.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.05)

Loading from train.tsv ...loaded 1040 lines with labels ... done
Loading from dev.tsv ...loaded 130 lines with labels ... done
Loading from test.tsv ...loaded 130 lines without labels ... done
num_classes 9
Train on 988 samples, validate on 52 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [69]:
# `x_test` / `y_test` were built from the dev split (x_dev / y_dev) in the
# training cell, so the [loss, accuracy] pair shown here is a dev-set score.
model.evaluate(x=x_test, y=y_test)



[1.112426992563101, 0.6307692527770996]