In [1]:
import pandas as pd
import gensim

# Root data directory and its "interim" sub-directory, both relative to the notebook.
DATA_PATH = '../data/'
DATA_INTERIM_PATH = '{}interim/'.format(DATA_PATH)

# Read the pre-processed train / validation splits in one pass.
train, val = (
    pd.read_csv('{}{}_p.csv'.format(DATA_INTERIM_PATH, split_name))
    for split_name in ('train', 'val')
)

In [2]:
import sys
sys.path.append('../src')

%load_ext autoreload
%autoreload 1

from models.feature_spaces import create_tfidf, create_avg_word_embeddings
%aimport models.feature_spaces

from datatasks.sample_data import sample_data

from datatasks.new_preprocess import tokenize
%aimport datatasks.new_preprocess

from models.pipeline import make_features_pipeline
%aimport models.pipeline

from models.models import run_models
%aimport models.models

from models.plot import plot_LSA

import matplotlib.pyplot as plt
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [14]:
# Draw the working subsets that the cells below operate on.
# NOTE(review): sample_data is a project helper; 50000 / 10000 are presumably the
# requested row counts and 'train' / 'val' a split label — confirm against its source.
train_s = sample_data(train, 50000, 'train')
val_s = sample_data(val, 10000, 'val')

In [4]:
# Sanity-check the size of the sampled training frame.
# NOTE(review): the stored output of this cell comes from an earlier run (execution
# count 4, before the sampling cell was re-run as count 14); re-run to refresh it.
train_s.shape

(997, 16)

In [15]:
# Run the project tokenize helper over both samples; later cells read a "tokens"
# column from train_s, which is presumably added here — confirm.
# NOTE(review): the frames are reassigned to their own names, so re-running this
# cell feeds already-tokenized frames back into tokenize().
train_s = tokenize(train_s)
val_s = tokenize(val_s)

In [6]:
# Pre-trained GoogleNews word2vec binary. The location is derived from the shared
# DATA_PATH root instead of repeating the '../data/' prefix in a second place.
word2vec_path = DATA_PATH + 'external/GoogleNews-vectors-negative300.bin.gz'

# Transformer built by the project helper from the word2vec file.
transformer = create_avg_word_embeddings(word2vec_path)

In [7]:
# TF-IDF vectorizer from the project helper.
# NOTE(review): no later cell visible here references tfidf_vectorizer — the
# feature pipeline is built from `transformer` instead; confirm whether this is needed.
tfidf_vectorizer = create_tfidf()

In [16]:
# Feature pipeline wrapping the word-embedding transformer.
# NOTE(review): 'tokens' is presumably the column the pipeline reads — confirm
# against make_features_pipeline.
feats = make_features_pipeline(transformer, 'tokens')

In [17]:
# Name of the label column that is separated from the features.
TARGET_COLUMN = 'hyperpartisan'


def _features_and_target(frame):
    """Return (frame without the label column, label column) for one split."""
    return frame.drop(TARGET_COLUMN, axis=1), frame[TARGET_COLUMN]


X_train, y_train = _features_and_target(train_s)
X_test, y_test = _features_and_target(val_s)

In [18]:
# Fit and score every model named in model_list on the feature pipeline; the
# helper prints a classification report and returns the best model, its type
# and its predictions (per the unpacking below).
# NOTE(review): `feats` is built from the averaged word2vec `transformer`, not
# from the TF-IDF vectorizer, so the `best_tfidf_*` names look misleading —
# confirm and rename together with any later users.
model_list = ['lr']
best_tfidf_model, best_tfidf_model_type, best_tfidf_model_predictions = run_models(feats, model_list, X_train, X_test, y_train, y_test)

(49997, 16) (49997,)
Logistic Regression
             precision    recall  f1-score   support

      False       0.72      0.43      0.54      4999
       True       0.59      0.84      0.70      5000

avg / total       0.66      0.63      0.62      9999

Accuracy: 0.6330
Best model is Logistic Regression with an accuracy score of 0.6330


In [19]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Per-article token lists of the training sample.
token_lists = train_s["tokens"]

# Every token occurrence, the token count of each article, and the sorted
# set of distinct tokens.
all_words = [token for row in token_lists for token in row]
sentence_lengths = list(map(len, token_lists))
VOCAB = sorted(set(all_words))

Using TensorFlow backend.


In [23]:
import gensim

# Load the pre-trained word2vec vectors (binary format) as gensim KeyedVectors.
# NOTE(review): create_avg_word_embeddings(word2vec_path) was given the same file
# earlier and may already load it — if so the file is read twice; confirm.
vectors = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [32]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

# Length every token-id sequence is padded / truncated to before entering the CNN.
MAX_SEQUENCE_LENGTH = 35
# Width of one embedding vector (the word2vec file name indicates 300 dimensions).
EMBEDDING_DIM = 300
# Fraction of the rows held out for validation.
VALIDATION_SPLIT=.2

def create_cnn_data(df, vectors, seed=None):
    """Build padded index sequences, one-hot labels and an embedding matrix.

    Args:
        df: DataFrame with the columns "tokens" (an iterable of words per
            row), "preprocessed_text" (the text given to the Keras Tokenizer)
            and "hyperpartisan" (the class label).
        vectors: word -> vector lookup supporting ``word in vectors`` and
            ``vectors[word]`` with vectors of length EMBEDDING_DIM (the
            notebook passes gensim KeyedVectors).
        seed: optional int. When given, the row shuffle and the random vectors
            for out-of-vocabulary words are drawn from a private RandomState,
            making the result reproducible. The default (None) keeps using
            NumPy's global random state, as before.

    Returns:
        Tuple ``(cnn_data, labels, embedding_weights, word_index)``:
        cnn_data is the shuffled array of sequences padded/truncated to
        MAX_SEQUENCE_LENGTH, labels the one-hot labels in the same row order,
        embedding_weights an array of shape (len(word_index) + 1,
        EMBEDDING_DIM), and word_index the tokenizer's word -> index mapping.

    The train/validation split is not performed here; callers slice the
    (already shuffled) arrays themselves.
    """
    texts = df["preprocessed_text"].tolist()

    # The vocabulary size is taken from the frame passed in; previously the
    # global `train_s` was read here regardless of the `df` argument.
    vocab_size = len({word for tokens in df["tokens"] for word in tokens})

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(df["hyperpartisan"]))

    # np.random exposes the same shuffle()/rand() functions as a RandomState,
    # so the unseeded path behaves exactly like the original code.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Shuffle rows so that a tail slice can serve as a validation set.
    indices = np.arange(cnn_data.shape[0])
    rng.shuffle(indices)
    cnn_data = cnn_data[indices]
    labels = labels[indices]

    # One row per tokenizer index; the extra row accounts for index 0, which
    # the loop below never assigns (Keras word indices start at 1).
    embedding_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, index in word_index.items():
        # Words missing from the pre-trained vectors get a uniform random vector.
        embedding_weights[index, :] = vectors[word] if word in vectors else rng.rand(EMBEDDING_DIM)
    print(embedding_weights.shape)

    return cnn_data, labels, embedding_weights, word_index

In [44]:
# Build the CNN inputs from the sampled training frame: padded sequences,
# one-hot labels, the embedding matrix and the tokenizer's word index.
cnn_data, labels, embedding_weights, word_index = create_cnn_data(train_s, vectors)

Found 275014 unique tokens.
(275015, 300)


In [80]:
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build and compile a 1D-CNN text classifier on pre-trained embeddings.

    Args:
        embeddings: initial weight matrix for the Embedding layer; expected
            shape (num_words, embedding_dim).
        max_sequence_length: length of each input sequence of token ids.
        num_words: number of rows of the embedding table.
        embedding_dim: width of one embedding vector.
        labels_index: number of units in the output layer.
        trainable: whether the embedding weights are updated during training.
        extra_conv: if true, use three parallel Conv1D branches (kernel sizes
            3, 4 and 5) whose pooled outputs are concatenated, in the style of
            Yoon Kim's model (https://arxiv.org/abs/1408.5882); otherwise use
            a single Conv1D (kernel size 3) followed by max pooling.

    Returns:
        A compiled Keras Model (adam optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only the selected feature extractor is built; previously the layers of
    # both variants were always created although just one fed the classifier.
    if extra_conv:
        # Multi-filter-size variant: one conv + pool branch per kernel size,
        # joined along the sequence axis.
        branches = []
        for filter_size in (3, 4, 5):
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            branches.append(MaxPooling1D(pool_size=3)(l_conv))
        features = Concatenate(axis=1)(branches)
    else:
        # Single-branch variant: one conv layer followed by max pooling with
        # pool size 3 (not global pooling).
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)

    # NOTE(review): sigmoid + binary_crossentropy scores every output unit
    # independently; with one-hot labels, softmax + categorical_crossentropy
    # may be the intended pairing — confirm before changing.
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    return model

In [81]:
# Size of the validation tail. This has to be computed here: the variable of the
# same name inside create_cnn_data is local to that function, so on a fresh
# kernel the name would otherwise be undefined.
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])

# Slice with an explicit positive boundary so that a validation size of 0 leaves
# all rows in the training part (a `[:-0]` slice would yield an empty array).
split_at = cnn_data.shape[0] - num_validation_samples

# create_cnn_data already shuffled the rows, so a head/tail split is random.
# NOTE(review): y_train replaces the pandas Series of the same name defined for
# the scikit-learn models earlier in the notebook.
x_train = cnn_data[:split_at]
y_train = labels[:split_at]
x_val = cnn_data[split_at:]
y_val = labels[split_at:]

# The output width is taken from the one-hot label matrix itself, so it always
# matches the targets passed to fit() (previously counted on the unsampled frame).
model = ConvNet(embedding_weights, MAX_SEQUENCE_LENGTH, len(word_index)+1, EMBEDDING_DIM,
                labels.shape[1], trainable=False)

In [82]:
# Train for 3 epochs in batches of 128, evaluating on the held-out split after
# each epoch. The call returns a Keras History object, which is what the cell displays.
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=3, batch_size=128)

Train on 39998 samples, validate on 9999 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c25ca5ba8>