In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import spacy

from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense, LSTM, Bidirectional, Dropout, BatchNormalization
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, f1_score, accuracy_score

import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
path = '/Users/andradea/Documents/languages/en_US/'
N = 10000
tweets = pd.read_csv(path + 'en_US.twitter.txt', header=None, delimiter='\n', nrows=N)
blogs = pd.read_csv(path + 'en_US.blogs.txt', header=None, delimiter='\n', nrows=N)
news = pd.read_csv(path + 'en_US.news.txt', header=None, delimiter='\n', nrows=N)
papers = pd.read_csv(path + 'papers.csv', usecols=['abstract'])

In [12]:
tweets = tweets[0]
blogs = blogs[0]
news = news[0]
papers = [p for p in papers['abstract'] if p != 'Abstract Missing']

In [13]:
max_len = 20
vec_size = 300

In [14]:
nlp = spacy.load('en_vectors_web_lg', disable=['parser', 'tagger', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [15]:
sen_len = []
tweet_sen = 0
for tweet in tweets:
    tokens  = nlp(tweet)
    for sentence in tokens.sents:
        sen_len.append(len(sentence))
        tweet_sen += 1
print('average number of words per sentence:', np.mean(sen_len))

average number of words per sentence: 9.745479400981793


In [16]:
sen_len = []
blog_sen = 0
for blog in blogs:
    tokens  = nlp(blog)
    for sentence in tokens.sents:
        sen_len.append(len(sentence))
        blog_sen += 1
print('average number of words per sentence:', np.mean(sen_len))

average number of words per sentence: 18.44062744513581


In [17]:
sen_len = []
news_sen = 0
for new in news:
    tokens  = nlp(new)
    for sentence in tokens.sents:
        sen_len.append(len(sentence))
        news_sen += 1
print('average number of words per sentence:', np.mean(sen_len))

average number of words per sentence: 21.204551305153682


In [18]:
sen_len = []
paper_sen = 0
for paper in papers:
    tokens  = nlp(paper)
    for sentence in tokens.sents:
        sen_len.append(len(sentence))
        paper_sen += 1
print('average number of words per sentence:', np.mean(sen_len))

average number of words per sentence: 27.087230287960214


In [19]:
print('total tweets sentences:', tweet_sen)
print('total blogs sentences:', blog_sen)
print('total news sentences:', news_sen)
print('total papers sentences:', paper_sen)

total tweets sentences: 16093
total blogs sentences: 26839
total news sentences: 19423
total papers sentences: 24934


In [28]:
tweet_x = np.zeros((tweet_sen, max_len, vec_size))
i = 0
z = 0
w = 0
for tweet in tweets:
    tokens  = nlp(tweet)
    for sentence in tokens.sents:
        if len(sentence) > 4:
            for j, word in enumerate(sentence):
                w += 1
                if np.sum(word.vector) == 0:
                    z += 1
#             if j < max_len:
#                 tweet_x[i][j] = word.vector
        
            i += 1
print(i / tweet_sen)
print(z / w)

1.534331697011123
0.02486538720908001


In [None]:
blog_x = np.zeros((blog_sen, max_len, vec_size))
i = 0
for blog in blogs:
    tokens  = nlp(blog)
    for sentence in tokens.sents:
        for j, word in enumerate(sentence):
            if j < max_len:
                blog_x[i][j] = word.vector
        
        i += 1

In [None]:
news_x = np.zeros((news_sen, max_len, vec_size))
i = 0
for new in news:
    tokens  = nlp(new)
    for sentence in tokens.sents:
        for j, word in enumerate(sentence):
            if j < max_len:
                news_x[i][j] = word.vector
        
        i += 1

In [None]:
tweet_y = np.repeat(0, blog_sen)
blog_y = np.repeat(1, blog_sen)
news_y = np.repeat(2, news_sen)

In [None]:
data = np.vstack([news_x, blog_x])
labels = np.hstack([news_y, blog_y])
labels = labels.reshape((-1, 1))

In [None]:
one_hot_labels = to_categorical(labels, num_classes=3)

In [None]:
train_x, test_x, train_y, test_y = train_test_split(data, one_hot_labels, test_size=0.1, random_state=42)
print('training size:', train_x.shape[0])
print('testing size:', test_x.shape[0])

In [None]:
model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(max_len, vec_size))) # returns a sequence of vectors of dimension 32
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(128, return_sequences=True))) # returns a sequence of vectors of dimension 32
model.add(Dropout(0.5))
model.add(LSTM(256)) # return a single vector of dimension 32
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [None]:
learning_rates = [0.001]
epochs = 3
batch_size = 64

grid = {}
i = 1

for learning_rate in learning_rates:
        
        print('\nTraining model with learning rate {} and decay {}...'.format(learning_rate, decay))

        optimizer = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=decay, amsgrad=True)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        history = model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_data=(test_x, test_y))

        grid['model{}_lr{}_dc{}'.format(i, learning_rate, decay)] = history
        
        # summarize history for accuracy
        print('\nAccuracy Plot')
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

        # summarize history for loss
        print('\nLoss Plot')
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

        i += 1

In [None]:
list(grid.keys())

In [None]:
grid['model1_lr0.001_dc0.0'].history['val_acc']

In [None]:
grid['model1_lr0.001_dc0.0'].keys()