# PARTI:Pretrained_word_embeddings Applied on TextClassifications

In [1]:
# This script loads pre-trained word embeddings (GloVe embeddings)
# into a frozen Keras Embedding layer, and uses it to 
# train a text classification model on the 20 Newsgroup dataset
# (classification of newsgroup messages into 20 different categories)

In [2]:
from __future__ import print_function

In [3]:
import os
import sys
import numpy as np

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

Using TensorFlow backend.


In [5]:
BASE_DIR = "."
GLOVE_DIR = BASE_DIR + "/data/glove/"
TEXT_DATA_DIR = BASE_DIR + "/data/20_newsgroups"
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

### First, build index mapping words in the embeddings set to their embedding vector

In [6]:
print("Indexing word vectors")

Indexing word vectors


In [7]:
embeddings_index = {}
f = open( os.path.join(GLOVE_DIR, "glove.6B.100d.txt") )
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray( values[1:], dtype='float32' )
    embeddings_index[word] = coefs
f.close()

In [8]:
print("Found %s word vectors." % len(embeddings_index) )

Found 400000 word vectors.


### second, prepare text samples and their labels

In [9]:
print("Processing text dataset")

Processing text dataset


In [10]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

In [16]:
print( type(texts), len(texts) )
print( type(texts[0]) )
print( texts[0] )

<class 'list'> 19997
<class 'str'>


Archive-name: atheism/resources
Alt-atheism-archive-name: resources
Last-modified: 11 December 1992
Version: 1.0

                              Atheist Resources

                      Addresses of Atheist Organizations

                                     USA

FREEDOM FROM RELIGION FOUNDATION

Darwin fish bumper stickers and assorted other atheist paraphernalia are
available from the Freedom From Religion Foundation in the US.

Write to:  FFRF, P.O. Box 750, Madison, WI 53701.
Telephone: (608) 256-8900

EVOLUTION DESIGNS

Evolution Designs sell the "Darwin fish".  It's a fish symbol, like the ones
Christians stick on their cars, but with feet and the word "Darwin" written
inside.  The deluxe moulded 3D plastic fish is $4.95 postpaid in the US.

Write to:  Evolution Designs, 7119 Laurel Canyon #4, North Hollywood,
           CA 91605.

People in the San Francisco Bay area can get Darwin Fish from Lynn Gold --
try mailing <figmo@netcom.com>.  For ne

In [17]:
print('Found %s texts.' % len(texts))

Found 19997 texts.


### Finally, vectorize the text samples into a 2D integer tensor

In [18]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [27]:
print( type(sequences), len(sequences) )

<class 'list'> 19997


In [19]:
word_index = tokenizer.word_index

In [22]:
print( type(word_index), len(word_index) )
print('Found %s unique tokens.' % len(word_index))

<class 'dict'> 174074
Found 174074 unique tokens.


In [23]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [24]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


In [28]:
# split the data into a training set and a validation set 
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [29]:
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [30]:
print('Preparing embedding matrix.')

Preparing embedding matrix.


In [34]:
word_index

{'wx6': 158003,
 'l0h': 59314,
 'd5i': 118317,
 't5j': 59316,
 'userpath': 114161,
 'al1x': 75307,
 'ff2j33nn': 84469,
 'ds1489an': 153349,
 'yd0e9': 84471,
 'ozkq': 171430,
 'macavenue': 115389,
 'criticism': 4545,
 'm3zvm': 80243,
 'paisano': 84472,
 "'wr": 84473,
 'multjgv': 114162,
 'vehemence': 48170,
 'worldist': 29807,
 'mcloughlin': 137144,
 'coupons': 29808,
 'vaxb': 59318,
 'thumb': 6756,
 "5fge's": 84474,
 '93mar26205915': 141843,
 '13716': 48176,
 "dvorak's": 84476,
 'alphabetical': 26920,
 'realises': 87177,
 'gaxh': 84478,
 'billary': 35736,
 'piclab': 84480,
 "dawn's": 32536,
 'ablaze': 42276,
 "chevaldae's": 84483,
 'unnaturally': 48171,
 'resisting': 25993,
 "m'": 4429,
 'mo44': 131664,
 'skypix': 84485,
 'cocaine': 8323,
 'templar': 158999,
 'damper': 59323,
 'ordnance': 12524,
 'overbearingly': 48172,
 'msa1': 84487,
 '7po4w8sr': 84488,
 'enlightenment': 14183,
 'preregistered': 84489,
 '9473': 77706,
 'qandahar': 144231,
 '0192': 84492,
 '160012': 52918,
 '5eze': 84

In [31]:
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [33]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False as to keep the embeddings fixed

In [38]:
embedding_layer = Embedding(
                            num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False
                            )

In [39]:
print("Training model...")

Training model...


In [40]:
# train a 2D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedding_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedding_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(embedding_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(embedding_sequences)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation="relu")(x)
preds = Dense(len(labels_index), activation="softmax")(x)

In [41]:
# model complie
model = Model(sequence_input, preds)
model.compile(
             loss="categorical_crossentropy",
             optimizer="rmsprop",
             metrics=["acc"]
             )

In [43]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=50,
          validation_data=(x_val, y_val))

Train on 15998 samples, validate on 3999 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe0ae8eb4e0>

In [2]:
# model.save("./data/pretrained_word_embeddings_textclassification.h5")

# PARTII:Pretrained word embeddings applied on Sentiment Analysis

In [1]:
# author - zpGao
# Jul, 2017
import numpy as np
import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re

In [2]:
from bs4 import BeautifulSoup

In [3]:
import sys
import os

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [40]:
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPool1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, MaxPooling1D
from keras.models import Model

In [6]:
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

In [7]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [8]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased excpt 
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()
    

In [9]:
data_train = pd.read_csv('./data/labeledTrainData.tsv', sep='\t')

In [10]:
data_train

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [11]:
print( type(data_train), data_train.shape )

<class 'pandas.core.frame.DataFrame'> (25000, 3)


In [12]:
texts = []
labels = []

In [13]:
test_text = BeautifulSoup(data_train.review[0], 'html.parser')
print( type(test_text) )
# print( test_text )
print( test_text.get_text().encode('ascii','ignore') )
print( clean_str(test_text.get_text()) )

<class 'bs4.BeautifulSoup'>
b"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finall

In [14]:
for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx], "html.parser")
    texts.append(clean_str(text.get_text()))
    labels.append(data_train.sentiment[idx])
    

In [15]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [16]:
print( type(sequences), len(sequences) )
print( type( sequences[0]), len(sequences[0]), len(sequences[1])  )

<class 'list'> 25000
<class 'list'> 418 160


In [17]:
word_index = tokenizer.word_index

In [18]:
print( type(word_index), len(word_index) )
print( "Found %s unique tokens." % len(word_index))

<class 'dict'> 81506
Found 81506 unique tokens.


In [19]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [20]:
print( type(data), data.shape )

<class 'numpy.ndarray'> (25000, 1000)


In [21]:
labels = to_categorical(np.asarray(labels))

In [22]:
print( type(labels), labels.shape )

<class 'numpy.ndarray'> (25000, 2)


In [23]:
print("Shape of data tensor:", data.shape)
print("Sahpe of label tensor:", labels.shape)

Shape of data tensor: (25000, 1000)
Sahpe of label tensor: (25000, 2)


In [24]:
indices = np.arange(data.shape[0])

In [25]:
# indices
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

In [26]:
print( type(data), data.shape )
print( type(labels), labels.shape )

<class 'numpy.ndarray'> (25000, 1000)
<class 'numpy.ndarray'> (25000, 2)


In [27]:
nb_validation_samples = int( VALIDATION_SPLIT * data.shape[0] )

In [28]:
nb_validation_samples
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [29]:
y_train

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [30]:
print("Training and validation set number of positive and negative reviews:")
# print( y_train.sum(axis=1) )
print( y_train.sum(axis=0) )
# print( y_val.sum(axis=1) )
print( y_val.sum(axis=0) )

Training and validation set number of positive and negative reviews:
[  9940.  10060.]
[ 2560.  2440.]


In [31]:
GLOVE_DIR = "./data/glove/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [32]:
print( "Total %s word vectors." % len(embeddings_index) )

Total 400000 word vectors.


In [33]:
print(type(word_index), len(word_index) )
print(EMBEDDING_DIM)

<class 'dict'> 81506
100


In [34]:
print("preparing embedding matrix...")

preparing embedding matrix...


In [35]:
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

In [36]:
for word, i in word_index.items():
    if i>= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector

In [37]:
# Load the pre-trained word embeddings into an Embedding layer
# note that we set trainable=False so as to keep the embeddings fixed
embedding_layer = Embedding(
                            num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False
                           )

In [38]:
print("Training model...")

Training model...


In [42]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense( 2, activation='softmax' )(x)

In [44]:
# model compile
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 35, 128)           82048     
__________

In [47]:
model.fit(x_train, y_train,
          batch_size=50,
          epochs=20,
          validation_data=(x_val, y_val))

Train on 20000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f644c471da0>

In [48]:
model.save("./data/pretrained_word_embeddings_sentimentanalysis.h5")