# Bag of words: GloVe

In [1]:
path1 = 'text-formality-classifier/data/formal.txt'
with open(path1) as f:
    corpus_formal = f.readlines()
print("fertig")

path2 = 'text-formality-classifier/data/informal.txt'
with open(path2) as f:
    corpus_informal = f.readlines()
print("fertig")
#%%
from sklearn.utils import shuffle
import numpy as np

X = np.array(corpus_formal+corpus_informal)
y = np.array([0]*len(corpus_formal) + [1]*len(corpus_informal) )

X, y = shuffle(X,y)

from sklearn.model_selection import train_test_split

X_t, X_te, y_t, y_te = train_test_split(
                X, y, test_size=0.2, random_state=0)
#%%'
all_train = dict(classes=[0, 1], #formal 0, informal 1
                data = X_t,
                categories=np.array(y_t))

all_test = dict(classes=[0, 1], #formal 0, informal 1
                data = X_te,
                categories=np.array(y_te))

fertig
fertig


In [2]:
data_train,data_test = all_train['data'], all_test['data'] #list of strings
label_train, label_test =all_train['categories'], all_test['categories'] #array
print('Data prepared ：）')
print()

Data prepared ：）



In [7]:
data_train.shape

(84152,)

In [8]:
label_train.shape

(84152,)

In [6]:
data_test.shape

(21038,)

In [10]:
print('Indexing word vectors...')

words_index = {}
f = open('glove.6B.100d.txt',encoding='utf-8')
for line in f:
    word_vector = line.split()
    word = word_vector[0]
    vector = np.asarray(word_vector[1:], dtype='float32')
    words_index[word] = vector
f.close()
 
print('%s word vectors prepared ：）'%len(words_index)) #400000

Indexing word vectors...
400000 word vectors prepared ：）


In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation
# import warnings simplefilter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [13]:
#features

print('Tokenizing...')

MAX_NUM_WORDS = 20000
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)

tokenizer.fit_on_texts(data_train)
sequences = tokenizer.texts_to_sequences(data_train)
tokenizer.fit_on_texts(data_test)
sequences_test = tokenizer.texts_to_sequences(data_test)

word_index = tokenizer.word_index 
print('Found %s unique tokens.'%len(word_index))


Tokenizing...
Found 33067 unique tokens.


In [14]:
#preparing training data

MAX_SEQUENCE_LENGTH = 1000

X_train = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)#长度不足1000的用0填充(前端填充)
X_test = pad_sequences(sequences_test, maxlen = MAX_SEQUENCE_LENGTH) 


y_train = to_categorical(label_train) 
y_test = to_categorical(label_test)

print('shape of training data',X_train.shape)
print('shape of training labels',y_train.shape)
print('shape of testing data',X_test.shape)
print('shape of testing labels',y_test.shape)


shape of training data (84152, 1000)
shape of training labels (84152, 2)
shape of testing data (21038, 1000)
shape of testing labels (21038, 2)


In [15]:
# split the training data for fun or simply use the paremeter in fit(validation_split=0.2)
index = np.arange(X_train.shape[0])
np.random.shuffle(index)
X_train = X_train[index]
y_train = y_train[index]
num_validation_samples = int(0.2*X_train.shape[0])
print('split %d validation samples '%num_validation_samples)

split 16830 validation samples 


In [17]:
X_train_split = X_train[:-num_validation_samples]
y_train_split = y_train[:-num_validation_samples]
X_train_val = X_train[-num_validation_samples:]
y_train_val = y_train[-num_validation_samples:]

print('shape of real training data',X_train_split.shape)
print('shape of real training labels',y_train_split.shape)
print('shape of validatation training data',X_train_val.shape)
print('shape of validatation training labels',y_train_val.shape)

shape of real training data (67322, 1000)
shape of real training labels (67322, 2)
shape of validatation training data (16830, 1000)
shape of validatation training labels (16830, 2)


In [19]:
#embedding matrix

EMBEDDING_DIM = 100
num_words = min(MAX_NUM_WORDS,len(word_index))
embedding_matrix = np.zeros((num_words +1,EMBEDDING_DIM))
for word,i in word_index.items():
    if i>MAX_NUM_WORDS:
        continue
    embedding_vector = words_index.get(word) #array
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector        
print('shape of embedding matrix:',embedding_matrix.shape)

shape of embedding matrix: (20001, 100)


In [21]:
#model LSTM

embedding_layer = Embedding(num_words + 1, 
                            EMBEDDING_DIM,
                            weights=[embedding_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, 
              )
print('Building model...')

model = Sequential() 
model.add(embedding_layer)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))  #100维
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.add(Dense(len(all_train['classes']), activation='softmax'))
model.layers[1].trainable=False

print('Model completed ：）')
model.summary()

Building model...
Model completed ：）
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 100)         2000100   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 4         
Total params: 2,080,605
Trainable params: 2,000,205
Non-trainable params: 80,400
_________________________________________________________________


In [22]:
#compile
model.compile(
            optimizer='adam',#优化器
            loss='binary_crossentropy',#损失函数
            metrics=['accuracy'],#指标列表
            )

In [26]:
print('Training...')

batch_size = 1000
model.fit(X_train_split, y_train_split, batch_size=batch_size, epochs=5, validation_data=(X_train_val,y_train_val))

loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)

print('Loss:',loss) 
print('Accuracy:',acc) 

Training...
Train on 67322 samples, validate on 16830 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.6835604953963059
Accuracy: 0.6002947026666257


In [31]:
from keras.utils import plot_model

plot_model(model, to_file='model.png')
print('plotted!')

plotted!
