In [1]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3


KeyboardInterrupt: 

In [3]:
print (imdb.load_data().head())

AttributeError: 'tuple' object has no attribute 'head'

In [1]:
# Initialize paramters
numDimensions = 300
batchSize = 64
lstmUnits = 128
nLayers = 2
numClasses = 2
iterations = 30000

In [7]:
# Multilayer Perceptron
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense

visible = Input(shape=(100,1))
hidden1 = LSTM(10)(visible)
hidden2 = Dense(10, activation='relu')(hidden1)
output = Dense(1, activation='sigmoid')(hidden2)
model = Model(inputs=visible, outputs=output)
# summarize layers
model.summary()
# plot graph
plot_model(model, to_file='multilayer_perceptron_graph.png')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100, 1)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                480       
_________________________________________________________________
dense_14 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 601
Trainable params: 601
Non-trainable params: 0
_________________________________________________________________


ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [4]:
import os
from os import listdir
from os.path import isfile, join
import numpy as np
ids = np.load(os.path.join('I:/NLP/data_train/','idsMatrixques-2.npy'))
#ids = np.load(f'{currentDir}idsMatrix.npy')
print('Word indexes of the first review: ', ids[2])

Word indexes of the first review:  [ 10247  10452  10110 122254  10005  10132  10149  94194 263461  10055
  10047 374498  11029  10057  10607  39066  10092   9999  10405  10076
  10040      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0

In [2]:
import numpy as np
import os
currentDir = 'I:/NLP/data_train/'

wordsList = np.load(os.path.join(currentDir, 'wordsList.npy'))
#wordsList = np.load(f'{currentDir}wordsList.npy')

print('Simplified vocabulary loaded!')
wordsList = wordsList.tolist()
#wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8

wordVectors = np.load(os.path.join(currentDir, 'wordVectors.npy'))
#wordVectors = np.load(f'{currentDir}wordVectors.npy')

wordVectors = np.float32(wordVectors)
print ('Word embedding matrix loaded!')
ngon_idx = wordsList.index('bán')
print('Index of `ngon` in wordsList: ', ngon_idx)
ngon_vec = wordVectors[ngon_idx]
print('Vector representation of `ngon` is: ', ngon_vec)

Simplified vocabulary loaded!
Word embedding matrix loaded!
Index of `ngon` in wordsList:  1906
Vector representation of `ngon` is:  [-1.365e-01 -5.560e-02  1.195e-01  4.790e-02 -8.640e-02 -2.430e-02
 -3.130e-02 -4.500e-02  4.340e-02 -1.074e-01 -9.460e-02 -5.220e-02
 -9.380e-02 -4.480e-02 -1.220e-02 -6.000e-03  3.570e-02 -5.500e-02
 -6.990e-02  6.200e-03  9.800e-03 -4.970e-02  8.430e-02  9.160e-02
  1.402e-01 -3.240e-02  3.970e-02 -8.270e-02 -3.770e-02 -7.240e-02
  2.710e-02  9.450e-02 -9.180e-02 -1.171e-01  6.750e-02 -8.980e-02
  1.854e-01 -1.097e-01 -1.230e-02  2.870e-02 -1.274e-01  5.160e-02
 -3.800e-03 -6.360e-02  4.900e-03  6.400e-02  1.466e-01 -4.950e-02
  9.050e-02 -1.720e-02  1.684e-01  1.850e-02 -6.220e-02 -6.320e-02
 -9.450e-02 -7.700e-02  1.410e-02 -5.110e-02 -2.300e-03  7.680e-02
 -5.090e-02 -9.790e-02  2.660e-02  1.153e-01  3.750e-02  1.185e-01
  4.650e-02  6.500e-03  1.913e-01  7.500e-02  6.960e-02 -6.130e-02
 -5.210e-02 -5.870e-02  5.530e-02  4.530e-02  3.770e-02 -9.500e

In [4]:
maxSeqLength = 180

numDimensions = 300
batchSize = 64
lstmUnits = 128
nLayers = 2
numClasses = 2
iterations = 30000

In [3]:
from random import randint
def getTrainBatch():
    labels = []
    arr = np.zeros((batchSize, maxSeqLength))
    for i in range(batchSize):
            # Pick positive samples randomly
        num = randint(1, 14000)
        labels.append([1, 0])
       
    return arr, labels

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


Using TensorFlow backend.


In [5]:
#data = open('I:/NLP/data_train/xuly.txt').readline()
# Keeping only the neccessary columns
data = [[]]
with open('I:/NLP/data_train/xuly.txt', "r", encoding='utf-8') as f:
    for line in f.readlines():
        data.append([line, 1])       
print('Positive files finished')
#data = [[open('I:/NLP/data_train/xuly.txt').readline(),1]]
print (data[1][0])

Positive files finished
 Các bạn IU cho mình hỏi là sau khi làm xong thesis ở học kì 2 năm 4 thì mình còn học kì 3 rãnh. Liệu mình có thể đkmh để cải thiện điểm ko? Và có thể update điểm kịp để xét tốt nghiệp vào tháng 9 không nhỉ? _



In [7]:
#data = data[data.sentiment != "Neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

TypeError: list indices must be integers or slices, not str

In [9]:
import numpy as np
#embed_dim = 128
#lstm_out = 196
x =  np.load(f'I:/NLP/data_train/idsMatrixques-2.npy')
print (len(x))
#X = open('I:/NLP/data_train/idsMatrixques.npy', "r", encoding='utf-8')
print (x[2999])

9448
[  10181  314185  314185   10181   11505   11066  350794   10065   10157
   10059   10194   10013  314185   10213   10213   10213 1264310   10005
   10011   10013   10019   10038  275576   10210   10210   10030   10027
   10744   10065  604258   10011   10011   10023   10545   10196   10400
   10521   10637  368564   10011  811496   10023   10023   15980   13438
   12983   18975  140453  611633   10062   10010  164768  701607   10017
   13364   10005   12916   10028   10793   10101   10393       0       0
       0       0       0       0       0       0       0       0       0
       0       0       0       0       0       0       0       0       0
       0       0       0       0       0       0       0       0       0
       0       0       0       0       0       0       0       0       0
       0       0       0       0       0       0       0       0       0
       0       0       0       0       0       0       0       0       0
       0       0       0       0       0      

In [4]:
for i in range (1,10):
    print (i)

1
2
3
4
5
6
7
8
9


In [10]:
data = []
target = []
x_test = []
y_test = []
for i in range (1,1831):
    data.append(x[i])
    target.append(1)

for i in range (1831,7553):
    data.append(x[i])
    target.append(0)
    
for i in range (1001,1199):
    x_test.append(x[i])
    y_test.append(1)
for i in range (2003,2267):
    x_test.append(x[i])
    y_test.append(0)

In [11]:
print (data[1])
print (target[1831])
# target = np.array(target)
# target = target.reshape(-1, 1)

[ 10247  10452  10110 122254  10005  10132  10149  94194 263461  10055
  10047 374498  11029  10057  10607  39066  10092   9999  10405  10076
  10040      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      

In [39]:
def get_weight(ids):
    return wordVectors[ids]   

embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
embedding_vectors = wordVectors[idx]
# create the embedding layer
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_vectors], input_length=max_length, trainable=False)
 

NameError: name 'get_weight_matrix' is not defined

In [9]:
wordVectors = np.load(f'I:/NLP/data_train/wordVectors.npy')

In [1]:
import numpy as np

fileroad = 'E:/cc.vi.300.vec' 
tmp = np.array

tuw = []
#y = 0 
with open(fileroad, encoding="utf8") as fp:
    for x in range(2000000):
        line = fp.readline().strip()
        #while line:
        z = line.split(' ', 1)[0]
        #if x == 19970:
        if z == 'cám':
            print ('index is: ' + str(x))
            print (len(line.split()[1:]))
            tmp = np.array(line.split()[1:], dtype='float32')
            print(tmp)
            # print (line.split(' ', 1)[1:])
            break

index is: 3777
300
[ 2.529e-01  1.483e-01  8.210e-02 -4.320e-02 -1.104e-01  1.000e-04
 -9.490e-02 -5.620e-02  1.540e-02 -4.410e-02 -7.520e-02 -7.640e-02
 -1.490e-01  6.840e-02 -2.175e-01  9.160e-02 -5.800e-02  7.780e-02
 -6.300e-02 -7.510e-02 -1.332e-01  1.229e-01 -2.330e-02  1.745e-01
  6.930e-02  9.900e-03 -6.200e-03 -2.029e-01  3.051e-01  1.452e-01
  1.280e-02 -8.600e-02  8.600e-02 -2.049e-01  5.670e-02  3.400e-02
  7.320e-02  1.021e-01  2.271e-01 -6.200e-03  6.680e-02 -4.400e-02
 -3.550e-02 -1.461e-01 -1.370e-02  9.480e-02 -8.350e-02 -1.979e-01
 -1.392e-01  7.600e-03 -1.030e-02  3.234e-01 -7.090e-02  3.067e-01
 -1.561e-01 -2.104e-01 -4.100e-03 -1.005e-01 -2.084e-01 -9.150e-02
  1.850e-02 -8.550e-02 -6.080e-02 -5.290e-02 -2.180e-02  8.650e-02
  4.920e-02 -1.770e-02 -2.360e-02 -7.200e-03  2.890e-02 -1.548e-01
  1.010e-01  1.579e-01 -1.187e-01  1.853e-01  1.319e-01  1.792e-01
 -4.740e-02 -6.680e-02 -1.988e-01  1.730e-02  7.040e-02 -2.090e-01
  2.990e-02  1.351e-01  1.995e-01 -1.092e-0

In [16]:
from keras.layers import Embedding

model = Sequential()  
# model.add(LSTM(100, input_shape=(1, 100),return_sequences=True))
# model.add(Dense(100))
# model.compile(loss='mean_absolute_error', optimizer='adam',metrics=['accuracy'])
# model.fit(data, target, nb_epoch=10, batch_size=20, verbose=2,validation_data=(x_test, y_test))

max_fatures = 2000
embed_dim = 128
lstm_out = 196 
# model.add(Embedding(max_fatures, embed_dim,input_length = 185))
# model.add(SpatialDropout1D(0.4))
# model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(1,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# model.fit(data, target, nb_epoch=10, batch_size=20, verbose=2,validation_data=(x_test, y_test))


model.add(Embedding(len(wordVectors), 300, weights=[wordVectors], input_length=180, trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


#model.add(Embedding(len(wordVectors), 300, weights=[wordVectors], input_length=180, trainable=False))
#model.add(SpatialDropout1D(0.4))
#model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
#model.add(Dense(1, activation='sigmoid'))
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# model.add(SpatialDropout1D(0.4))
# model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(1,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])


print(model.summary())


model.fit(np.array(data), np.array(target), nb_epoch=60, batch_size=34, verbose=1)   #, validation_data=(np.array(x_test), np.array(y_test) ))
# predict = model.predict(data)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 300)          600000000 
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 600,219,777
Trainable params: 219,777
Non-trainable params: 600,000,000
_________________________________________________________________
None




Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x14084edcc88>

In [17]:
model.save_weights('I:/NLP/data_train/train/quesmodel-weight-gach.h')

# Save the model architecture
with open('I:/NLP/data_train/train/quesmodel_architecture-gach.json', 'w') as f:
    f.write(model.to_json())
#model.save('I:/NLP/data_train/train/quesmodel-1.h5')

In [16]:
import sys
import numpy as np
import os
import re
from keras.models import load_model

#import sentimential as st
from keras.models import model_from_json

# Model reconstruction from JSON file
with open('I:/NLP/data_train/train/quesmodel_architecture.json', 'r') as f:
    model = model_from_json(f.read())

# Load weights into the new model
model.load_weights('I:/NLP/data_train/train/quesmodel-weight-1.h')

In [19]:
for layer in model.layers:
    for weight in layer.weights:
        print(f'{weight.shape}\t{weight.name}')

(19899, 300)	embedding_3_1/embeddings:0
(300, 512)	lstm_3_1/kernel:0
(128, 512)	lstm_3_1/recurrent_kernel:0
(512,)	lstm_3_1/bias:0
(128, 1)	dense_3_1/kernel:0
(1,)	dense_3_1/bias:0


In [18]:
import re
strip_special_chars = re.compile('[^\w0-9 ]+')

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [39]:
np.array(x_test).shape

(462, 180)

In [1]:
from keras.models import load_model

model = load_model('I:/NLP/data_train/train/quesmodel-1.h5')

Using TensorFlow backend.


In [22]:
with open('E:/cc.vi.300.vec', encoding = "utf8") as fp:
    lines = fp.readlines()

In [32]:
f = ['Các bạn IU cho mình hỏi là sau khi làm xong thesis ở học kì 2 năm 4 thì mình còn học kì 3 rãnh. Liệu mình có thể đkmh để cải thiện điểm ko?']
ids = []
maxSeqLength = 180
line = f[0]
split = cleanSentences(line).split()

for i, word in enumerate(split):
    if i >= maxSeqLength:
        break
    try:
        idx = 0
        for x in range(2000000):
            #line = fp.readline().strip()
            doline = lines [x] 
            #while line:
            z = doline.split(' ', 1)[0]
            #if x == 200000:
            if z == word:
                idx = x + 9998
                break
        #idx = wordsList.index(split[word_idx])
        ids.append(idx)
        #ids.append(wordsList.index(word))
    except ValueError:
        ids.append(518835)
        
ids = np.array(ids + ([0] * (maxSeqLength - len(ids))))
ids = np.expand_dims(ids, axis=0)
prediction = model.predict(ids)
print (ids)
print(prediction)
if prediction < 0.5: 
    print ('cmt')
else: 
    print ('ques')

[[ 10014  10038  14875  10015  10065  10391  10006  10061  10030  10045
   10592  48801  10024  10170  12682  10048  10018  10098  10055  10065
   10062  10170  12682  10069  13876  11245  10065  10010  11280 786841
   10026  12738  13054  10221  10547      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0

In [23]:
f = ['Anh_chị cho e hỏi . Chiều_qua e học ở VNU có thấy 1 chị khóa trên mặc sơ_mi trắng cột tóc cao cực xinh']
ids = []
maxSeqLength = 180
line = f[0]
split = cleanSentences(line).split()

for i, word in enumerate(split):
    if i >= maxSeqLength:
        break
    try:
        ids.append(wordsList.index(word))
    except ValueError:
        ids.append(0)
ids = np.array(ids + ([0] * (maxSeqLength - len(ids))))
ids = np.expand_dims(ids, axis=0)
prediction = model.predict(ids)
print(prediction)

[[3.8111539e-06]]


In [63]:
f = ['Anh_chị cho e hỏi . Chiều_qua e học ở VNU có thấy 1 chị khóa trên mặc sơ_mi trắng cột tóc cao cực xinh']
inputu = []
ids = np.zeros((1, 180), dtype=np.int32)
maxSeqLength = 180
line = f[0]
cleanedLine = cleanSentences(line)
split = cleanedLine.split()
print (len(split))
print (split)
for word_idx in range(len(split)):
    if word_idx >= maxSeqLength:
        break
    try:
        idx = wordsList.index(split[word_idx])
        #inputu.append(idx)
        ids[word_idx] = idx 
    except ValueError:
        ids[word_idx] = unk_idx

inputu = np.array(inputu)
print (inputu)
# prediction = model.predict(x_test)
print(prediction)

23
['anh_chị', 'cho', 'e', 'hỏi', 'chiều_qua', 'e', 'học', 'ở', 'vnu', 'có', 'thấy', '1', 'chị', 'khóa', 'trên', 'mặc', 'sơ_mi', 'trắng', 'cột', 'tóc', 'cao', 'cực', 'xinh']


IndexError: index 1 is out of bounds for axis 0 with size 1

In [10]:
from os import listdir
from os.path import isfile, join
numWords = []
qf = len(open('I:/NLP/data_train/xulykogach.txt',  encoding="utf8").readlines())
cf = len(open('I:/NLP/data_train/xuly2kogach.txt',  encoding="utf8").readlines())
with open('I:/NLP/data_train/xulykogach.txt', "r", encoding='utf-8') as f:
    for line in f.readlines():
        counter = len(line.split())
        numWords.append(counter)       
print('Positive files finished')

with open('I:/NLP/data_train/xuly2kogach.txt', "r", encoding='utf-8') as f:
    for nf in f.readlines():
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)  
print('Negative files finished')


numFiles = len(numWords)
print('The total number of line is', numFiles)
print('The total number of words in the files is', sum(numWords))
print('The average number of words in the files is', sum(numWords)/len(numWords))

Positive files finished
Negative files finished
The total number of line is 9448
The total number of words in the files is 49825
The average number of words in the files is 5.273602878916173


In [2]:
import re
strip_special_chars = re.compile('[^\w0-9 ]+')

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [3]:
import h5py

train = h5py.File('I:/NLP/data_train/cc.vi.300.hdf5', 'w')
vectors = train.create_dataset('vectors', (2000000, 300), dtype='float')
dt = h5py.special_dtype(vlen=str)
words = train.create_dataset('words', (2000000,), dtype=dt)

In [2]:
train.close()

In [4]:
import numpy as np
buf_size = 10000
idx = 0
vector_buf, word_buf = [], []
with open('E:/cc.vi.300.vec', encoding='utf-8') as f:
    line = f.readline().strip()
    for i in range(2000000):
        tmp = f.readline().strip().split()
        vector_buf.append(np.array(tmp[1:], dtype='float32'))
        word_buf.append(tmp[0])
        if i%buf_size == 0:
            next_idx = idx + buf_size
            vectors[idx:next_idx] = vector_buf
            words[idx:next_idx] = word_buf
            vector_buf, word_buf = [], []
            idx = next_idx
            
    train.close()

In [13]:
import h5py
t_train = h5py.File('I:/NLP/data_train/cc.vi.300.hdf5')
print(t_train['words'][100010])
t_train['vectors'][0].shape
print(t_train['vectors'][99990])

THẠNH
[ 2.22999994e-02  2.69000009e-02  1.89999994e-02  5.29999994e-02
  1.59000009e-02 -3.62999998e-02 -2.04000007e-02 -8.99999985e-04
  9.60000046e-03 -3.92999984e-02  9.99999978e-03 -1.02000004e-02
  2.08999999e-02  1.82000007e-02 -1.78999994e-02 -2.06000004e-02
 -2.91000009e-02 -1.71000008e-02 -1.95000004e-02 -1.47000002e-02
 -2.05000006e-02  4.45000008e-02 -2.08999999e-02  1.44999996e-02
 -1.41000003e-02  5.49999997e-03 -4.49999981e-03 -9.49999969e-03
  3.80000006e-03 -1.26000000e-02 -1.20999999e-02  1.00000005e-03
  2.82000005e-02 -6.23000003e-02 -1.26999998e-02 -2.89999996e-03
 -2.34999992e-02  1.52000003e-02  1.85000002e-02 -2.07000002e-02
  2.49999994e-03  2.84000002e-02  1.40300006e-01 -3.68999988e-02
  1.35000004e-02 -4.49000001e-02 -9.70000029e-03  8.00000038e-03
 -3.00000003e-03  2.87999995e-02 -9.70000029e-03 -2.60000001e-03
 -5.35999984e-02  1.78999994e-02  6.99999975e-04 -9.53999981e-02
 -3.50000001e-02 -3.04000005e-02  1.15000000e-02 -7.60000013e-03
 -4.10000002e-03 -2

In [15]:
import numpy as np
import os
# wordsList = np.load(os.path.join(currentDir, 'wordsList.npy'))
wordsList = t_train['words']

print('Simplified vocabulary loaded!')
#wordsList = wordsList.tolist()
#wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8

# wordVectors = np.load(os.path.join(currentDir, 'wordVectors.npy'))
wordVectors = t_train['vectors']

wordVectors = np.float32(wordVectors)
print ('Word embedding matrix loaded!')

Simplified vocabulary loaded!
Word embedding matrix loaded!


In [None]:
import numpy as np

fileroad = 'E:/cc.vi.300.vec' 
tmp = np.array

tuw = []
#y = 0 
with open(fileroad, encoding="utf8") as fp:
    for x in range(2000000):
        line = fp.readline().strip()
        #while line:
        z = line.split(' ', 1)[0]
        #if x == 19970:
        if z == 'cám':
            print ('index is: ' + str(x))
            print (len(line.split()[1:]))
            tmp = np.array(line.split()[1:], dtype='float32')
            print(tmp)
            # print (line.split(' ', 1)[1:])
            break

In [11]:
import numpy as np
maxSeqLength = 180

import re
strip_special_chars = re.compile('[^\w0-9 ]+')

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())


fileroad = 'E:/cc.vi.300.vec'
ids = np.zeros((numFiles, maxSeqLength), dtype=np.int32)
nFiles = 0
unk_idx = 518835
cmtFiles = open('I:/NLP/data_train/xuly2kogach.txt', encoding='utf-8')
questionFiles = open('I:/NLP/data_train/xulykogach.txt', encoding='utf-8')
count = len(questionFiles.readlines())
count2 = len(cmtFiles.readlines())
print (count)
with open(fileroad, encoding="utf8") as fp:
    lines = fp.readlines()
    with open('I:/NLP/data_train/xulykogach.txt', 'r', encoding='utf-8') as f:
        for qf_idx in range(count):
            line = f.readline()
            #cleanedLine = cleanSentences(line)
            split = line.split()
            #print (len(split))
            for word_idx in range(len(split)):
                if word_idx >= maxSeqLength:
                    break
                try:
                    for x in range(2000000):
                        #line = fp.readline().strip()
                        doline = lines [x] 
                        #while line:
                        z = doline.split(' ', 1)[0]
                        #if x == 200000:
                        if z == split[word_idx]:
                            idx = x + 9998
                            #print ('index is: ' + str(x))
                            #print (line)
                            #print (line.split(' ', 1)[1])
                            break
                    #idx = wordsList.index(split[word_idx])
                    ids[qf_idx, word_idx] = idx
                except ValueError:
                    ids[qf_idx, word_idx] = unk_idx
            #print (line)

    print (count2)
    print (count+count2)
    with open('I:/NLP/data_train/xuly2kogach.txt', 'r', encoding='utf-8') as f:
        for cf_idx in range(count, count+count2-2):
            line = f.readline()
            cleanedLine = cleanSentences(line)
            split = cleanedLine.split()
            for word_idx in range(len(split)):
                if word_idx >= maxSeqLength:
                    break
                try:
                    for x in range(2000000):
                        doline = lines[x]
                        z = doline.split(' ', 1)[0]
                        if z == split[word_idx]:
                            idx = x + 9998
                            break
                    #idx = wordsList.index(split[word_idx])
                    ids[cf_idx, word_idx] = idx
                except ValueError:
                    ids[cf_idx, word_idx] = unk_idx
                
np.save('I:/NLP/data_train/idsMatrixques-2.npy', ids)

1896
7552
9448


In [48]:
import numpy as np
fileroad ='E:\cc.vi.300.vec'
filepath ="I:/NLP/data_train/tu.npy"
tuw = []
y = 0 
with open(fileroad) as fp:
    with open(filepath, 'wb') as tu:
        for x in range(2000000):
            line = fp.readline()
            #while line:
            z = line.split(' ', 1)[0]
            if z == 'ngon':
                print (x)
                break
            
            
            
            #y += 1
            #if y % 5 == 0:
                #np.save(tu, tuw)
                #y = 0
                #tuw = []
            #print("Line {}: {}".format(x, line.split(' ', 1)[0]))
            #line = fp.readline()
        
# with open(filename, 'rb') as fff:
#      print(fff.readlines())  

#wordsList = np.load(f'/Users/mac/Desktop/NLP/data_train/tu.npy')
#print('Simplified vocabulary loaded!')
#wordsList = wordsList.tolist()
#print('Size of the vocabulary: ', len(wordsList)) 
#print (wordsList)
        
    

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4136: character maps to <undefined>

In [46]:
f.close()

In [43]:
import tables
import numpy as np

filename = 'outarray.h5'
ROW_SIZE = 100
NUM_COLUMNS = 2000

f = tables.open_file(filename, mode='w')
atom = tables.Float64Atom()

array_c = f.create_earray(f.root, 'data', atom, (0, ROW_SIZE))

for idx in range(NUM_COLUMNS):
    x = np.random.rand(1, ROW_SIZE)
    array_c.append(x)
f.close()

In [44]:
f = tables.open_file(filename, mode='a')
f.root.data.append(x)

In [45]:
f = tables.open_file(filename, mode='r')
print(f.root.data[1:10,2:20]

SyntaxError: unexpected EOF while parsing (<ipython-input-45-96c1f28f7273>, line 2)

In [67]:
prediction = model.predict(np.array(x_test))
print(prediction)

[[0.71101844]
 [0.7659625 ]
 [0.6447913 ]
 [0.71104187]
 [0.76595825]
 [0.7110288 ]
 [0.0194401 ]
 [0.76595813]
 [0.76596576]
 [0.67143285]
 [0.71099645]
 [0.64479125]
 [0.64479136]
 [0.7659696 ]
 [0.3881391 ]
 [0.71102977]
 [0.7110287 ]
 [0.765958  ]
 [0.2981338 ]
 [0.71102875]
 [0.39108372]
 [0.7110285 ]
 [0.71102786]
 [0.7109137 ]
 [0.71102864]
 [0.7110284 ]
 [0.3830236 ]
 [0.7537285 ]
 [0.6448196 ]
 [0.76596457]
 [0.71102756]
 [0.38088366]
 [0.7168713 ]
 [0.64479214]
 [0.7108114 ]
 [0.73556507]
 [0.64479125]
 [0.6725571 ]
 [0.7565845 ]
 [0.74750835]
 [0.6447913 ]
 [0.3809575 ]
 [0.7652571 ]
 [0.7659657 ]
 [0.64479125]
 [0.7659617 ]
 [0.7658632 ]
 [0.64479136]
 [0.71102804]
 [0.76595896]
 [0.7659613 ]
 [0.71101564]
 [0.7280378 ]
 [0.7110273 ]
 [0.7659293 ]
 [0.64479154]
 [0.7109196 ]
 [0.727397  ]
 [0.64479125]
 [0.7110287 ]
 [0.5237914 ]
 [0.75696695]
 [0.7655316 ]
 [0.76596934]
 [0.7108184 ]
 [0.64618635]
 [0.6447914 ]
 [0.63785267]
 [0.3818133 ]
 [0.64480066]
 [0.71582514]
 [0.64

In [None]:
ids = np.zeros((1, 180), dtype=np.int32)
ids[1] = 

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
,validation_data=(np.array(x_test), np.array(y_test))