-
Notifications
You must be signed in to change notification settings - Fork 0
/
vectorization.py
38 lines (29 loc) · 1.5 KB
/
vectorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
#from exploring_data import plot_sample_length_distribution
from loading_data import load_data
def sequence_vectorize(train_texts, val_texts, *, top_k=20000,
                       max_sequence_length=500):
    """Vectorize texts as padded sequences of integer token indices.

    The vocabulary is fitted on the training texts only and is then used to
    encode both the training and the validation texts. Both outputs are
    padded/truncated to one common length: the length of the longest
    training sequence, capped at ``max_sequence_length``.

    Args:
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
        top_k: vocabulary size limit, passed to the tokenizer as
            ``num_words``.
        max_sequence_length: limit on the length of text sequences.
            Sequences longer than this will be truncated.

    Returns:
        x_train, x_val, word_index: vectorized training and validation
        texts and the tokenizer's word index dictionary.

    Raises:
        ValueError: if ``train_texts`` produces no sequences, because the
            common padded length is derived from the training set.
    """
    # Build the vocabulary from the training texts only.
    tokenizer = text.Tokenizer(num_words=top_k)
    tokenizer.fit_on_texts(train_texts)

    # Encode every text as a list of integer token indices.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    if not x_train:
        # Fail with a clear message instead of the opaque
        # "max() arg is an empty sequence" raised by max() on an empty list.
        raise ValueError(
            "train_texts must contain at least one text to vectorize"
        )

    # Common sequence length: the longest training sequence, capped so that
    # longer texts get truncated to max_sequence_length tokens.
    longest_train_sequence = max(len(seq) for seq in x_train)
    max_length = min(longest_train_sequence, max_sequence_length)

    # Shorter sequences are padded up to max_length (zeros at the front with
    # the pad_sequences defaults); longer ones are truncated to it.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
# Column names handed to load_data together with the CSV path.
columns = ['0', 'category', 'title', 'desc']

# load_data returns a (train, test) pair, each a (texts, labels) pair.
train_split, test_split = load_data(
    "/Users/archana/web_classification/test.csv",
    columns,
)
train_text, train_labels = train_split
test_text, test_labels = test_split

# Vectorize both text sets and print the (x_train, x_val, word_index) result.
vectorized = sequence_vectorize(train_text, test_text)
print(vectorized)