In [1]:
## Connect to google colab

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## Import data reading libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
## Load data

amazon_data = pd.read_csv("/content/drive/MyDrive/Amazon/Amazon_100000_data/Cleaned_Amazon_data.csv")
amazon_data.head()

Unnamed: 0,Text,Score
0,product arrived labeled jumbo salted peanut pe...,1
1,cat happily eating felidae platinum two year g...,1
2,candy red flavor plan chewy would never buy,1
3,oatmeal good mushy soft like quaker oat way,1
4,arrived day stale could eat bag,1


In [5]:
## Taking only text data

text = amazon_data.Text

In [6]:
## create the tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

In [8]:
## fit the tokenizer on the documents

tokenizer.fit_on_texts(text)

In [9]:
## summarize what was learned

print("word_counts: \n",tokenizer.word_counts)
print("\n\ndocument_count: \n",tokenizer.document_count)
print("\n\nword_index: \n",tokenizer.word_index)
print("\n\nword_docs: \n",tokenizer.word_docs)

word_counts: 


document_count: 
 200000


word_index: 


word_docs: 


In [10]:
## integer encode sequences

text_numeric = tokenizer.texts_to_sequences(text)
print(text_numeric[0])

[3, 252, 1471, 4771, 1731, 192, 192, 138, 93, 893, 3614, 89, 2231, 1156, 1688, 5713, 3, 4771]


In [16]:
## pad sequences with 0 values.
## here we using padding = 'pre', cause if two sentence after converting to numerical value, is not of same length , we are using zero before sentences and make them of equal length.
## if we put padding = 'post', then zero will add after sentences. 

from tensorflow.keras.preprocessing.sequence import pad_sequences

sent_length = max([len(i) for i in text_numeric])
pad_seqence_text = pad_sequences(text_numeric, maxlen=sent_length, padding = 'post')
print(pad_seqence_text)

[[   3  252 1471 ...    0    0    0]
 [  42 2093  108 ...    0    0    0]
 [ 219  400    5 ...    0    0    0]
 ...
 [   3   13 1755 ...    0    0    0]
 [ 170  177  627 ...    0    0    0]
 [  29  220   43 ...    0    0    0]]


In [23]:
import pickle

# saving
with open('/content/drive/MyDrive/Amazon/Amazon_100000_data/tokenizer_1.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('/content/drive/MyDrive/Amazon/Amazon_100000_data/tokenizer_1.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [24]:
import io, json

# saving
tokenizer_json = tokenizer.to_json()
with io.open('/content/drive/MyDrive/Amazon/Amazon_100000_data/tokenizer_1.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

# loading
from tensorflow.keras.preprocessing.text import tokenizer_from_json
with open('/content/drive/MyDrive/Amazon/Amazon_100000_data/tokenizer_1.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

In [27]:
## save data as NPZ file

np.savez_compressed("/content/drive/MyDrive/Amazon/Amazon_100000_data/Amazon_text_to_numeric.npz", a=np.array(pad_seqence_text), b=np.array(amazon_data.Score))

In [31]:
## Load data

loaded = np.load("/content/drive/MyDrive/Amazon/Amazon_100000_data/Amazon_text_to_numeric.npz")
text_numeric = loaded['a']
print(text_numeric)

[[   3  252 1471 ...    0    0    0]
 [  42 2093  108 ...    0    0    0]
 [ 219  400    5 ...    0    0    0]
 ...
 [   3   13 1755 ...    0    0    0]
 [ 170  177  627 ...    0    0    0]
 [  29  220   43 ...    0    0    0]]


In [32]:
score = loaded['b']
print(score)

[1 1 1 ... 5 5 5]
