## Build Sequence Processor

In [0]:
import nltk
import string
from string import punctuation
from keras.preprocessing.text import Tokenizer
nltk.download('punkt')
import collections
import pickle

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Class Functions:

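- init(): Sets the directory paths for the project, dataset, images and models, and the names of the caption file and the train/test split files.
- getTrainImageNames(): Reads the training split file and stores the list of training image names.
- preprocessData(): Cleans a single caption: lowercases it, tokenizes it, drops punctuation, non-alphabetic tokens and single-character tokens, then wraps the result in `startToken` / `endToken` markers.
- prepareData(): Reads the caption file and builds a dictionary whose keys are training image names and whose values are the lists of cleaned captions for each image; the dictionary is also pickled to the models directory.
- generateVocabulary(): Extracts the unique words from the cleaned captions along with their frequencies (most frequent first) and pickles the result.
- generateTokens(): Fits a Keras Tokenizer on the cleaned captions (more frequent words get lower indices), pickles it and returns it, so that captions can later be converted to integer sequences for the Decoder.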

In [0]:
class textPreparation:
  """Prepare Flickr8k caption text for training.

  Typical usage is ``prepareData()`` followed by ``generateVocabulary()``
  and/or ``generateTokens()``; the latter two read ``self.descDict``, which
  only exists after ``prepareData()`` has run.

  Attributes set by the methods:
    trainImgNames: list of training image names (``getTrainImageNames``).
    descDict: dict mapping image name -> list of cleaned captions
      (``prepareData``).
    vocabulary: list of ``(word, count)`` tuples, most frequent first
      (``generateVocabulary``).
  """

  def __init__(self):
    """Set the directory paths and the names of the dataset files."""
    self.dirPath = "/content/drive/My Drive/NLP_Project/ProjectCode"
    self.dataPath = "/content/drive/My Drive/NLP_Project/ProjectCode/dataset"
    self.imgPath = "/content/drive/My Drive/NLP_Project/ProjectCode/dataset/Flicker8k_Dataset"
    self.modelsPath = "/content/drive/My Drive/NLP_Project/ProjectCode/models"
    # Alternative caption file (lemmatized): "Flickr8k.lemma.token.txt"
    self.descFile = "Flickr8k.token.txt"
    self.trainDataFile = "Flickr_8k.trainImages.txt"
    self.testDataFile = "Flickr_8k.testImages.txt"

  def getTrainImageNames(self):
    """Read the training split file into ``self.trainImgNames``.

    One image name is expected per line. Surrounding whitespace is removed
    and blank lines (e.g. the one produced by a trailing newline) are
    skipped, so the list never contains empty names.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(self.dataPath+"/"+self.trainDataFile, 'r') as file:
      self.trainImgNames = [line.strip() for line in file if line.strip()]

  def preprocessData(self,description):
    """Clean one caption and wrap it in start/end markers.

    The caption is stripped, lowercased and tokenized with
    ``nltk.word_tokenize``; only purely alphabetic tokens longer than one
    character are kept.

    Args:
      description: raw caption text.

    Returns:
      ``'startToken ' + cleaned caption + ' endToken'``.
    """
    descTokens = nltk.word_tokenize(description.strip().lower())
    # isalpha() is False for every punctuation token (and for the empty
    # string), so no separate punctuation test is needed.
    keptWords = [descWord for descWord in descTokens
                 if descWord.isalpha() and len(descWord) > 1]
    return 'startToken ' + ' '.join(keptWords) + ' endToken'

  @staticmethod
  def _splitCaptionLine(line):
    """Split ``<image name>#<index><caption>`` into ``(image name, caption)``.

    Returns ``None`` for lines that do not have a non-empty image name
    followed by ``#`` and at least one digit.
    """
    imgName, marker, rest = line.rstrip("\n").partition("#")
    imgName = imgName.strip()
    description = rest.lstrip("0123456789")
    hasIndex = len(description) < len(rest)
    if not marker or not imgName or not hasIndex:
      return None
    return imgName, description

  def prepareData(self):
    """Build and pickle the training caption dictionary.

    Reads the caption file line by line, keeps the captions whose image name
    appears in the training split, cleans each one with ``preprocessData``
    and groups them by image name in ``self.descDict`` (file order is
    preserved). The dictionary is written to
    ``<modelsPath>/trainDescription.pkl``.

    Captions are grouped by the image name found on each line, so blank
    lines, malformed lines, or images with a number of captions other than
    five do not disturb the parsing of the remaining lines.
    """
    self.getTrainImageNames()
    # A set gives constant-time membership tests for every caption line.
    trainNames = set(self.trainImgNames)

    self.descDict = {}
    with open(self.dataPath+"/"+self.descFile, 'r') as file:
      for line in file:
        parsed = self._splitCaptionLine(line)
        if parsed is None:
          continue
        imgName, description = parsed
        if imgName in trainNames:
          self.descDict.setdefault(imgName, []).append(self.preprocessData(description))

    with open(self.modelsPath+'/trainDescription.pkl', 'wb') as outFile:
      pickle.dump(self.descDict, outFile)
    print("done!")

  def generateVocabulary(self):
    """Count word frequencies over all stored captions and pickle them.

    Every caption in ``self.descDict`` is tokenized with
    ``nltk.word_tokenize``. The result, a list of ``(word, count)`` tuples
    ordered from most to least frequent, is stored in ``self.vocabulary``
    and written to ``<modelsPath>/trainVocabulary.pkl``.
    """
    wordCounts = collections.Counter()
    for captions in self.descDict.values():
      for caption in captions:
        wordCounts.update(nltk.word_tokenize(caption))

    self.vocabulary = wordCounts.most_common()

    with open(self.modelsPath+'/trainVocabulary.pkl', 'wb') as outFile:
      pickle.dump(self.vocabulary, outFile)
    print("done!")

  def generateTokens(self):
    """Fit a Keras ``Tokenizer`` on all stored captions, pickle and return it.

    Returns:
      The fitted tokenizer; it is also written to
      ``<modelsPath>/trainTokens.pkl``.
    """
    descriptions = [caption
                    for captions in self.descDict.values()
                    for caption in captions]

    tokenizer = Tokenizer()
    # lower integer means more frequent word (often the first few are stop words because they appear a lot).
    tokenizer.fit_on_texts(descriptions)

    with open(self.modelsPath+'/trainTokens.pkl', 'wb') as outFile:
      pickle.dump(tokenizer, outFile)
    return tokenizer

In [0]:
# Mount Google Drive in Colab; the dataset and models paths used by this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Create the text-preparation helper. Note that the variable shares its name
# with the class's prepareData() method.
prepareData = textPreparation()

In [0]:
# Build the image-name -> cleaned-captions dictionary for the training images
# and pickle it to the models directory (prints "done!" when finished).
prepareData.prepareData()

done!


In [0]:
# Sanity check: reload the pickled caption dictionary and show a short summary
# rather than printing every entry (the full dump is too large to read).
# NOTE: pickle.load can execute arbitrary code; only load files you created.
file = "/content/drive/My Drive/NLP_Project/ProjectCode/models/trainDescription.pkl"
with open(file, 'rb') as f:
    data = pickle.load(f)

print(len(data), "images with captions")
for imgName, captions in list(data.items())[:3]:
    print(imgName, captions)

Output hidden; open in https://colab.research.google.com to view.

In [0]:
# Count word frequencies over the stored captions and pickle the resulting
# vocabulary (requires prepareData() to have been run first).
prepareData.generateVocabulary()

done!


In [0]:
# Sanity check: reload the pickled vocabulary, a list of (word, count) tuples
# ordered from most to least frequent, and show only the top entries instead
# of printing the whole list.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
file = "/content/drive/My Drive/NLP_Project/ProjectCode/models/trainVocabulary.pkl"
with open(file, 'rb') as f:
    data = pickle.load(f)

print(data[:20])



In [0]:
# Size of the object currently bound to `data`; when the cells are run in
# order this is the vocabulary list loaded from trainVocabulary.pkl.
len(data)

7266

In [0]:
# Fit a Keras Tokenizer on the stored captions; generateTokens() also pickles
# it to the models directory. More frequent words receive lower indices.
tokenizer = prepareData.generateTokens()

In [0]:
# Reload the pickled tokenizer and spot-check it on a few sample inputs.
file = "/content/drive/My Drive/NLP_Project/ProjectCode/models/trainTokens.pkl"
sampleTexts = [
    "a man in street racer armor be examine the tire of another racer motorbike",
    "a cyclist do a flip in the air",
    "startToken",
    "<",
]
with open(file, 'rb') as f:
    data = pickle.load(f)

    # Encode each sample separately so every one prints its own sequence.
    # The last two probe the start marker and a bare "<".
    for sampleText in sampleTexts:
        print(data.texts_to_sequences([sampleText]))

[[10, 3, 72, 698, 4487, 494, 2742, 4, 460, 11, 70, 698, 938]]
[[554, 650, 475, 3, 4, 63]]
[[1]]
[[]]


In [0]:
# Show the repr of whatever is bound to `data`; after the tokenizer reload
# cell this is the unpickled Tokenizer object.
print(data)

<keras_preprocessing.text.Tokenizer object at 0x7f6a5a90a710>
