## Build Sequence Processor

In [1]:
import nltk
import string
from string import punctuation
from keras.preprocessing.text import Tokenizer
nltk.download('punkt')
import collections
import pickle

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Class Functions:

- init(): Sets the directory paths for the images and the captions, Inilialize the VGG model.
- getTrainImageNames(): Creates a list of all the image names.
- preprocessData: Clean the data. Lowercase the captions, remove punctuations and keep data alphanumeric.
- prepareData(): Method for reading descriptions and adding it to dictionary object where key is image name and value is list of its corresponding descriptions.
- generateVocabulary(): Extract unique words from descriptions along with their frequencies.
- generateTokens(): Tokenize the descriptions and transform them to ranked sequences to be passed as input to the Decoder.

In [0]:
class textPreparation:
  def __init__(self):
    self.dirPath = "/content/drive/My Drive/NLP_Project/ProjectCode"
    self.dataPath = "/content/drive/My Drive/NLP_Project/ProjectCode/dataset"
    self.imgPath = "/content/drive/My Drive/NLP_Project/ProjectCode/dataset/Flicker8k_Dataset"
    self.modelsPath = "/content/drive/My Drive/NLP_Project/ProjectCode/models"
    self.descFile = "Flickr8k.lemma.token.txt"
    self.trainDataFile = "Flickr_8k.trainImages.txt"
    self.testDataFile = "Flickr_8k.testImages.txt"
  
  def getTrainImageNames(self):
    file = open(self.dataPath+"/"+self.trainDataFile, 'r')
    self.trainImgNames = []
    for line in file.read().split("\n"):
      self.trainImgNames.append(line)
    
    file.close()
    
  def preprocessData(self,description):
    description = description.strip().lower()
    descTokens = nltk.word_tokenize(description)
    description = ' '.join(descWord for descWord in descTokens if descWord not in punctuation and descWord.isalpha())
    return description
  
  def prepareData(self):
    self.getTrainImageNames()
    
    # open the file as read only
    file = open(self.dataPath+"/"+self.descFile, 'r')
    # read all text
    j = 0
    
    descList = []
    self.descDict = {}
    
    descriptions = dict()
    for line in file.read().split('\n'):
      imgName = ""
      # split id from description
      if(len(line.strip())!=0 and j<5):
        desc = line.split("#"+str(j))
        
        if(len(desc) == 2):
          imgName = desc[0]
          description = desc[1]
          if imgName in self.trainImgNames:
            descList.append(self.preprocessData(description))
            
      j+=1
           
      if j == 5:
        if len(descList) != 0:
          self.descDict[imgName] = descList
          
        j = 0
        descList = []
    
    # close the file
    file.close()
    
    pickle.dump(self.descDict, open(self.modelsPath+'/trainDescription.pkl', 'wb'))
    print("done!")
    
  def generateVocabulary(self):
    words = []
    tmpVocab = []
    for key,vals in self.descDict.items():
      for val in vals:
        tmpVocab.append(nltk.word_tokenize(val))
      
    for i in range(len(tmpVocab)):
      words.extend(tmpVocab[i])
    
    self.vocabulary = collections.Counter(words).most_common()
    
    pickle.dump(self.vocabulary, open(self.modelsPath+'/trainVocabulary.pkl', 'wb'))
    print("done!")
    
  def generateTokens(self):
    tokenizer = Tokenizer()
    
    descriptions = []
    for key,vals in self.descDict.items():
      for val in vals:
        descriptions.append(val)
        
    tokenizer.fit_on_texts(descriptions)
 
    pickle.dump(tokenizer, open(self.modelsPath+'/trainTokens.pkl', 'wb'))
    return tokenizer

In [0]:
prepareData = textPreparation()

In [4]:
prepareData.prepareData()

done!


In [5]:
file = "/content/drive/My Drive/NLP_Project/ProjectCode/models/trainDescription.pkl"
with open(file, 'rb') as f:
    data = pickle.load(f)
    print(data)

{'1305564994_00513f9a5b.jpg': ['a man in street racer armor be examine the tire of another racer motorbike', 'two racer drive a white bike down a road', 'two motorist be ride along on their vehicle that be oddly design and color', 'two person be in a small race car drive by a green hill', 'two person in race uniform in a street car'], '1351764581_4d4fb1b40f.jpg': ['a firefighter extinguish a fire under the hood of a car', 'a fireman spray water into the hood of small white car on a jack', 'a fireman spray inside the open hood of small white car on a jack', 'a fireman use a firehose on a car engine that be up on a carjack', 'firefighter use water to extinguish a car that be on fire'], '1358089136_976e3d2e30.jpg': ['a boy sand surf down a hill', 'a man be attempt to surf down a hill make of sand on a sunny day', 'a man be slide down a huge sand dune on a sunny day', 'a man be surf down a hill of sand', 'a young man in short and be snowboard under a bright blue sky'], '1362128028_8422d53d

In [6]:
prepareData.generateVocabulary()

done!


In [7]:
file = "/content/drive/My Drive/NLP_Project/ProjectCode/models/trainVocabulary.pkl"
with open(file, 'rb') as f:
    data = pickle.load(f)
    print(data)

[('a', 51584), ('in', 14085), ('be', 9835), ('on', 8008), ('the', 7903), ('dog', 7779), ('and', 6674), ('man', 6167), ('with', 5763), ('of', 4967), ('two', 4244), ('boy', 3119), ('girl', 3050), ('woman', 3023), ('white', 2922), ('person', 2893), ('black', 2854), ('run', 2596), ('play', 2466), ('wear', 2399), ('stand', 2347), ('to', 2304), ('jump', 2151), ('at', 2102), ('water', 2050), ('young', 2009), ('brown', 1995), ('red', 1977), ('child', 1957), ('an', 1914), ('sit', 1734), ('his', 1711), ('blue', 1668), ('through', 1547), ('walk', 1503), ('shirt', 1464), ('while', 1443), ('hold', 1416), ('ball', 1380), ('down', 1365), ('little', 1266), ('ride', 1243), ('grass', 1225), ('snow', 1181), ('look', 1120), ('over', 1062), ('three', 1047), ('field', 1000), ('front', 995), ('small', 989), ('yellow', 930), ('green', 924), ('large', 921), ('group', 913), ('by', 903), ('up', 903), ('one', 870), ('her', 862), ('mouth', 776), ('air', 769), ('player', 761), ('into', 757), ('rock', 757), ('near',

In [0]:
# generating sequence of vocabulary word ranks
tokenizer = prepareData.generateTokens()

In [9]:
file = "/content/drive/My Drive/NLP_Project/ProjectCode/models/trainTokens.pkl"
with open(file, 'rb') as f:
    data = pickle.load(f)
    
    # testing tokenizer output for sample sentences
    print(data.texts_to_sequences(["a man in street racer armor be examine the tire of another racer motorbike"]))
    
    print(data.texts_to_sequences(["a cyclist do a flip in the air"]))

[[1, 8, 2, 68, 510, 3313, 3, 1266, 5, 364, 10, 69, 510, 711]]
[[1, 408, 88, 1, 345, 2, 5, 60]]
