# In [None]:  (Jupyter notebook cell marker left over from export)
import synthesizer
from synthesizer import inference as sif
import numpy as np
import cv2, os
from glob import glob
import IPython

class Generator(object):
	"""Turns a list of video frame image files into audio and plays it inline.

	Wraps a ``sif.Synthesizer``: frames are read in overlapping windows, each
	window is converted to a mel spectrogram, the spectrograms are stitched
	together and inverted to a waveform with Griffin-Lim.
	"""

	def __init__(self):
		super(Generator, self).__init__()

		self.synthesizer = sif.Synthesizer(verbose=False)

	@staticmethod
	def _split_windows(frames, window_len, overlap):
		"""Return every full-length sliding window over ``frames``.

		Windows start at 0, ``window_len - overlap``, 2 * that, ... and only
		windows that fit entirely inside ``frames`` are returned, so trailing
		frames that do not fill a whole window are left out.

		Raises:
			ValueError: if ``overlap >= window_len``; the window start would
				never advance.
		"""
		step = window_len - overlap
		if step <= 0:
			raise ValueError('overlap ({}) must be smaller than the window length ({}).'
					.format(overlap, window_len))
		last_start = len(frames) - window_len
		return [frames[i : i + window_len] for i in range(0, last_start + 1, step)]

	def read_window(self, window_fnames):
		"""Load, resize and scale the frames of one window.

		Args:
			window_fnames: iterable of image file paths, in frame order.

		Returns:
			Array of the frames divided by 255, each resized to
			``hparams.img_size`` x ``hparams.img_size`` (T x H x W x 3 for
			colour images).

		Raises:
			FileNotFoundError: if ``cv2.imread`` cannot read one of the files.
		"""
		size = sif.hparams.img_size
		window = []
		for fname in window_fnames:
			img = cv2.imread(fname)
			if img is None:
				# Report the folder of the frame that failed to load.
				raise FileNotFoundError('Frames maybe missing in {}.'
						' Delete the video to stop this exception!'.format(os.path.dirname(fname)))

			window.append(cv2.resize(img, (size, size)))

		return np.asarray(window) / 255. # T x H x W x 3

	def vc(self, sample):
		"""Synthesize audio for ``sample['images']`` and play it in the notebook.

		The frame list is cut into windows of ``hp.T`` frames that overlap by
		``hp.overlap`` frames. Each window yields one mel spectrogram; for all
		but the first, the leading ``hp.mel_overlap`` columns are dropped
		before the pieces are joined along the time axis (axis 1).

		Args:
			sample: dict whose ``'images'`` entry is the ordered list of frame
				image paths.

		Raises:
			ValueError: if fewer than ``hp.T`` frames are supplied, so no
				complete window exists.
		"""
		hp = sif.hparams
		frame_paths = sample['images']
		all_windows = self._split_windows(frame_paths, hp.T, hp.overlap)
		if not all_windows:
			raise ValueError('Need at least {} frames to synthesize audio, got {}.'
					.format(hp.T, len(frame_paths)))

		mel_pieces = []
		for window_idx, window_fnames in enumerate(all_windows):
			images = self.read_window(window_fnames)

			s = self.synthesizer.synthesize_spectrograms(images)[0]
			# Later windows repeat the tail of the previous one; skip that part.
			mel_pieces.append(s if window_idx == 0 else s[:, hp.mel_overlap:])

		# One concatenation instead of re-copying the growing array per window.
		mel = np.concatenate(mel_pieces, axis=1)

		wav = self.synthesizer.griffin_lim(mel)
		# Peak-normalise to the int16 range; the 0.01 floor avoids dividing by ~0 on silence.
		wav *= 32767 / max(0.01, np.max(np.abs(wav)))
		# NOTE(review): playback rate is hard-coded to 16000 rather than read from hparams — confirm.
		IPython.display.display(IPython.display.Audio(wav,rate=16000,autoplay=True))
		#sif.audio.save_wav(wav, outfile, sr=hp.sample_rate)
		

def get_image_list(start,end,data_root):
	"""Return the paths of the numbered frame images that exist on disk.

	Checks ``<data_root>/<i>.jpg`` for every ``i`` from ``start`` to ``end``
	(both inclusive) and keeps the ones that exist, in ascending order.
	Missing frames are skipped, so the result may be shorter than
	``end - start + 1``; an empty range gives an empty list.

	Args:
		start: first frame number to look for.
		end: last frame number to look for (inclusive).
		data_root: directory holding the ``<i>.jpg`` frame files.

	Returns:
		List of existing frame paths.
	"""
	candidates = (os.path.join(data_root, '{}.jpg'.format(i)) for i in range(start, end + 1))
	# Plain existence test instead of glob(): the path is literal, and glob
	# would treat characters such as '[' or '*' in data_root as a pattern.
	return [path for path in candidates if os.path.lexists(path)]

def to_sec(idx):
	"""Return ``(idx + 1) / fps`` in seconds, with ``fps`` taken from ``sif.hparams.fps``.

	``idx`` is shifted by one before the division, i.e. it is treated as a
	zero-based frame index.
	"""
	return (idx + 1) / float(sif.hparams.fps)


if __name__ == '__main__':
	# Script entry point: load the speaker preset and checkpoint, then walk the
	# frame folder chunk by chunk, synthesizing and playing audio for each chunk.
	args = {
		'data_root' : 'Dataset/chem/9sDdlaBhtgk/cut-1',
		'results_root' : 'Dataset/chem/test_results1',
		'checkpoint' : 'checkpt/tacotron_model.ckpt-159000',
		'preset' : 'synthesizer/presets/chem.json',
	}

	## add speaker-specific parameters
	with open(args['preset']) as f:
		sif.hparams.parse_json(f.read())

	sif.hparams.set_hparam('eval_ckpt', args['checkpoint'])

	# results_root is a nested path: create missing parents too, and do not
	# fail if the directory is already there.
	os.makedirs(args['results_root'], exist_ok=True)

	# Frames consumed per pass. 165 = two 90-frame windows overlapping by 15
	# (assumes hp.T == 90 and hp.overlap == 15 — confirm against the preset).
	chunk_len = 165

	# Build the generator once; nothing about it depends on the current chunk.
	g = Generator()

	start = 0
	while True:
		# get_image_list treats `end` as inclusive, so this asks for exactly
		# chunk_len frames. Asking for one more (as before) loaded a frame that
		# no window used and then skipped it when moving on to the next chunk.
		end = start + chunk_len - 1
		frames = get_image_list(start,end,args['data_root'])
		if len(frames) < chunk_len:
			print("ENDEDDDDDDDDDDD")
			break

		sample = {'images' : frames}
		try:
			g.vc(sample)
		except KeyboardInterrupt:
			# SystemExit directly: the exit() builtin is only injected by `site`.
			raise SystemExit(0)
		except Exception as e:
			# Best effort: report the failure and carry on with the next chunk.
			print(e)

		start = end + 1
