<a href="https://colab.research.google.com/github/afiaka87/deep-daze/blob/notebook/Text2Image_Siren%2BPhrase_Min.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text to Image

Based on: 
[CLIP](https://github.com/openai/CLIP) + [SIREN](https://github.com/vsitzmann/siren), colabs by [Ryan Murdock](https://rynmurdock.github.io/) and [@tg-bomze](https://github.com/tg-bomze)  
by [eps696](https://github.com/eps696)

## Features (optional)
* using image and/or text as prompts
* processing input coords with [Fourier feature mapping](https://github.com/tancik/fourier-feature-networks), making elements finer
* a few sampling modes to play with composition


**Run this cell after each session restart**

In [None]:
#@title Run then restart.

import subprocess
CUDA_version = [s for s in subprocess.check_output(["nvcc", "--version"]).decode("UTF-8").split(", ") if s.startswith("release")][0].split(" ")[-1]
print("CUDA version:", CUDA_version)

if CUDA_version == "10.0":
    torch_version_suffix = "+cu100"
elif CUDA_version == "10.1":
    torch_version_suffix = "+cu101"
elif CUDA_version == "10.2":
    torch_version_suffix = ""
else:
    torch_version_suffix = "+cu110"

!pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex


In [2]:
#@title General setup



try: 
  !pip3 install googletrans==3.1.0a0
  from googletrans import Translator, constants
  # from pprint import pprint
  translator = Translator()
except: pass
!pip install ftfy

import os
import time
import random
import imageio
import numpy as np
import PIL
from skimage import exposure
from base64 import b64encode

import torch
import torch.nn as nn
import torchvision

from IPython.display import HTML, Image, display, clear_output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import ipywidgets as ipy
# import glob
from google.colab import output, files

# from google.colab import files
import warnings
warnings.filterwarnings("ignore")

# Fetch the CLIP sources and switch into the checkout so `import clip`
# resolves to the cloned package.
!git clone https://github.com/openai/CLIP.git
%cd /content/CLIP/
import clip
# `perceptor` is the CLIP model used for all image/text encodings below.
perceptor, preprocess = clip.load('ViT-B/32')

# Output folders. The paths are relative, so they are created under the
# directory entered by the %cd above.
workdir = '_out'
tempdir = os.path.join(workdir, 'ttt')
os.makedirs(tempdir, exist_ok=True)

# Wipe the install/clone log from the cell output.
clear_output()

# # Libs

class SineLayer(nn.Module):
  """Linear layer followed by a sine activation: sin(omega_0 * linear(x)).

  Args:
    in_features: size of the last input dimension.
    out_features: size of the last output dimension.
    bias: whether the linear layer has a bias term.
    is_first: selects the weight range used for the first layer of a network.
    omega_0: frequency factor applied before the sine.
  """
  def __init__(self, in_features, out_features, bias=True, is_first=False, omega_0=30):
    super().__init__()
    self.in_features = in_features
    self.is_first = is_first
    self.omega_0 = omega_0
    self.linear = nn.Linear(in_features, out_features, bias=bias)
    self.init_weights()

  def init_weights(self):
    """Redraw the linear weights uniformly in [-bound, bound]; the bias keeps its default init."""
    if self.is_first:
      bound = 1 / self.in_features
    else:
      bound = np.sqrt(6 / self.in_features) / self.omega_0
    with torch.no_grad():
      self.linear.weight.uniform_(-bound, bound)

  def forward(self, input):
    """Return sin(omega_0 * linear(input))."""
    projected = self.linear(input)
    return torch.sin(self.omega_0 * projected)
    
class Siren(nn.Module):
  """Stack of SineLayers that maps per-pixel coordinate features to an image.

  Args:
    in_features: size of each input coordinate/feature vector.
    hidden_features: width of every hidden layer.
    hidden_layers: number of hidden SineLayers after the first one.
    out_features: number of output channels per pixel.
    outermost_linear: if True the last layer is a plain nn.Linear,
      otherwise it is another SineLayer.
    first_omega_0: omega_0 of the first layer.
    hidden_omega_0: omega_0 of the hidden (and optional last sine) layers;
      also sets the weight range of the final linear layer.
  """
  def __init__(self, in_features, hidden_features, hidden_layers, out_features, outermost_linear=True, 
                first_omega_0=30, hidden_omega_0=30.):
    super().__init__()
    # Remembered so forward() can reshape to the real channel count.
    self.out_features = out_features

    layers = [SineLayer(in_features, hidden_features, is_first=True, omega_0=first_omega_0)]
    for _ in range(hidden_layers):
      layers.append(SineLayer(hidden_features, hidden_features, is_first=False, omega_0=hidden_omega_0))

    if outermost_linear:
      final_linear = nn.Linear(hidden_features, out_features)
      with torch.no_grad():
        lim = np.sqrt(6 / hidden_features) / hidden_omega_0
        final_linear.weight.uniform_(-lim, lim)
      layers.append(final_linear)
    else:
      layers.append(SineLayer(hidden_features, out_features, is_first=False, omega_0=hidden_omega_0))

    self.net = nn.Sequential(*layers)

  def forward(self, coords):
    """Run the network on `coords` and return an image tensor.

    Reads the module-level `sideY` and `sideX` for the output size, so
    `coords` must hold exactly sideY*sideX rows.

    Returns:
      Tensor of shape (1, out_features, sideY, sideX).
    """
    # Run on whatever device the weights live on instead of assuming CUDA.
    device = next(self.parameters()).device
    coords = coords.clone().detach().requires_grad_(True)
    output = self.net(coords.to(device))
    # Use the configured channel count rather than a hard-coded 3.
    return output.view(1, sideY, sideX, self.out_features).permute(0, 3, 1, 2)#.sigmoid_()

def get_mgrid(sideX, sideY):
  """Build a flat list of 2D coordinates covering [-1, 1] x [-1, 1].

  Column 0 steps through `sideY` evenly spaced values and varies fastest;
  column 1 steps through `sideX` values. Returns a numpy array of shape
  (sideX * sideY, 2).
  """
  fast_axis = np.linspace(-1, 1, num=sideY)
  slow_axis = np.linspace(-1, 1, num=sideX)
  # meshgrid's default 'xy' indexing yields arrays of shape (len(slow_axis), len(fast_axis))
  fast_grid, slow_grid = np.meshgrid(fast_axis, slow_axis)
  return np.stack([fast_grid, slow_grid], axis=-1).reshape(-1, 2)

# Preprocessing coords with Fourier feature mapping
def fourierfm(xy, map=256, fourier_scale=4, mapping_type='gauss'):
  """Lift 2D coordinates to sin/cos Fourier features.

  Args:
    xy: array whose last dimension is 2.
    map: number of random frequencies (used by the 'gauss' mapping only).
    fourier_scale: multiplier applied to the random frequencies.
    mapping_type: 'gauss' draws a random (2, map) projection; any other
      value uses the 2x2 identity.

  Returns:
    Array with sin features followed by cos features on the last axis
    (2*map wide for 'gauss', 4 wide otherwise).
  """
  if mapping_type == 'gauss':
    # random Gaussian frequencies, scaled
    B = np.random.randn(2, map) * fourier_scale
  else:
    # basic mapping
    B = np.eye(2).T

  x_proj = (2.*np.pi*xy) @ B
  features = np.concatenate([np.sin(x_proj), np.cos(x_proj)], axis=-1)
  print(' mapping input:', xy.shape, 'output', features.shape)
  return features

def slice_imgs(imgs, count, transform=None, uniform=False):
  """Cut `count` random square crops out of every image and resize them to 224x224.

  The random sizes and offsets are drawn once and reused for every image in
  `imgs`. Each crop side lies between 0.5 and 0.98 of that image's shorter
  side. With `uniform` set, every image is first tile-padded to twice its
  size and the crops are taken from the padded canvas.

  Args:
    imgs: list of image tensors; the last two dims are treated as height, width.
    count: number of crops per image.
    transform: optional callable applied to each resized crop.
    uniform: crop from the tile-padded image instead of the original.

  Returns:
    List with one tensor per input image, the crops concatenated along dim 0.
  """
  def lerp(t, lo, hi):
    return t * (hi-lo) + lo

  rnd_size = torch.rand(count)
  rnd_offx = torch.rand(count)
  rnd_offy = torch.rand(count)

  sz = [img.shape[2:] for img in imgs]
  sz_min = [np.min(s) for s in sz]
  if uniform is True:
    imgs = [pad_up_to(img, [2*s[0], 2*s[1]], type='centr') for img, s in zip(imgs, sz)]

  sliced = []
  for img, dims, short_side in zip(imgs, sz, sz_min):
    cuts = []
    for c in range(count):
      csize = lerp(rnd_size[c], 0.5*short_side, 0.98*short_side).int()
      if uniform is True:
        # offsets address the padded canvas, which is twice the original size
        offsetx = lerp(rnd_offx[c], dims[1] - csize, 2* dims[1] - csize).int()
        offsety = lerp(rnd_offy[c], dims[0] - csize, 2* dims[0] - csize).int()
      else:
        offsetx = lerp(rnd_offx[c], 0, dims[1] - csize).int()
        offsety = lerp(rnd_offy[c], 0, dims[0] - csize).int()
      cut = img[:, :, offsety:offsety + csize, offsetx:offsetx + csize]
      cut = torch.nn.functional.interpolate(cut, (224,224), mode='bilinear')
      if transform is not None:
        cut = transform(cut)
      cuts.append(cut)
    sliced.append(torch.cat(cuts, 0))
  return sliced

def makevid(seq_dir, size=None):
  out_sequence = seq_dir + '/%03d.jpg'
  out_video = seq_dir + '.mp4'
  !ffmpeg -y -v warning -i $out_sequence $out_video
  data_url = "data:video/mp4;base64," + b64encode(open(out_video,'rb').read()).decode()
  wh = '' if size is None else 'width=%d height=%d' % (size, size)
  return """<video %s controls><source src="%s" type="video/mp4"></video>""" % (wh, data_url)

# Tiles an array around two points, allowing for pad lengths greater than the input length
# adapted from https://discuss.pytorch.org/t/symmetric-padding/19866/3
def tile_pad(xt, padding):
  """Pad the last two dims of `xt` by repeating (wrapping around) its content.

  Args:
    xt: array/tensor whose last two dims are height and width.
    padding: (left, right, top, bottom) pad sizes; each may exceed the
      corresponding input size.

  Returns:
    `xt` indexed on its last two dims with wrapped coordinates, i.e. of
    size (top + h + bottom, left + w + right) on those dims.
  """
  height, width = xt.shape[-2:]
  left, right, top, bottom = padding

  def wrap(idx, length):
    # Wrap indices into [0, length): shift by half a pixel, take the
    # remainder, shift back, then return to the integer index dtype.
    wrapped = np.remainder(idx + 0.5, (length - 0.5) + 0.5) - 0.5
    return wrapped.astype(idx.dtype)

  cols = wrap(np.arange(-left, width+right), width)
  rows = wrap(np.arange(-top, height+bottom), height)
  col_grid, row_grid = np.meshgrid(cols, rows)
  return xt[..., row_grid, col_grid]

def pad_up_to(x, size, type='centr'):
  """Tile-pad the last two dims of `x` up to `size` (height, width).

  Args:
    x: array/tensor with height and width as dims 2 and 3.
    size: target (height, width).
    type: any string containing 'side' puts all padding after the content
      (right/bottom); anything else splits it around the content.

  Returns:
    `x` itself when it already has the requested size, otherwise the
    result of tile_pad.
  """
  if list(x.shape[2:]) == list(size):
    return x
  current = x.shape[2:][::-1]  # reversed, so width comes first like tile_pad expects
  padding = []
  for axis, target in enumerate(size[::-1]):
    gap = target - current[axis]
    # 'side': nothing in front; otherwise the smaller half goes in front
    before = 0 if 'side' in type.lower() else gap // 2
    padding = padding + [before, gap - before]
  return tile_pad(x, padding)

class ProgressBar(object):
  """Notebook progress indicator: an ipywidgets progress bar with a text label.

  Args:
    task_num: expected number of steps; a value <= 0 switches the label to
      a plain counter without an estimated finish time.
  """
  def __init__(self, task_num=10):
    self.task_num = task_num
    self.completed = 0
    self.pbar = ipy.IntProgress(min=0, max=task_num, bar_style='')
    self.labl = ipy.Label()
    display(ipy.HBox([self.pbar, self.labl]))
    self.start()

  def start(self, task_num=None):
    """Reset the label and the start timestamp, optionally with a new step count."""
    if task_num is not None:
      self.task_num = task_num
    self.labl.value = '0/{}'.format(self.task_num) if self.task_num > 0 else 'completed: 0, elapsed: 0s'
    self.start_time = time.time()

  def upd(self, *p, **kw):
    """Count one finished step and refresh the bar and label. Arguments are ignored."""
    self.completed += 1
    done, total = self.completed, self.task_num
    # the tiny offset keeps `elapsed` strictly positive for the divisions below
    elapsed = time.time() - self.start_time + 0.0000000000001
    fps = done / elapsed if elapsed>0 else 0
    if total > 0:
      projected_end = self.start_time + total * elapsed / float(done)
      finaltime = time.asctime(time.localtime(projected_end))
      fin = ' end %s' % finaltime[11:16]  # HH:MM slice of the asctime string
      percentage = done / float(total)
      eta = int(elapsed * (1 - percentage) / percentage + 0.5)
      self.labl.value = '{}/{}, rate {:.3g}s, time {}s, left {}s, {}'.format(done, total, 1./fps, shortime(elapsed), shortime(eta), fin)
    else:
      self.labl.value = 'completed {}, time {}s, {:.1f} steps/s'.format(done, int(elapsed + 0.5), fps)
    self.pbar.value += 1
    if done == total:
      self.pbar.bar_style = 'success'
    return

def time_days(sec):
  """Format a duration in seconds as 'Dd H:MM:SS'."""
  days = sec / 86400
  hours = (sec / 3600) % 24
  minutes = (sec / 60) % 60
  seconds = sec % 60
  return '%dd %d:%02d:%02d' % (days, hours, minutes, seconds)
def time_hrs(sec):
  """Format a duration in seconds as 'H:MM:SS'."""
  hours = sec / 3600
  minutes = (sec / 60) % 60
  seconds = sec % 60
  return '%d:%02d:%02d' % (hours, minutes, seconds)
def shortime(sec):
  """Format a duration in seconds using the shortest fitting layout.

  Under a minute: 'S'; under an hour: 'M:SS'; under a day: time_hrs();
  otherwise time_days().
  """
  if sec < 60:
    return '%d' % (sec)
  if sec < 3600:
    return '%d:%02d' % ((sec/60)%60, sec%60)
  if sec < 86400:
    return time_hrs(sec)
  return time_days(sec)


# List the GPU(s) assigned to this runtime, then signal that setup finished.
!nvidia-smi -L
print('\nDone!')

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-4cb44cc0-8776-c2ca-b472-3af210152622)

Done!


Type some text to hallucinate it, or upload some image to neuremix it.  
Or use both, why not.

The phrase in `text_to_minimize` is penalized more and more strongly as the iteration count goes up, so that it does not impact the original phrase too much.

In [3]:
#@title Input

# Main prompt: the generated image is pushed towards this text.
text = "a detailed photo of a beautiful sunny day in the park" #@param {type:"string"}
# Phrase the generated image is pushed away from during training.
text_to_minimize = "blur" #@param {type:"string"}
translate = False #@param {type:"boolean"}
#@markdown or 
upload_image = False #@param {type:"boolean"}

# `translator` only exists if the googletrans setup in the General setup cell succeeded.
if translate:
  text = translator.translate(text, dest='en').text
# Opens the Colab upload dialog; `uploaded` is read later by the Generate cell.
if upload_image:
  uploaded = files.upload()

Decrease `samples` (amount of random image cuts, trained per step) and/or `siren_layers` (depth/quality of the generator network), if facing OOM for higher resolutions. Try different `fourier_scale` values (shift to details), if using Fourier mapping option.  
`sync_cut` option kinda makes it follow uploaded image composition (if there's any). `uniform` results in a more randomly tiled texture.

In [None]:
#@title Generate
from google.colab import drive
from datetime import datetime
import shutil

dtNow = datetime.now()

drive.mount('/content/GDrive')
# Timestamped folder name on Google Drive for this run.
clipsDir = '/content/GDrive/MyDrive/T2I ' + dtNow.strftime("%Y-%m-%d %H%M")

# Remove frames left over from a previous run. This must target the path
# stored in the `tempdir` variable (a shell `rm -rf tempdir` would only
# delete a folder literally named "tempdir"); otherwise stale frames end
# up in the next video. Recreate the folder so new frames can be saved.
shutil.rmtree(tempdir, ignore_errors=True)
os.makedirs(tempdir, exist_ok=True)

# Output image width and height in pixels (read by Siren.forward).
sideX = 640 #@param {type:"integer"}
sideY = 480 #@param {type:"integer"}
# Passed to slice_imgs: take the random cuts from a tile-padded image.
uniform = True #@param {type:"boolean"}
# With an uploaded image: cut it and the generated image at the same random positions on every step.
sync_cut = True #@param {type:"boolean"}
#@markdown > Training
# Number of optimizer steps; a frame is saved every `save_freq` steps.
steps = 500 #@param {type:"integer"}
save_freq = 1 #@param {type:"integer"}
learning_rate = .0001 #@param {type:"number"}
# Number of random cuts per image per step.
samples = 60 #@param {type:"integer"}
#@markdown > Network
# Number of hidden SineLayers in the generator.
siren_layers = 12 #@param {type:"integer"}
use_fourier_feat_map = True #@param {type:"boolean"}
# Number of random frequencies, and their scale, for the Fourier feature mapping.
fourier_maps = 128 #@param {type:"integer"}
fourier_scale = 4 #@param {type:"number"}
#@markdown > Misc
audio_notification = True #@param {type:"boolean"}

# NOTE(review): out_name is not used anywhere in the visible code.
out_name = text.replace(' ', '_')

# Coordinate grid with one row per output pixel.
mgrid = get_mgrid(sideY, sideX) # [sideX*sideY, 2]
if use_fourier_feat_map:
  # Replaces the 2 raw coordinates with 2*fourier_maps sin/cos features.
  mgrid = fourierfm(mgrid, fourier_maps, fourier_scale)
mgrid = torch.from_numpy(mgrid.astype(np.float32)).cuda()

# Generator: input width follows the grid features, 256 hidden units, 3 output channels.
model = Siren(mgrid.shape[-1], 256, siren_layers, 3).cuda()

# Per-channel normalization applied to every cut before CLIP encoding.
# NOTE(review): presumably the mean/std used by CLIP's own `preprocess` - confirm.
norm_in = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# CLIP encoding of the uploaded image; stays None when no image is used or
# when sync_cut is on (train() then re-encodes fresh cuts on every step).
img_enc = None
if upload_image:
  # Only the first uploaded file is used.
  input = list(uploaded.values())[0]
  print(list(uploaded)[0])
  # Decode to float in [0, 1] and reorder HWC -> 1xCxHxW on the GPU.
  # NOTE(review): assumes a 3-channel image - confirm for grayscale/RGBA uploads.
  img_in = torch.from_numpy(imageio.imread(input).astype(np.float32)/255.).unsqueeze(0).permute(0,3,1,2).cuda()
  if sync_cut is True:
    # train() slices both the uploaded and the generated image each step,
    # so the per-image cut count is halved.
    samples = samples // 2
  else:
    # Encode one fixed set of cuts up front, then release the image from GPU memory.
    in_sliced = slice_imgs([img_in], samples, transform=norm_in, uniform=uniform)[0]
    img_enc = perceptor.encode_image(in_sliced).detach().clone()
    del img_in, in_sliced; torch.cuda.empty_cache()

# Encode the main prompt (skipped when it is 2 characters or shorter).
if len(text) > 2:
  print(text)
  if translate:
    translator = Translator()
    text = translator.translate(text, dest='en').text
    print(' translated to:', text)
  tx = clip.tokenize(text)
  txt_enc = perceptor.encode_text(tx.cuda()).detach().clone()

# Encode the phrase to minimize independently of `text`: train() applies
# the penalty whenever text_to_minimize is longer than 2 characters, so the
# encoding must also exist for image-only runs with an empty `text`.
if len(text_to_minimize) > 2:
  min_txt_tokenized = clip.tokenize(text_to_minimize)
  min_txt_clip_encoding = perceptor.encode_text(min_txt_tokenized.cuda()).detach().clone()

optimizer = torch.optim.Adam(model.parameters(), learning_rate)

def displ(img, fname=None):
  """Turn a channel-first float image into 8-bit pixels and save it.

  The image is clipped to [-1, 1], passed through skimage's
  `equalize_adapthist`, scaled to 0..255 and, when `fname` is given,
  written both to `fname` and to 'result.jpg'. Nothing is returned.
  """
  chw = np.array(img)[:,:,:]
  hwc = chw.transpose(1, 2, 0)
  equalized = exposure.equalize_adapthist(np.clip(hwc, -1., 1.))
  pixels = np.clip(equalized*255, 0, 255).astype(np.uint8)
  if fname is None:
    return
  for path in (fname, 'result.jpg'):
    imageio.imsave(path, np.array(pixels))

def checkin(num):
  """Render the current model output, save it as frame `num` and show it in `outpic`."""
  frame_path = os.path.join(tempdir, '%03d.jpg' % num)
  # no gradients needed for a preview render
  with torch.no_grad():
    frame = model(mgrid).cpu().numpy()[0]
  displ(frame, frame_path)
  # swap the previously shown picture for the new one
  outpic.clear_output()
  with outpic:
    display(Image('result.jpg'))

def train(i, img_enc):
  """Run one optimization step and save a preview every `save_freq` steps.

  Args:
    i: zero-based step index; also sets the weight of the minimized phrase.
    img_enc: precomputed CLIP encoding of the uploaded image; replaced by
      a fresh encoding of synced cuts when `sync_cut` is on.
  """
  img_out = model(mgrid)
  if upload_image and sync_cut is True:
    # cut both images with the same random draws, then encode the reference cuts
    reference_cuts, generated_cuts = slice_imgs([img_in, img_out], samples, norm_in, uniform)
    img_enc = perceptor.encode_image(reference_cuts)
  else:
    generated_cuts = slice_imgs([img_out], samples, norm_in, uniform)[-1]
  gen_img_clip_enc = perceptor.encode_image(generated_cuts)

  def similarity(target):
    # mean cosine similarity between `target` and the generated cuts
    return torch.cosine_similarity(target, gen_img_clip_enc, dim=-1).mean()

  loss = 0
  if upload_image:
    loss += -100*similarity(img_enc)
  if len(text) > 2:
    loss += -100*similarity(txt_enc)
  if len(text_to_minimize) > 2:
    # penalty weight grows linearly with the step index, from 0 towards 100
    step_fraction = i / steps
    loss += step_fraction*100*similarity(min_txt_clip_encoding)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  if i % save_freq == 0:
    checkin(i // save_freq)

# Output widget that checkin() redraws with the latest frame. The bare
# expression below displays it because ast_node_interactivity is set to "all".
outpic = ipy.Output()
outpic

# Main optimization loop: one train() call and one progress tick per step.
pbar = ProgressBar(steps)
for i in range(steps):
  train(i, img_enc)
  _ = pbar.upd()

# Assemble the saved frames into a video and embed it in the cell output.
HTML(makevid(tempdir))
# Play a sound in the browser once everything is done.
if audio_notification == True: output.eval_js('new Audio("https://freesound.org/data/previews/80/80921_1022651-lq.ogg").play()')


Drive already mounted at /content/GDrive; to attempt to forcibly remount, call drive.mount("/content/GDrive", force_remount=True).
 mapping input: (307200, 2) output (307200, 256)
the text black lives matter


Output()

HBox(children=(IntProgress(value=0, max=500), Label(value='')))