In [1]:
from __future__ import division
from torchvision import models
from torchvision import transforms
from PIL import Image
import argparse
import torch
import torchvision
import torch.nn as nn
import numpy as np

In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
def load_image(image_path, transform=None, max_size=None, shape=None):
    """Load an image and convert it to a batched torch tensor on ``device``.

    Args:
        image_path: Anything accepted by ``PIL.Image.open``.
        transform: Optional callable that maps a PIL image to a tensor.
            When omitted, ``transforms.ToTensor()`` is used so that the
            function always returns a tensor.
        max_size: If given, the image is rescaled (aspect ratio preserved)
            so that its longer side equals ``max_size``.
        shape: If given, the image is resized to exactly this size. It is
            forwarded to ``PIL.Image.resize`` and therefore follows PIL's
            ``(width, height)`` order. Applied after ``max_size``.

    Returns:
        The transformed image with a leading batch dimension of 1, moved
        to the module-level ``device``.
    """
    # Force three channels: grayscale, palette or RGBA files would otherwise
    # not match the 3-channel normalization applied by the callers.
    image = Image.open(image_path).convert('RGB')

    if max_size:
        scale = max_size / max(image.size)
        # PIL wants a tuple of plain ints; keep every side at least 1 pixel.
        size = tuple(max(1, int(side * scale)) for side in image.size)
        # Image.ANTIALIAS was an alias of LANCZOS and no longer exists in
        # Pillow >= 10, so use LANCZOS for both resize paths.
        image = image.resize(size, Image.LANCZOS)

    if shape:
        image = image.resize(tuple(shape), Image.LANCZOS)

    # Without a transform the PIL image has no ``.to``; fall back to a plain
    # tensor conversion instead of failing with AttributeError.
    if transform is None:
        transform = transforms.ToTensor()
    image = transform(image).unsqueeze(0)

    return image.to(device)

In [4]:
class VGGNet(nn.Module):
    """Pretrained VGG-19 feature stack that returns selected intermediate outputs."""

    def __init__(self):
        """Load the VGG-19 ``features`` stack and pick the layers to tap.

        The selected layer names correspond to conv1_1 ~ conv5_1.
        """
        super(VGGNet, self).__init__()
        self.vgg = models.vgg19(pretrained=True).features
        self.select = ['0', '5', '10', '19', '28']

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the tapped outputs.

        Returns:
            A list with one tensor per layer whose name is in ``self.select``,
            in the order the layers are applied.
        """
        tapped = []
        out = x
        for key, module in self.vgg._modules.items():
            out = module(out)
            if key not in self.select:
                continue
            tapped.append(out)
        return tapped

In [5]:
def _gram_matrix(feature):
    """Return the (C, C) Gram matrix of a single (1, C, H, W) feature map."""
    _, c, h, w = feature.size()
    flat = feature.view(c, h * w)
    return torch.mm(flat, flat.t())


def main(content='dancing.jpg', 
         max_size=400, 
         style='picasso.jpg', 
         lr=0.003, 
         total_step=10, 
         style_weight=100, 
         log_step=1, 
         sample_step=1):
    """Run neural style transfer and periodically save the generated image.

    Args:
        content: Path of the content image.
        max_size: Longer side of the content image after rescaling.
        style: Path of the style image.
        lr: Learning rate of the Adam optimizer that updates the target image.
        total_step: Number of optimisation steps.
        style_weight: Multiplier applied to the style loss in the total loss.
        log_step: Print the losses every ``log_step`` steps.
        sample_step: Write ``output-<step>.png`` every ``sample_step`` steps.
    """
    # Image preprocessing
    # VGGNet was trained on ImageNet where images are normalized by
    # mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225].
    # We use the same normalization statistics here.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), 
                             std=(0.229, 0.224, 0.225))])
    # Inverse of the normalization above (mean' = -mean/std, std' = 1/std);
    # it does not change between steps, so build it once.
    denorm = transforms.Normalize((-2.12, -2.04, -1.80), (4.37, 4.46, 4.44))
    print("Transformer loaded...")

    # Load content and style images
    # Make the style image same size as the content image. The content tensor
    # is laid out as (1, C, H, W) while PIL's resize takes (width, height),
    # so the width (dim 3) has to come first.
    content = load_image(content, transform, max_size=max_size)
    style = load_image(style, transform, shape=[content.size(3), content.size(2)])
    print("Image loaded...")
    # Initialize a target image with the content image
    target = content.clone().requires_grad_(True)

    optimizer = torch.optim.Adam([target], lr=lr, betas=[0.5, 0.999])
    vgg = VGGNet().to(device).eval()
    # Only `target` is optimised, so skip gradient bookkeeping for the VGG weights.
    for param in vgg.parameters():
        param.requires_grad_(False)
    print("Net loaded...")

    # The content features and the style Gram matrices never change during
    # optimisation: compute them once, outside the autograd graph.
    with torch.no_grad():
        content_features = vgg(content)
        style_grams = [_gram_matrix(feature) for feature in vgg(style)]

    for step in range(total_step):
        print(step, "epochs started")
        # Extract multiple(5) conv feature maps of the current target image
        target_features = vgg(target)

        style_loss = 0
        content_loss = 0
        for f_target, f_content, gram_style in zip(target_features, content_features, style_grams):
            # Compute content loss with target and content images
            content_loss += torch.mean((f_target - f_content)**2)

            # Compute style loss between the Gram matrices of target and style
            _, c, h, w = f_target.size()
            gram_target = _gram_matrix(f_target)
            style_loss += torch.mean((gram_target - gram_style)**2) / (c * h * w)

        # Compute total loss, backprop and optimize
        loss = content_loss + style_weight * style_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (step+1) % log_step == 0:
            print ('Step [{}/{}], Content Loss: {:.4f}, Style Loss: {:.4f}' 
                   .format(step+1, total_step, content_loss.item(), style_loss.item()))

        if (step+1) % sample_step == 0:
            # Save the generated image. Work on a detached copy so the image
            # being optimised is never modified; squeeze only the batch
            # dimension so a spatial size of 1 is not dropped as well.
            img = target.detach().clone().squeeze(0)
            img = denorm(img).clamp_(0, 1)
            torchvision.utils.save_image(img, 'output-{}.png'.format(step+1))


In [6]:
# Run the style transfer with main()'s default arguments
# (dancing.jpg / picasso.jpg, 10 steps); writes output-<step>.png files.
main()

Transformer loaded...
Image loaded...
Net loaded...
0 epochs started
Step [1/10], Content Loss: 0.0000, Style Loss: 97.3535
1 epochs started
Step [2/10], Content Loss: 0.1008, Style Loss: 92.8298
2 epochs started
Step [3/10], Content Loss: 0.3505, Style Loss: 88.7080
3 epochs started
Step [4/10], Content Loss: 0.6995, Style Loss: 84.7332
4 epochs started
Step [5/10], Content Loss: 1.1203, Style Loss: 80.8329
5 epochs started
Step [6/10], Content Loss: 1.5881, Style Loss: 77.0198
6 epochs started


KeyboardInterrupt: 

In [7]:
# Build the feature extractor, move it to the selected device and
# switch it to evaluation mode.
vgg = VGGNet()
vgg = vgg.to(device)
vgg = vgg.eval()

In [16]:
# Same values as the default arguments of main(), exposed as globals
# so the pipeline can be stepped through cell by cell.

# Input images
content = 'dancing.jpg'
style = 'picasso.jpg'
max_size = 400

# Optimisation settings
lr = 0.003
total_step = 10
style_weight = 100

# Reporting cadence
log_step = 1
sample_step = 1

In [17]:
# Convert a PIL image to a tensor, then normalize it with the ImageNet
# channel statistics (mean first, std second).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

In [18]:
# NOTE: this cell rebinds `content` and `style` from file names to tensors,
# so the settings cell must be re-run before running this cell again.
content = load_image(content, transform, max_size=max_size)
# The content tensor is laid out as (1, C, H, W) while PIL's resize takes
# (width, height), so pass the width (dim 3) before the height (dim 2) to
# give the style image the same size as the content image.
style = load_image(style, transform, shape=[content.size(3), content.size(2)])
# Start the optimisation from a copy of the content image.
target = content.clone().requires_grad_(True)

In [20]:
# Forward the target image through VGGNet; the result is the list of
# feature maps taken at the layers named in `vgg.select`.
target_features = vgg(target)