# Initializations

In [None]:
import numpy as np
import pandas as pd

import sys, os, time
import glob

from matplotlib import pyplot as plt
%matplotlib inline

# these magics ensure that external modules that are modified are also automatically reloaded
%load_ext autoreload
%autoreload 2

# widgets and interaction
from ipywidgets import FloatProgress
from IPython.display import display, clear_output

import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})

from skimage.io import imread, imsave

import warnings
warnings.filterwarnings('ignore')

In [None]:
from collections import namedtuple
import random

# Run configuration: declared as (name, value) pairs, collected into a dict and
# then frozen into a namedtuple so every setting is read as `opt.<name>`.
_settings = dict([
    ("dataroot", "/home/adalbert/data/world-cities/"),
    ("workers", 2),
    ("batchSize", 64),
    ("imageSize", 64),
    ("nz", 100),
    ("ngf", 64),  # number of filters for the generator
    ("ndf", 64),  # number of filters for the discriminator
    ("niter", 25),
    ("lr", 0.0002),
    ("beta1", 0.5),
    ("cuda", True),
    ("ngpu", 1),
    ("netG", ""),
    ("netD", ""),
    ("outf", "/home/adalbert/nbserver/pytorch-workspace/dcgan/"),
    ("manualSeed", random.randint(1, 10000)),  # drawn once, when this cell runs
])
opt = namedtuple("opt", _settings.keys())(**_settings)

opt

# Set up data sources

In [None]:
import glob

# Index every PNG found three directory levels below dataroot. As parsed below,
# the path is read as .../<scale>-*/<res>-*/<class>/<file>.png, where the
# integer before the first "-" of each directory name is kept.
files = glob.glob(opt.dataroot + "*/*/*/*.png")

records = []
for f in files:
    parts = f.split("/")
    fname = parts[-1]
    cls = parts[-2]
    res = int(parts[-3].split("-")[0])
    scale = int(parts[-4].split("-")[0])
    # The file name must contain exactly seven "_"-separated fields (the
    # unpacking raises otherwise); only country and population are kept,
    # and population stays a string.
    _, country, city, _, pop, lat, lon = fname.split("_")
    records.append((f, cls, res, scale, country, pop))

files_df = pd.DataFrame(
    records,
    columns=["filename", "class", "res-px", "scale-km", "country", "population"])

In [None]:
len(files_df)

In [None]:
# Keep only the 224 px tiles at the 100 km scale.
is_selected = (files_df['res-px'] == 224) & (files_df['scale-km'] == 100)
sel_df = files_df[is_selected]
n_sel = len(sel_df)

# Random 95% / 5% split by row position, sampled without replacement.
idx = np.random.choice(range(n_sel), int(n_sel*0.95), replace=False)
held_out = set(range(n_sel)) - set(idx)
train_df = sel_df.iloc[idx]
test_df  = sel_df.iloc[list(held_out)]

# Persist both splits next to the data.
train_df.to_csv(opt.dataroot + "/train.csv")
test_df.to_csv(opt.dataroot + "/test.csv")

len(train_df), len(test_df)

# Train - spawn external process

In [None]:
# Assemble the shell command that launches the external training script.
script_path = "../models/dcgan-mod.py"
train_file = "/home/adalbert/data/world-cities/train.csv"
outfolder = opt.outf + "/100km/"

cmd = " ".join([
    "python {0}".format(script_path),
    "--cuda",
    "--dataset=csvfile",
    "--dataroot={0}".format(train_file),
    "--niter={0}".format(opt.niter),
    "--ngpu={0}".format(opt.ngpu),
    "--outf={0}".format(outfolder),
    "--imageSize={0}".format(opt.imageSize),
])
cmd

In [None]:
# os.system(cmd)

In [None]:
# Scratch cell: load the first training image as 8-bit grayscale ("L" mode),
# edit it as a numpy array and convert it back to a PIL image.
from PIL import Image
pimg = Image.open(train_df['filename'].iloc[0]).convert("L")
img = np.array(pimg)
# NOTE(review): img is not normalised here (presumably integer 0..255), and for
# integer values abs(img-0.5) is never below 0.01, so this mask looks like a
# no-op. plot_examples applies the same mask after scaling to [0, 1] -- confirm
# whether that scaling was intended here too.
img[abs(img-0.5)<0.01] = 1
pimg = Image.fromarray(np.uint8(img))

In [None]:
plt.imshow(img)

In [None]:
# NOTE(review): no variable named `d` is assigned anywhere in the visible part
# of this notebook, so this cell raises NameError when run top to bottom.
# Presumably a leftover from interactive exploration -- confirm or remove.
d.getband(0)

# Plot fake and real samples

In [None]:
# Pick one random training image, scale it by its own maximum and invert it.
sample_path = train_df.sample()['filename'].iloc[0]
img = imread(sample_path)
img = 1 - img / float(img.max())

In [None]:
# Shows the complement of img. If img comes from the preceding cell (where it
# was already inverted), this displays the plain normalised image.
plt.imshow(1-img)

In [None]:
# Collect the artefacts written to the 100km output folder: generated ("fake")
# and real sample grids, plus discriminator / generator checkpoints.
# glob returns paths in arbitrary order, so every list is sorted; this makes
# positional access such as files_real[0] reproducible between runs.
out_100km = opt.outf + "/100km/"

files_fake = sorted(glob.glob(out_100km + "fake*.png"))
files_real = sorted(glob.glob(out_100km + "real*.png"))

files_cptD = sorted(glob.glob(out_100km + "netD*.pth"))
files_cptG = sorted(glob.glob(out_100km + "netG*.pth"))



In [None]:
plt.imshow(imread(files_real[0]))

In [None]:
# Display every generated-sample image, titled with its file name.
for fake_path in files_fake:
    panel_title = os.path.basename(fake_path)
    img = imread(fake_path)
    plt.imshow(img)
    plt.title(panel_title)
    plt.show()

In [None]:
img = imread(opt.outf + "/100km/training_progress.jpg")

plt.imshow(img)

# Clustering with GAN discriminator features

In [None]:
def _checkpoint_epoch(path):
    """Return the integer between the last "_" and the first "." of the file name."""
    stem = os.path.basename(path).split(".")[0]
    return int(stem.split("_")[-1])

# Re-key both checkpoint lists by that number: {number: path}.
files_cptD = dict((_checkpoint_epoch(p), p) for p in files_cptD)

files_cptG = dict((_checkpoint_epoch(p), p) for p in files_cptG)

In [None]:
# Module-level sizes read by the _netD / _netG class bodies below.
ngpu = int(opt.ngpu)  # passed to the network constructors
nz = int(opt.nz)      # input channels of the generator's first layer
ngf = int(opt.ngf)    # base number of generator filters
ndf = int(opt.ndf)    # base number of discriminator filters
nc = 1                # image channels: discriminator input / generator output

In [None]:
import torch.nn as nn
import torch.nn.parallel
import torchvision.transforms as transforms

def weights_init(m):
    """Re-initialise one module in place; meant to be used with `net.apply`.

    Modules whose class name contains 'Conv' get weights drawn from
    N(0, 0.02). Modules whose class name contains 'BatchNorm' get weights
    drawn from N(1, 0.02) and a zeroed bias. Anything else is left untouched.
    """
    kind = m.__class__.__name__
    if 'Conv' in kind:
        m.weight.data.normal_(0.0, 0.02)
        return
    if 'BatchNorm' in kind:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

class _netD(nn.Module):
    """DCGAN discriminator: strided convolutions ending in a sigmoid score.

    The layer sizes come from the module-level `nc` (input channels) and
    `ndf` (base filter count), which must be defined before instantiation.

    Args:
        ngpu: number of GPUs to spread a CUDA forward pass over.
    """

    def __init__(self, ngpu):
        super(_netD, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        """Score a batch of images; returns a tensor reshaped to (-1, 1)."""
        # Only go through data_parallel for a CUDA input on several GPUs.
        # Passing device_ids=None makes recent PyTorch releases pick devices
        # themselves, which fails on CPU-only machines and ignores ngpu.
        if self.ngpu > 1 and input.data.is_cuda:
            output = nn.parallel.data_parallel(
                self.main, input, list(range(self.ngpu)))
        else:
            output = self.main(input)
        return output.view(-1, 1)
    
# Build the discriminator and re-initialise its layers; Module.apply calls
# weights_init on every submodule.
netD = _netD(ngpu)
netD.apply(weights_init)

class _netG(nn.Module):
    """DCGAN generator: transposed convolutions ending in a tanh image.

    The layer sizes come from the module-level `nz` (input channels),
    `ngf` (base filter count) and `nc` (output channels), which must be
    defined before instantiation.

    Args:
        ngpu: number of GPUs to spread a CUDA forward pass over.
    """

    def __init__(self, ngpu):
        super(_netG, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(     nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2,     ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(    ngf,      nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        """Map a batch of latent inputs through `self.main` and return the result."""
        # Only go through data_parallel for a CUDA input on several GPUs.
        # Passing device_ids=None makes recent PyTorch releases pick devices
        # themselves, which fails on CPU-only machines and ignores ngpu.
        if self.ngpu > 1 and input.data.is_cuda:
            return nn.parallel.data_parallel(
                self.main, input, list(range(self.ngpu)))
        return self.main(input)

# Build the generator and re-initialise its layers; Module.apply calls
# weights_init on every submodule.
netG = _netG(ngpu)
netG.apply(weights_init)


In [None]:
# Restore the discriminator from the checkpoint with the highest key.
latest_key = max(files_cptD.keys())
netD.load_state_dict(torch.load(files_cptD[latest_key]))

# Feature extractor: the layers of netD's first child module, minus the last two.
first_child_layers = list(list(netD.children())[0].children())
feature_extractor = nn.Sequential(*first_child_layers[:-2])

In [None]:
sys.path.append("../pytorch_utils")
from loader_dataframe import ImageDataFrame, grayscale_loader

# torchvision renamed Scale to Resize; use whichever this installation provides.
_resize_transform = getattr(transforms, "Resize", None) or transforms.Scale

# Test-set images, resized and centre-cropped to imageSize, then normalised.
dataset = ImageDataFrame(df=test_df,
                         loader=grayscale_loader,
                         transform=transforms.Compose([
                               _resize_transform(opt.imageSize),
                               transforms.CenterCrop(opt.imageSize),
                               transforms.ToTensor(),
                               # One mean/std per channel: the networks are built
                               # with nc = 1, so the data is single-channel.
                               # Three-channel statistics are rejected by recent
                               # torchvision for a 1-channel tensor.
                               transforms.Normalize((0.5,), (0.5,)),
                           ]))

assert dataset

# shuffle=False keeps the batches in dataset order.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
                                     shuffle=False, num_workers=int(opt.workers))


In [None]:
from torch.autograd import Variable

# Preallocate one batch-sized float buffer; with opt.cuda set, both the buffer
# and the discriminator are moved to the GPU. The buffer is then wrapped in a
# Variable bound to the name `input`.
batch_buffer = torch.FloatTensor(opt.batchSize, nc, opt.imageSize, opt.imageSize)

if opt.cuda:
    netD.cuda()
    batch_buffer = batch_buffer.cuda()

input = Variable(batch_buffer)

In [None]:
# Run every batch through the feature extractor and collect one flattened
# feature row per image, together with the labels yielded by the loader.
# NOTE(review): netD is never switched to eval() in the visible code, so any
# BatchNorm layers presumably use per-batch statistics here -- confirm that
# this is intended for feature extraction.
labels = []
features = []
for real_cpu, lab_batch in dataloader:
    netD.zero_grad()
    batch_size = real_cpu.size(0)
    # reuse the preallocated buffer, resized to the current batch
    input.data.resize_(real_cpu.size()).copy_(real_cpu)
    activations = feature_extractor(input).data.cpu().numpy()

    features.append(activations.reshape((batch_size, -1)))
    labels.append(lab_batch.numpy())

features = np.vstack(features)
labels = np.hstack(labels)

In [None]:
from sklearn import decomposition

# Fit a 50-component PCA on the features; keep the first 40 projected columns.
pca = decomposition.PCA(n_components=50)
projected = pca.fit_transform(features)
feat_reduced = projected[:, :40]

# Cumulative explained-variance ratio over the fitted components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(4,4))
plt.plot(cumulative_ratio)
plt.title("PCA Components Variance Explained")
plt.xlabel("# components")
plt.ylabel("% variance")

In [None]:
# # https://github.com/DmitryUlyanov/Multicore-TSNE

# from MulticoreTSNE import MulticoreTSNE as TSNE

# tsne = TSNE(n_components=20, perplexity=30)
# feats_tsne = tsne.fit_transform(features.astype(np.float64))

In [None]:
from sklearn.cluster import KMeans
# Sweep the number of clusters and record the K-Means loss (negated score)
# for each value.
loss_vec = []
# Integer cluster counts, truncated exactly as int(k) did before, so k_vec
# holds the values actually fitted rather than their float precursors.
k_vec = np.linspace(5, 150, 40).astype(int)
for k in k_vec:
    # progress output that is valid on both Python 2 and Python 3
    sys.stdout.write("%d " % k)
    kmeans = KMeans(n_clusters=int(k), random_state=0).fit(feat_reduced)
    loss_vec.append(-kmeans.score(feat_reduced))

In [None]:
# Plot the recorded K-Means loss against the swept cluster counts; the loss is
# divided by 1e6 only to keep the axis readable.
plt.figure(figsize=(4,4))
plt.plot(k_vec, np.array(loss_vec)/1e6)
plt.title("K-Means loss vs # clusters")
plt.xlabel("# Clusters")
plt.ylabel("loss (/1e6)")

In [None]:
# Final clustering with a fixed 25 clusters and a fixed seed.
kmeans = KMeans(n_clusters=25, random_state=0).fit(feat_reduced)

# C: cluster id assigned to each row of feat_reduced.
C = kmeans.predict(feat_reduced)

In [None]:
# Share of samples per cluster. The values are fractions of len(C), not
# percentages, despite the "pct" axis label.
(pd.Series(C).value_counts() / float(len(C))).plot(kind="barh", figsize=(4,4))
plt.title("Cluster Membership Distribution")
plt.xlabel("pct membership")
plt.ylabel("cluster ID")

In [None]:
def _load_thumbnail(path, thumbSize):
    """Read one image, normalise and invert it, and resize it to thumbSize."""
    from skimage.transform import resize
    from skimage.io import imread

    img = imread(path)
    peak = float(img.max())
    # guard against all-black images, which would otherwise divide by zero
    img = img / peak if peak > 0 else img.astype(float)
    img[abs(img-0.5)<0.01] = 0 # hack to remove no-data patches
    img = 1-img
    return resize(img, thumbSize)


def plot_examples(image_paths, labels, classes=None, \
                  nExamples=10, thumbSize = (64,64), title="example"):
    """Plot a grid of randomly chosen example images for every label value.

    Args:
        image_paths: sequence of image file paths, indexable by position.
        labels: one label per entry of image_paths (any array-like).
        classes: tick labels for the label axis; defaults to the unique
            values found in `labels`.
        nExamples: maximum number of examples drawn per label.
        thumbSize: (height, width) each image is resized to.
        title: figure title.

    Images are written into a 2-D canvas, so they are expected to be
    single-channel. Labels with fewer than nExamples members leave the
    remaining cells blank.
    """
    # build example canvas
    labels = np.asarray(labels)  # so that `labels == c` is element-wise
    clustLabels = np.unique(labels)
    nClusters = clustLabels.size
    tile_h, tile_w = thumbSize
    canvas = np.zeros((tile_h*nClusters, nExamples*tile_w))
    for row, c in enumerate(clustLabels):
        members = np.where(labels==c)[0]
        chosen = np.random.choice(members, replace=False,
                                  size=min(nExamples, len(members)))
        for col, sample in enumerate(chosen):
            canvas[row*tile_h:(row+1)*tile_h, col*tile_w:(col+1)*tile_w] = \
                _load_thumbnail(image_paths[sample], thumbSize)

    # plot examples of each class
    fig,ax = plt.subplots(1, figsize=(12,10))
    plt.tight_layout()
    print(canvas.shape)  # parenthesised form prints the same on Python 2 and 3
    # the canvas is transposed for display: one column per label
    ax.imshow(canvas.swapaxes(0,1))#, aspect='auto')
    ax.set_title(title, fontsize=18)
    ax.set_ylabel("-- examples --", fontsize=16)
    ax.set_xlabel("-- land classes --", fontsize=16)
    # one tick per label, centred on its column; no tick labels on the y axis
    if classes is None: classes = clustLabels
    ax.set_xticks([thumbSize[0]*(0.5 + x) for x in range(nClusters)])
    ax.set_xticklabels(classes, fontsize=16, rotation=90)
    ax.set_yticklabels([])
    #plt.axis("off")
    plt.show()

In [None]:
plot_examples(test_df['filename'].values, C)

In [None]:
test_df.head()