### Imports

In [0]:
# don't forget to press "mount drive" on the left!

In [0]:
!pip install ImageScraper # this is an extension that scrapes images from web pages

Collecting ImageScraper
  Downloading https://files.pythonhosted.org/packages/43/4b/e1e2af3b0892cf55fe4db06ce16e6d1d41c6bb95ae208b52109316da948c/ImageScraper-2.0.7-py2.py3-none-any.whl
Collecting SimplePool
  Downloading https://files.pythonhosted.org/packages/5f/05/1caf229f0baccbbc01978b4c77269e602125815403a7fb1079e63b83be05/SimplePool-0.1.tar.gz
Collecting setproctitle>=1.1.8
  Downloading https://files.pythonhosted.org/packages/5a/0d/dc0d2234aacba6cf1a729964383e3452c52096dc695581248b548786f2b3/setproctitle-1.1.10.tar.gz
Building wheels for collected packages: SimplePool, setproctitle
  Building wheel for SimplePool (setup.py) ... [?25l[?25hdone
  Created wheel for SimplePool: filename=SimplePool-0.1-cp36-none-any.whl size=26426 sha256=24c3cd4e746e7a4d0c9454b8cb124d4abbd6eada0fc509ecb9812cf3588b48c5
  Stored in directory: /root/.cache/pip/wheels/ed/66/74/34d30b25b968ba59b2872e551dd962b883441f3772d835ff02
  Building wheel for setproctitle (setup.py) ... [?25l[?25hdone
  Created wh

In [0]:
import pandas as pd
import image_scraper 
import shutil
import os
import urllib.request

### Create and clean dataframe 

In [0]:
path = '/content/drive/My Drive/colab_notebooks/data/wikiart_info.tsv'

# Load the tab-separated WikiArt metadata and drop the columns this notebook
# never uses.
df = pd.read_csv(path, sep='\t')
df = df.drop(['ID', 'Artist Info URL'], axis=1)

# For every row derive
#   * the image file name (last segment of the image URL), and
#   * the relative path '/<category>/<file name>' used inside the dataset.
# The two columns are walked in lockstep so each URL is paired with the
# category of its own row; looking the row up by URL would give rows that share
# a URL the first row's category and costs a full scan per row.
name_list = []
paths_list = []
for url, category_field in zip(df['Image URL'], df['Category']):
  category = category_field.split(',')[0] # some contain multiple categories, we only want the first
  clean_category = category.lower().replace("-", "_").replace(" ", "_")

  file_name = url.rsplit('/', 1)[-1] # takes filename from url
  name_list.append(file_name)

  image_path = '/' + clean_category + '/' + file_name
  paths_list.append(image_path)

df['file name'] = name_list
df['path'] = paths_list

In [0]:
df.head() # preview of the first five rows of the cleaned dataframe

Unnamed: 0,Category,Artist,Title,Year,Image URL,Painting Info URL,file name,path
0,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,https://use2-uploads3.wikiart.org/00123/images...,https://www.wikiart.org/en/charles-courtney-cu...,in-the-luxembourg-garden-1889.jpg,/impressionism/in-the-luxembourg-garden-1889.jpg
1,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,https://use2-uploads1.wikiart.org/images/keith...,https://www.wikiart.org/en/keith-haring/the-ma...,the-marriage-of-heaven-and-hell-1984.jpg,/neo_expressionism/the-marriage-of-heaven-and-...
2,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,https://use2-uploads3.wikiart.org/images/j-zse...,https://www.wikiart.org/en/jozsef-rippl-ronai/...,uncle-piacsek-in-front-of-the-black-sideboard-...,/post_impressionism/uncle-piacsek-in-front-of-...
3,Cubism,Vadym Meller,Monk. For the Play &#39;Mazeppa&#39;,1920,https://use2-uploads2.wikiart.org/00124/images...,https://www.wikiart.org/en/vadym-meller/monk-f...,monk-for-the-play-mazeppa-1920.jpg,/cubism/monk-for-the-play-mazeppa-1920.jpg
4,Romanticism,David Wilkie,The Defence of Sarago&#231;a,1828,https://use2-uploads6.wikiart.org/images/david...,https://www.wikiart.org/en/david-wilkie/the-de...,the-defence-of-sarago-a.jpg,/romanticism/the-defence-of-sarago-a.jpg


In [0]:
# Create one image folder per category. Only the first category of a
# comma-separated list is used, normalised to lowercase with underscores.
categories = df["Category"].unique() # gets all unique values of the Category column
path = '/content/drive/My Drive/AttnGAN/data/wikiart/images/'
for category in categories:
  first_cat = category.split(',')[0] # only take the first category (some have multiple)
  clean_category = first_cat.lower().replace("-", "_").replace(" ", "_")
  cat_path = path + clean_category + '/'
  # makedirs also creates a missing parent folder, and exist_ok makes the cell
  # safe to re-run (several raw categories can map to the same folder).
  os.makedirs(cat_path, exist_ok=True)

In [0]:
# move image from the download folder to the right datafolder
def move_image(image_name):
  """Move one downloaded image from the download folder into its category folder.

  The destination is looked up in the global ``df``: the first row whose
  'file name' equals ``image_name`` supplies the relative 'path'.

  Nothing is moved when the source file is missing or the destination
  already exists.

  Raises:
    IndexError: if ``image_name`` does not occur in ``df['file name']``.
  """
  old_path = '/content/drive/My Drive/colab_notebooks/data/paintings/' + image_name

  # Select by boolean mask and take the first match positionally, so the lookup
  # does not depend on df having a default 0..n-1 index.
  matches = df.loc[df['file name'] == image_name, 'path']
  if matches.empty:
    raise IndexError("no row in df with file name %r" % image_name)
  image_path = matches.iloc[0]
  new_path = '/content/drive/My Drive/AttnGAN/data/wikiart/images' + image_path

  if os.path.exists(old_path) and not os.path.exists(new_path):
    shutil.move(old_path, new_path)

# to move all the images 
def move_images(df):
  """Call ``move_image`` once for every entry of ``df['file name']``, in order."""
  file_names = df['file name']
  for image_name in file_names:
    move_image(image_name)

In [0]:
# Smoke test: move a single image before processing the whole dataframe.
move_image("a-pool-1935.jpg") # works!!!

In [0]:
# Move every image listed in df['file name'] into its category folder.
move_images(df)

### Scrape images

In [0]:
def scrape_image(url):
  """Scrape at most one image from the page at ``url`` with the image-scraper CLI.

  The image is saved to the 'paintings' folder (``-s paintings``), so the
  caller is expected to set the working directory beforehand. The exit status
  of the command is not checked.

  Args:
    url: page address, with or without a leading 'http://' / 'https://'.
  """
  import shlex  # local import: only needed to quote the URL for the shell

  # get rid of the scheme, imagescraper doesn't take that; only strip a prefix
  # that is actually present instead of blindly cutting 8 characters
  new_url = url
  for scheme in ('https://', 'http://'):
    if new_url.startswith(scheme):
      new_url = new_url[len(scheme):]
      break

  # this says: activate image-scraper, save the image to the paintings folder
  # and take at most 1 image from the url (this sometimes takes the wrong one
  # but idk what to do about that).
  # The URL is shell-quoted so characters such as '&', ';' or '?' cannot be
  # interpreted by the shell.
  command = "image-scraper -s paintings -m 1 " + shlex.quote(new_url)
  os.system(command)

# scrape every image from the list; this takes long! 1 min per 6-8 images for our dataset 
def scrape_all_images(image_url_list):
  """Run ``scrape_image`` on each URL in ``image_url_list`` and print a progress line per URL."""
  for page_url in image_url_list:
    scrape_image(page_url)
    print("image scraped!", page_url)

In [0]:
# scrape_image saves into 'paintings' (presumably relative to the working
# directory), so switch to the data folder first.
os.chdir('/content/drive/My Drive/colab_notebooks/data')
# NOTE(review): new_urls is only assigned in a later cell (new_urls = get_urls(df)),
# so this cell raises NameError when the notebook is run top to bottom on a fresh kernel.
scrape_all_images(new_urls)

In [0]:
# create new list of URLs from the images 
# that haven't been downloaded yet
def get_urls(df, data_path='/content/drive/My Drive/colab_notebooks/data/paintings'):
  """Return the painting-page URLs of all images not yet present in ``data_path``.

  Args:
    df: dataframe with the columns 'file name' and 'Painting Info URL'.
    data_path: folder that is checked for already downloaded files.

  Returns:
    A list with one 'Painting Info URL' per row whose 'file name' does not
    exist inside ``data_path``, in row order.
  """
  urls = []

  # Walk both columns together: every row contributes its own URL, even when
  # file names repeat, and no per-row dataframe search is needed.
  for name, image_url in zip(df['file name'], df['Painting Info URL']):
    file_path = data_path + '/' + name

    if os.path.exists(file_path):
      continue
    urls.append(image_url)

  return urls

In [0]:
# Function to rename multiple files
# --> lowering their filename 
def rename_files(data_path='/content/drive/My Drive/colab_notebooks/data/paintings'):
    """Rename every entry of ``data_path`` to its lowercase name.

    Names that are already lowercase are left untouched. If a *different*
    file with the lowercase name already exists, the entry is skipped (and
    reported) instead of overwriting that file.

    Args:
        data_path: folder whose entries are renamed.
    """
    for filename in os.listdir(data_path):
        new_filename = filename.lower()
        if new_filename == filename:
            continue

        original_path = data_path + '/' + filename
        new_path = data_path + '/' + new_filename

        # On a case-insensitive file system the lowercase path "exists" but is
        # the very same file; only a genuinely different file blocks the rename.
        if os.path.exists(new_path) and not os.path.samefile(original_path, new_path):
            print("skipped", filename, "- target already exists:", new_filename)
            continue
        os.rename(original_path, new_path)

In [0]:
# Lowercase the names of all downloaded files in the paintings folder.
rename_files()

In [0]:
# Compare the number of pages that still need scraping with the total number.
image_url_list = df['Painting Info URL'].tolist()
all_urls = image_url_list  # both names refer to the same list object
new_urls = get_urls(df)
print(len(new_urls), len(all_urls))

4119 4119


In [0]:
# Scrape the pages whose image is not in the paintings folder yet.
scrape_all_images(new_urls)

In [0]:
# Some images are not properly downloaded; create a new df that contains only
# the rows whose image file exists in the dataset folder.
# A boolean mask replaces the row-by-row DataFrame.append loop, which was
# quadratic and no longer exists in pandas >= 2.0.
images_root = '/content/drive/My Drive/AttnGAN/data/wikiart/images'
image_exists = df['path'].map(
    lambda image_path: os.path.exists(images_root + image_path)
).astype(bool)

# reset_index gives the same fresh 0..n-1 numbering that append(ignore_index=True) produced
image_df = df[image_exists].reset_index(drop=True)

image_df

Unnamed: 0,Category,Artist,Title,Year,Image URL,Painting Info URL,file name,path
0,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,https://use2-uploads3.wikiart.org/00123/images...,https://www.wikiart.org/en/charles-courtney-cu...,in-the-luxembourg-garden-1889.jpg,/impressionism/in-the-luxembourg-garden-1889.jpg
1,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,https://use2-uploads1.wikiart.org/images/keith...,https://www.wikiart.org/en/keith-haring/the-ma...,the-marriage-of-heaven-and-hell-1984.jpg,/neo_expressionism/the-marriage-of-heaven-and-...
2,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,https://use2-uploads3.wikiart.org/images/j-zse...,https://www.wikiart.org/en/jozsef-rippl-ronai/...,uncle-piacsek-in-front-of-the-black-sideboard-...,/post_impressionism/uncle-piacsek-in-front-of-...
3,Cubism,Vadym Meller,Monk. For the Play &#39;Mazeppa&#39;,1920,https://use2-uploads2.wikiart.org/00124/images...,https://www.wikiart.org/en/vadym-meller/monk-f...,monk-for-the-play-mazeppa-1920.jpg,/cubism/monk-for-the-play-mazeppa-1920.jpg
4,Romanticism,David Wilkie,The Defence of Sarago&#231;a,1828,https://use2-uploads6.wikiart.org/images/david...,https://www.wikiart.org/en/david-wilkie/the-de...,the-defence-of-sarago-a.jpg,/romanticism/the-defence-of-sarago-a.jpg
...,...,...,...,...,...,...,...,...
3889,Color Field Painting,Rupprecht Geiger,OE 260,1957,https://use2-uploads7.wikiart.org/images/ruppr...,https://www.wikiart.org/en/rupprecht-geiger/oe...,oe-260-1957.jpg,/color_field_painting/oe-260-1957.jpg
3890,Surrealism,Oscar Dominguez,M&#225;quina de coser electro-sexual,1934,https://use2-uploads4.wikiart.org/images/oscar...,https://www.wikiart.org/en/oscar-dominguez/m-q...,m-quina-de-coser-electro-sexual-1934.jpg,/surrealism/m-quina-de-coser-electro-sexual-19...
3891,Neo-Expressionism,Georg Baselitz,Female Nude on a Kitchen Chair,1979,https://use2-uploads2.wikiart.org/images/georg...,https://www.wikiart.org/en/georg-baselitz/fema...,female-nude-on-a-kitchen-chair-1979.jpg,/neo_expressionism/female-nude-on-a-kitchen-ch...
3892,Expressionism,Marie Laurencin,Apollinaire and His Friends,1909,https://use2-uploads0.wikiart.org/images/marie...,https://www.wikiart.org/en/marie-laurencin/apo...,apollinaire-and-his-friends-1909.jpg,/expressionism/apollinaire-and-his-friends-190...


In [0]:
%cd '/content/drive/My Drive/AttnGAN/data/wikiart'

/content/drive/My Drive/AttnGAN/data/wikiart


In [0]:
# Save the filtered metadata without the index column. The relative file name
# resolves against the current working directory (set by the %cd cell above).
image_df.to_csv('wikiartinfo.csv', index=False)

### Pickling filenames
Now we have to create `.pickle` files that contain the paths (relative, without file extension) of all images in
`/content/drive/My Drive/AttnGAN/data/wikiart/images`.


In [0]:
import pickle
import random 
import pandas as pd

In [0]:
# Reload the metadata that was saved with to_csv, so this section can start
# from the CSV file.
image_df = pd.read_csv('/content/drive/My Drive/AttnGAN/data/wikiart/wikiartinfo.csv')
image_df

Unnamed: 0,Category,Artist,Title,Year,Image URL,Painting Info URL,file name,path
0,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,https://use2-uploads3.wikiart.org/00123/images...,https://www.wikiart.org/en/charles-courtney-cu...,in-the-luxembourg-garden-1889.jpg,/impressionism/in-the-luxembourg-garden-1889.jpg
1,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,https://use2-uploads1.wikiart.org/images/keith...,https://www.wikiart.org/en/keith-haring/the-ma...,the-marriage-of-heaven-and-hell-1984.jpg,/neo_expressionism/the-marriage-of-heaven-and-...
2,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,https://use2-uploads3.wikiart.org/images/j-zse...,https://www.wikiart.org/en/jozsef-rippl-ronai/...,uncle-piacsek-in-front-of-the-black-sideboard-...,/post_impressionism/uncle-piacsek-in-front-of-...
3,Cubism,Vadym Meller,Monk. For the Play &#39;Mazeppa&#39;,1920,https://use2-uploads2.wikiart.org/00124/images...,https://www.wikiart.org/en/vadym-meller/monk-f...,monk-for-the-play-mazeppa-1920.jpg,/cubism/monk-for-the-play-mazeppa-1920.jpg
4,Romanticism,David Wilkie,The Defence of Sarago&#231;a,1828,https://use2-uploads6.wikiart.org/images/david...,https://www.wikiart.org/en/david-wilkie/the-de...,the-defence-of-sarago-a.jpg,/romanticism/the-defence-of-sarago-a.jpg
...,...,...,...,...,...,...,...,...
3889,Color Field Painting,Rupprecht Geiger,OE 260,1957,https://use2-uploads7.wikiart.org/images/ruppr...,https://www.wikiart.org/en/rupprecht-geiger/oe...,oe-260-1957.jpg,/color_field_painting/oe-260-1957.jpg
3890,Surrealism,Oscar Dominguez,M&#225;quina de coser electro-sexual,1934,https://use2-uploads4.wikiart.org/images/oscar...,https://www.wikiart.org/en/oscar-dominguez/m-q...,m-quina-de-coser-electro-sexual-1934.jpg,/surrealism/m-quina-de-coser-electro-sexual-19...
3891,Neo-Expressionism,Georg Baselitz,Female Nude on a Kitchen Chair,1979,https://use2-uploads2.wikiart.org/images/georg...,https://www.wikiart.org/en/georg-baselitz/fema...,female-nude-on-a-kitchen-chair-1979.jpg,/neo_expressionism/female-nude-on-a-kitchen-ch...
3892,Expressionism,Marie Laurencin,Apollinaire and His Friends,1909,https://use2-uploads0.wikiart.org/images/marie...,https://www.wikiart.org/en/marie-laurencin/apo...,apollinaire-and-his-friends-1909.jpg,/expressionism/apollinaire-and-his-friends-190...


In [0]:
# Collect the relative image paths and report how many there are.
paths = image_df['path'].tolist()
print(len(paths))

3894


In [0]:
# some image names are not saved as '.jpg', let's find out which
# print every path that does not end in a lowercase '.jpg'
for image_path in image_df['path'].tolist():
  if not image_path.endswith('.jpg'):
    print(image_path)

/neoclassicism/1814-koch-noahs-dankopfer-anagoria.JPG
/neoclassicism/vernet-claude-joseph-the-night-18th-c-1.JPG
/magic_realism/lethargic-dream-59x80-cmxcm-2002.JPG
/neoclassicism/1804-koch-der-tod-des-oskar-anagoria.JPG
/neoclassicism/1833-koch-serpentaralandschaft-anagoria.JPG
/cubism/epstein-rockdrill.png
/neoclassicism/first-consul-bonaparte-1802.png
/expressionism/otto-mueller-t-nzerin-mit-schleier-von-einem-mann-beobachtet-ca1903.jpeg
/northern_renaissance/quentin-metsys-madonna-col-bambino-1510-25-ca-01.JPG
/high_renaissance/flora-1520.JPG
/cubism/likbez-1920c-1.JPG
/expressionism/otto-mueller-bauerngeh-ft-in-frankreich-ca1916.jpeg
/impressionism/meditation-1889.jpeg


In [0]:
# Build the keys stored in the pickle files: every image path relative to the
# images folder, without the leading '/' and without the file extension.
# os.path.splitext handles '.jpg', '.JPG', '.png' and '.jpeg' alike. The old
# if/if/else chain kept the leading '/' on '.JPG' entries and dropped '.jpeg'
# images altogether.
paths = image_df['path'].tolist()
new_paths = [os.path.splitext(path.strip('/'))[0] for path in paths]

# Shuffle with a dedicated, seeded generator so the train/test split is
# reproducible and the global random state is left alone.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(new_paths)

# 80% train, 20% test, computed from the actual number of images instead of a
# hard-coded index (integer arithmetic avoids float rounding).
split_index = len(new_paths) * 4 // 5
train_paths = new_paths[:split_index]
test_paths = new_paths[split_index:]

with open('/content/drive/My Drive/AttnGAN/data/wikiart/train/filenames.pickle', 'wb') as f:
  pickle.dump(train_paths, f)

with open('/content/drive/My Drive/AttnGAN/data/wikiart/test/filenames.pickle', 'wb') as f:
  pickle.dump(test_paths, f)

### Prepping and creating `.txt` files for image captions

In [0]:
import re

In [0]:
# Pick one title that contains an HTML numeric entity to try the clean-up regex on.
s = image_df['Title'].iloc[4]

In [0]:
# Remove HTML numeric character references such as '&#231;'. The pattern is a
# raw string so that '\d' is not treated as an (invalid) string escape.
re.sub(r'&#(\d+);', '', s)

'The Defence of Saragoa'

In [0]:
# Turn every title into a caption: lowercase, HTML numeric entities removed,
# parentheses removed and hyphens replaced by spaces.
# The regex patterns are raw strings; '\d' and '\(' in normal string literals
# are invalid escape sequences that newer Python versions warn about.
titles = image_df['Title'].tolist()
for i, title in enumerate(titles):
  clean_title = re.sub(r'&#(\d+);', '', title.lower())
  cleaner_title = re.sub(r'[()]', '', clean_title)
  cleanest_title = cleaner_title.replace('-', ' ')
  titles[i] = cleanest_title

image_df['clean title'] = titles

In [0]:
# Create one caption folder per category. Only the first category of a
# comma-separated list is used, normalised to lowercase with underscores.
categories = image_df["Category"].unique() # gets all unique values of the Category column
path = '/content/drive/My Drive/AttnGAN/data/wikiart/text/'
for category in categories:
  first_cat = category.split(',')[0] # only take the first category (some have multiple)
  clean_category = first_cat.lower().replace("-", "_").replace(" ", "_")
  cat_path = path + clean_category + '/'
  # makedirs also creates a missing 'text' parent folder, and exist_ok makes
  # the cell safe to re-run.
  os.makedirs(cat_path, exist_ok=True)

In [0]:
# Write one caption file per image: text/<category>/<image name>.txt containing
# the cleaned title.
text_root = '/content/drive/My Drive/AttnGAN/data/wikiart/text'
for image_path, caption in zip(image_df['path'], image_df['clean title']):
  path_parts = image_path.split('/') # splits in '', folder, filename

  # splitext removes the extension whatever its length ('.jpg', '.jpeg', ...);
  # cutting a fixed 4 characters left a trailing '.' on '.jpeg' names.
  text_filename = os.path.splitext(path_parts[2])[0] + '.txt'
  text_path = os.path.join(text_root, path_parts[1], text_filename)

  # Writing via the full path leaves the working directory untouched, and the
  # with-block closes the file even if the write fails.
  with open(text_path, "w") as file:
    file.write(caption)

In [0]:
%cd '/content/drive/My Drive/AttnGAN/data/wikiart'

/content/drive/My Drive/AttnGAN/data/wikiart


In [0]:
example_captions = [
    "a ballet class",
    "the plague",
    "a woman in a fight",
    "an illness",
    "abstract 2020",
    "a group of people",
    "something sad with bright colors",
    "abstract piece",
    "a painting",
]

# one caption per line
with open('example_captions.txt', 'w') as captions_file:
    captions_file.writelines("%s\n" % caption for caption in example_captions)

# to specify the file in which example captions can be found
with open('example_filenames.txt', 'w') as filenames_file:
    filenames_file.write("example_captions")

### Renaming files 
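Some files need to be renamed because they were saved with an extension other than `.jpg` (e.g. `.JPG`, `.png`, `.jpeg`), and we want every file to use the `.jpg` extension. Note that this only changes the file name; the image data itself is not converted.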

In [0]:
import os

In [0]:
def rename_files_new(data_path='/content/drive/My Drive/AttnGAN/data/wikiart/images'):
    """Give every image inside the category folders of ``data_path`` a '.jpg' extension.

    Each file whose name does not end in '.jpg' is printed and renamed to
    '<stem>.jpg'. Only the name changes; the file contents are not converted.
    Entries of ``data_path`` that are not folders are ignored, and a file is
    skipped (and reported) when a different file already has the target name.

    Args:
        data_path: folder that contains one sub-folder per category.
    """
    for folder_name in os.listdir(data_path):
        folder_path = data_path + '/' + folder_name
        if not os.path.isdir(folder_path):
            continue

        for image_name in os.listdir(folder_path):
            if image_name.endswith('.jpg'):
                continue
            print(image_name)

            # splitext drops the extension whatever its length; slicing a fixed
            # five characters also removed the last character of the stem for
            # four-character extensions such as '.JPG' and '.png'.
            stem = os.path.splitext(image_name)[0]
            new_image_name = stem + ".jpg"
            original_path = folder_path + '/' + image_name
            new_path = folder_path + '/' + new_image_name

            # Never overwrite another image; samefile lets a pure case change
            # through on case-insensitive file systems.
            if os.path.exists(new_path) and not os.path.samefile(original_path, new_path):
                print("skipped", image_name, "- target already exists:", new_image_name)
                continue
            os.rename(original_path, new_path)

In [0]:
# NOTE(review): the recorded output lists only '.JPG' files although
# rename_files_new as written processes every name not ending in '.jpg' —
# presumably the function was edited between these runs; confirm.
rename_files_new() # for renaming JPG to jpg

likbez-1920c-1.JPG
1814-koch-noahs-dankopfer-anagoria.JPG
vernet-claude-joseph-the-night-18th-c-1.JPG
1833-koch-serpentaralandschaft-anagoria.JPG
1804-koch-der-tod-des-oskar-anagoria.JPG
quentin-metsys-madonna-col-bambino-1510-25-ca-01.JPG
flora-1520.JPG


In [0]:
# Second run; the printed names are the files that were renamed.
rename_files_new() # for renaming png to jpg

epstein-rockdrill.png


In [0]:
# Third run; the printed names are the files that were renamed.
rename_files_new() # for renaming jpeg to jpg

meditation-1889.jpeg
otto-mueller-t-nzerin-mit-schleier-von-einem-mann-beobachtet-ca1903.jpeg
otto-mueller-bauerngeh-ft-in-frankreich-ca1916.jpeg


In [0]:
data_path = '/content/drive/My Drive/AttnGAN/data/wikiart/images'

# Check: print any file in the category folders that still lacks a '.jpg'
# ending (no output means every file has it).
for folder_name in os.listdir(data_path):
  folder_path = data_path + '/' + folder_name
  leftovers = [entry for entry in os.listdir(folder_path) if not entry.endswith('.jpg')]
  for image_name in leftovers:
    print(image_name)

In [0]:
import pandas as pd
# Reload the saved metadata. NOTE(review): the CSV was written before the files
# were renamed, so its 'path' column still holds the original extensions.
image_df = pd.read_csv('/content/drive/My Drive/AttnGAN/data/wikiart/wikiartinfo.csv')



/neoclassicism/1814-koch-noahs-dankopfer-anagoria.JPG
/neoclassicism/vernet-claude-joseph-the-night-18th-c-1.JPG
/magic_realism/lethargic-dream-59x80-cmxcm-2002.JPG
/neoclassicism/1804-koch-der-tod-des-oskar-anagoria.JPG
/neoclassicism/1833-koch-serpentaralandschaft-anagoria.JPG
/cubism/epstein-rockdrill.png
/neoclassicism/first-consul-bonaparte-1802.png
/expressionism/otto-mueller-t-nzerin-mit-schleier-von-einem-mann-beobachtet-ca1903.jpeg
/northern_renaissance/quentin-metsys-madonna-col-bambino-1510-25-ca-01.JPG
/high_renaissance/flora-1520.JPG
/cubism/likbez-1920c-1.JPG
/expressionism/otto-mueller-bauerngeh-ft-in-frankreich-ca1916.jpeg
/impressionism/meditation-1889.jpeg
