In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#MoMA collection export, read straight from the MuseumofModernArt/collection repository
MOMA_ARTWORKS_CSV = 'https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv'

artworks = pd.read_csv(MOMA_ARTWORKS_CSV)
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [3]:
#preprocessing/cleaning

#select columns of interest; .copy() yields an independent frame so the
#column assignments below write to it directly instead of to a slice of the
#raw frame (which raises SettingWithCopyWarning)
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                     'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)',
                     'Width (cm)']].copy()

#convert urls to booleans indicating if they are present
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

#drop films and other tricky rows in a single pass
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

#drop missing data (any row with a null in one of the selected columns)
artworks = artworks.dropna()

artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


## Building a model
Use multi-layer perceptron modeling (MLP) to classify the department a piece should go into using everything except the department name

In [4]:
#check data types before feeding to sklearn MLP
#(the object-typed columns are either dropped or one-hot encoded further down
#before anything is passed to the model)
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [5]:
#parse the acquisition date strings into real datetime values
artworks['DateAcquired'] = pd.to_datetime(artworks['DateAcquired'])

#derive the acquisition year as its own numeric feature, then confirm its dtype
artworks['YearAcquired'] = artworks['DateAcquired'].dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [6]:
#more misc cleaning
#collapse rows that list several genders / nationalities / artists into one
#catch-all label each. The patterns are raw strings so the regex escapes are
#valid; the replacement labels are plain text and must not carry regex
#backslashes (they would end up literally in the data and dummy column names).
multiple_genders = artworks['Gender'].str.contains(r'\) \(')
multiple_nationalities = artworks['Nationality'].str.contains(r'\) \(')
multiple_artists = artworks['Artist'].str.contains(',')
artworks.loc[multiple_genders, 'Gender'] = '(multiple_persons)'
artworks.loc[multiple_nationalities, 'Nationality'] = '(multiple_nationalities)'
artworks.loc[multiple_artists, 'Artist'] = 'Multiple_Artists'

#convert dates to start date, cuts down number of distinct examples
#(keeps the first 4-digit run; rows without one become NaN and therefore get
#all-zero date dummies below). Every row is kept: slicing the result with
#[:-1] would silently blank the last row's date on index alignment.
artworks['Date'] = artworks['Date'].str.extract('([0-9]{4})', expand=False)

#final column/NA drops (keyword form: positional axis was removed in pandas 2.0)
X = artworks.drop(columns=['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'])

#create dummies separately
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

#concat with other variables (artists left out for now since it slows everything down)
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

#set target
y = artworks.Department

In [7]:
#modeling time, NNs are very computationally intensive and may take a few minutes
from sklearn.neural_network import MLPClassifier

#one hidden layer of 1000 units; fit() returns the estimator itself, so the
#construction and training can be chained and the fitted model displayed
mlp = MLPClassifier(hidden_layer_sizes=(1000,)).fit(X, y)
mlp

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [9]:
#accuracy on the same rows the model was fit on (training accuracy, not a held-out estimate)
mlp.score(X,y)

0.6808563155646271

In [8]:
#share of artworks per department, i.e. the class balance of the target
y.value_counts().div(len(y))

Prints & Illustrated Books    0.523140
Photography                   0.228737
Architecture & Design         0.111131
Drawings                      0.103081
Painting & Sculpture          0.033911
Name: Department, dtype: float64

In [9]:
from sklearn.model_selection import cross_val_score

#5-fold cross-validated accuracy of the same estimator on the full feature matrix
cross_val_score(estimator=mlp, X=X, y=y, cv=5)

array([0.55992216, 0.26243592, 0.37836427, 0.53256432, 0.43769286])

**Results:**
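- Model is overfitting: training accuracy is ~0.68, but 5-fold cross-validated accuracy ranges from 0.26 to 0.56 (mean ≈ 0.43), which is below the ~0.52 obtained by always predicting the largest department (Prints & Illustrated Books)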
    - NNs that aren't given enough data for the number of features tend to overfit
    - (In general) NNs like *a lot* of data
- NNs can take a long time to run
    - Increase layer size -> increase run time
- Adding artists booleans to this model would exacerbate both the overfitting & runtime issues

## Model parameters

- hidden_layer_sizes: how many and how large to make layers
    - (1000,): 1000 neurons wide, one layer (this model)
    - (100, 4,): two layers, one 100 neurons wide, the other 4 neurons wide
    - Choosing layer size:
        - Determined by computational resources and cross validation
        - Generally less than the number of input variables
- alpha: strength of the L2 regularization penalty on large weights (the same idea as the penalty term in regularized regression models such as ridge); larger values shrink the weights more
- activation: function applied to each hidden neuron's weighted input to produce that neuron's output
    - relu: default, rectified linear unit function, f(x) = max(0, x); the output is 0 for negative inputs and continuous (unbounded) for positive inputs, not binary
    - logistic: sklearn terminology for logistic sigmoid function, squashes the output into a continuous value between 0 and 1


## Drill: playing with layers
Experiment with different hidden layer structures, using a subset of the data to improve runtime. See how things vary between layer structures. Can also play with other parameters & feature selection

In [30]:
import time
import warnings
from sklearn.model_selection import train_test_split

#silence every warning for the rest of the session
warnings.filterwarnings('ignore')

#train on 25% of data to improve train times; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.25,
    random_state=111,
)
print(len(X), len(X_train))

105335 26333


In [31]:
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed wall-clock time
start_time = time.perf_counter()

mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X_train, y_train)

print('hidden_layer_sizes=(1000,)')
#cross_val_score fits fresh clones of mlp on each fold, so the runtime printed
#below covers the fit above plus the five fold fits
print('cv=5 mean: {}'.format(cross_val_score(mlp, X_train, y_train, cv=5).mean()))
print('runtime: {}'.format(time.perf_counter() - start_time))

hidden_layer_sizes=(1000,)
cv=5 mean: 0.5761563551441675
runtime: 168.701959


In [32]:
#change layer structure: two hidden layers of 500 units each
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed wall-clock time
start_time = time.perf_counter()

mlp = MLPClassifier(hidden_layer_sizes=(500, 500,)).fit(X_train, y_train)

print('hidden_layer_sizes=(500,500,)')
#cross_val_score fits fresh clones of mlp on each fold, so the runtime printed
#below covers the fit above plus the five fold fits
print('cv=5 mean: {}'.format(cross_val_score(mlp, X_train, y_train, cv=5).mean()))
print('runtime: {}'.format(time.perf_counter() - start_time))

hidden_layer_sizes=(500,500,)
cv=5 mean: 0.6094210611753574
runtime: 383.9069440000003


In [33]:
#three hidden layers of increasing width
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed wall-clock time
start_time = time.perf_counter()

mlp = MLPClassifier(hidden_layer_sizes=(250, 500, 750,)).fit(X_train, y_train)

print('hidden_layer_sizes=(250, 500, 750,)')
#cross_val_score fits fresh clones of mlp on each fold, so the runtime printed
#below covers the fit above plus the five fold fits
print('cv=5 mean: {}'.format(cross_val_score(mlp, X_train, y_train, cv=5).mean()))
print('runtime: {}'.format(time.perf_counter() - start_time))

hidden_layer_sizes=(250, 500, 750,)
cv=5 mean: 0.6245367338336771
runtime: 732.4709219999995


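- Adding layers increases runtime but also model performance (cv mean 0.576 → 0.609 → 0.625, runtime ~169s → ~384s → ~732s); these are single unseeded runs on a 25% sample, so the differences are indicative rather than conclusive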

In [35]:
#change activation function
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed wall-clock time
start_time = time.perf_counter()

mlp = MLPClassifier(hidden_layer_sizes=(1000,), activation='logistic')
mlp.fit(X_train, y_train)

print('hidden_layer_sizes=(1000,), activation=logistic')
#cross_val_score fits fresh clones of mlp on each fold, so the runtime printed
#below covers the fit above plus the five fold fits
print('cv=5 mean: {}'.format(cross_val_score(mlp, X_train, y_train, cv=5).mean()))
print('runtime: {}'.format(time.perf_counter() - start_time))

hidden_layer_sizes=(1000,), activation=logistic


KeyboardInterrupt: 

- I don't have time for this to complete, very slow