In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [4]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [5]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [6]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.75, random_state=34)
len(X_train)

27109

In [10]:
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X_train, Y_train)

0.7889446309343761

In [11]:
mlp.score(X_train, Y_train)

0.7943302962115902

In [12]:
Y_train.value_counts()/len(Y_train)

Drawings & Prints        0.621196
Photography              0.225091
Architecture & Design    0.114077
Painting & Sculpture     0.035173
Media and Performance    0.004463
Name: Department, dtype: float64

In [13]:
cross_val_score(mlp, X_train, Y_train, cv=5)



array([0.77455048, 0.77768557, 0.57677764, 0.74269114, 0.785095  ])

In [15]:
mlp = MLPClassifier(hidden_layer_sizes=(500, 250, 100,))
mlp.fit(X_train, Y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 250, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [16]:
mlp.score(X_train, Y_train)

0.8177358072964698

In [17]:
cross_val_score(mlp, X_train, Y_train, cv=5)



array([0.77820796, 0.66463864, 0.7286979 , 0.76826568, 0.76028788])

The overfit issues from cross_val are likely due to the smaller size of the dataset for computational speed. Increasing the hidden levels definitely improved the score of the model. Let's keep the hidden levels the same and add some early stopping criteria to fight the overfit.

In [18]:
mlp = MLPClassifier(
    hidden_layer_sizes=(500, 250, 100,), 
    learning_rate_init=0.01,
    n_iter_no_change=5,
    tol=0.01
)
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 250, 100), learning_rate='constant',
       learning_rate_init=0.01, max_iter=200, momentum=0.9,
       n_iter_no_change=5, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.01,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [19]:
mlp.score(X_train, Y_train)

0.7184698808513778

In [20]:
cross_val_score(mlp, X_train, Y_train, cv=5)

array([0.72713864, 0.71662979, 0.7161564 , 0.6900369 , 0.74201882])

Overfit is nowhere near as bad as before, now that the model runs faster, lets try bumping up the hidden layers and drop the learning rate and tol some.

In [21]:
mlp = MLPClassifier(
    hidden_layer_sizes=(750, 500, 200, 100,), 
    learning_rate_init=0.001,
    n_iter_no_change=7,
    tol=0.001
)
mlp.fit(X_train, Y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(750, 500, 200, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=7, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [22]:
mlp.score(X_train, Y_train)

0.8511195543915305

In [23]:
cross_val_score(mlp, X_train, Y_train, cv=5)



array([0.78078909, 0.79941003, 0.77037993, 0.75332103, 0.76047241])

Wow, huge improvement in the r^2 (which means nothing in a classification model) and very little overfit issues in the CV scores. There are some discrepancies between the train score and the CV scores that might indicate that overfit is still an issue, but given the data size and the consistency in the CV scores I am willing to bet it is just a volume issue.