# Curriculum

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Import the model.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')
artworks.head(1)

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,


In [4]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [3]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [6]:
print(artworks.shape)
artworks.head()

(106596, 10)


Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [7]:

artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [4]:

artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [5]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]


artworks2 = artworks.sample(5000)
# Final column drops and NA drop.
X = artworks2.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

In [6]:
# Create dummies separately.
print('original',artworks2.shape)
#artists = pd.get_dummies(artworks.Artist)
#print('artists',artists.shape)
nationalities = pd.get_dummies(artworks2.Nationality)
print('nationalities',nationalities.shape)
dates = pd.get_dummies(artworks2.Date)
print('dates',dates.shape)

original (5000, 11)
nationalities (5000, 70)
dates (5000, 172)


In [7]:
# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks2.Department

In [12]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [13]:
mlp.score(X, Y)

0.6658

In [14]:
Y.value_counts()/len(Y)

Drawings & Prints        0.6312
Photography              0.2310
Architecture & Design    0.1046
Painting & Sculpture     0.0332
Name: Department, dtype: float64

In [15]:

cross_val_score(mlp, X, Y, cv=5)

array([0.48702595, 0.691     , 0.635     , 0.34434434, 0.24524525])

# DRILL
Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

**Try 2 Layers**

In [16]:
mlp = MLPClassifier(hidden_layer_sizes=(200,2))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [17]:
mlp.score(X, Y)

0.6312

In [18]:
cross_val_score(mlp, X, Y, cv=5)



array([0.63073852, 0.631     , 0.631     , 0.63163163, 0.63163163])

**A second layer appears to have eliminated overfitting without much accuracy loss! Let's see what a 3rd does...Trying 3 layers**

In [19]:
mlp = MLPClassifier(hidden_layer_sizes=(100,3))
mlp.fit(X, Y)

mlp.score(X, Y)

0.6312

In [20]:
cross_val_score(mlp, X, Y, cv=5)

array([0.63473054, 0.63      , 0.631     , 0.63163163, 0.63163163])

**No real change. Let's see if we can reduce the layers to 100 with just 2 layers for performance reasons**

In [21]:
mlp = MLPClassifier(hidden_layer_sizes=(100,2))
mlp.fit(X, Y)

mlp.score(X, Y)

0.6458

In [22]:
cross_val_score(mlp, X, Y, cv=5)



array([0.63073852, 0.63      , 0.631     , 0.63163163, 0.63163163])

**Nice! Let's resample a couple times to confirm results:**

In [27]:
artworks3 = artworks.sample(5000)
# Final column drops and NA drop.
X = artworks3.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)
# Create dummies separately.
nationalities = pd.get_dummies(artworks3.Nationality)
dates = pd.get_dummies(artworks3.Date)
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
Y = artworks3.Department

mlp = MLPClassifier(hidden_layer_sizes=(100,2))
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)



array([0.61876248, 0.61938062, 0.619     , 0.61961962, 0.62024048])

In [28]:
artworks4 = artworks.sample(5000)
# Final column drops and NA drop.
X = artworks4.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)
# Create dummies separately.
nationalities = pd.get_dummies(artworks4.Nationality)
dates = pd.get_dummies(artworks4.Date)
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
Y = artworks4.Department

mlp = MLPClassifier(hidden_layer_sizes=(100,2))
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.63036963, 0.63036963, 0.63036963, 0.63063063, 0.63126253])

**How small can we make the 2 layers without losing accuracy?**

In [29]:
mlp = MLPClassifier(hidden_layer_sizes=(50,2))
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.63036963, 0.63036963, 0.63036963, 0.11411411, 0.63126253])

**This sample is giving weird results, resampling...**

In [30]:
artworks2 = artworks.sample(5000)
X = artworks2.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)
nationalities = pd.get_dummies(artworks2.Nationality)
dates = pd.get_dummies(artworks2.Date)
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
Y = artworks2.Department

In [43]:
mlp = MLPClassifier(hidden_layer_sizes=(50,2), max_iter=350)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.61277445, 0.61438561, 0.614     , 0.61361361, 0.61422846])

**What about MUCH smaller, but more layers?**

In [41]:
mlp = MLPClassifier(hidden_layer_sizes=(8,8), max_iter=500)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.61277445, 0.68731269, 0.692     , 0.5025025 , 0.65330661])

In [42]:
mlp = MLPClassifier(hidden_layer_sizes=(25,8), max_iter=500)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.66966068, 0.66833167, 0.683     , 0.25025025, 0.57915832])

**Too unstable. Let's turn on verbose=True to see what happens**

In [10]:
mlp = MLPClassifier(hidden_layer_sizes=(8,8), max_iter=500, verbose=True)
print('created')
mlp.fit(X, Y)
print('fit')
cross_val_score(mlp, X, Y, cv=5)

created
Iteration 1, loss = 17.80359561
Iteration 2, loss = 17.80359505
Iteration 3, loss = 17.80359460
Iteration 4, loss = 17.54715778
Iteration 5, loss = 17.99871312
Iteration 6, loss = 17.40777264
Iteration 7, loss = 12.27555220
Iteration 8, loss = 1.66847377
Iteration 9, loss = 1.09826947
Iteration 10, loss = 1.03886173
Iteration 11, loss = 1.02341817
Iteration 12, loss = 1.00898422
Iteration 13, loss = 1.00068186
Iteration 14, loss = 0.99207635
Iteration 15, loss = 0.98119278
Iteration 16, loss = 0.97103940
Iteration 17, loss = 0.96150480
Iteration 18, loss = 0.95208843
Iteration 19, loss = 0.94716782
Iteration 20, loss = 0.93988702
Iteration 21, loss = 0.93142884
Iteration 22, loss = 0.92736001
Iteration 23, loss = 0.92200937
Iteration 24, loss = 0.91582224
Iteration 25, loss = 0.91010329
Iteration 26, loss = 0.90501819
Iteration 27, loss = 0.90047206
Iteration 28, loss = 0.89733367
Iteration 29, loss = 0.89239783
Iteration 30, loss = 0.89330933
Iteration 31, loss = 0.88159647
It

Iteration 103, loss = 0.81026926
Iteration 104, loss = 0.80870836
Iteration 105, loss = 0.80443662
Iteration 106, loss = 0.80183290
Iteration 107, loss = 0.80750475
Iteration 108, loss = 0.81248157
Iteration 109, loss = 0.80933116
Iteration 110, loss = 0.81249982
Iteration 111, loss = 0.79866980
Iteration 112, loss = 0.79844086
Iteration 113, loss = 0.80057580
Iteration 114, loss = 0.79669759
Iteration 115, loss = 0.79453468
Iteration 116, loss = 0.80185902
Iteration 117, loss = 0.79508272
Iteration 118, loss = 0.78893816
Iteration 119, loss = 0.78931419
Iteration 120, loss = 0.79834317
Iteration 121, loss = 0.78849736
Iteration 122, loss = 0.78947967
Iteration 123, loss = 0.79528117
Iteration 124, loss = 0.78811310
Iteration 125, loss = 0.79352199
Iteration 126, loss = 0.78078712
Iteration 127, loss = 0.78086842
Iteration 128, loss = 0.78967895
Iteration 129, loss = 0.78033995
Iteration 130, loss = 0.78461804
Iteration 131, loss = 0.78276403
Iteration 132, loss = 0.78043329
Iteration 

Iteration 155, loss = 0.71145533
Iteration 156, loss = 0.71192260
Iteration 157, loss = 0.71433159
Iteration 158, loss = 0.71418930
Iteration 159, loss = 0.71499508
Iteration 160, loss = 0.71314517
Iteration 161, loss = 0.71447063
Iteration 162, loss = 0.71375328
Iteration 163, loss = 0.70520234
Iteration 164, loss = 0.70579422
Iteration 165, loss = 0.70892910
Iteration 166, loss = 0.70961909
Iteration 167, loss = 0.70466425
Iteration 168, loss = 0.71005785
Iteration 169, loss = 0.70795287
Iteration 170, loss = 0.73801175
Iteration 171, loss = 0.71695357
Iteration 172, loss = 0.72261291
Iteration 173, loss = 0.72101799
Iteration 174, loss = 0.71077092
Iteration 175, loss = 0.70647933
Iteration 176, loss = 0.70181342
Iteration 177, loss = 0.72263026
Iteration 178, loss = 0.72363086
Iteration 179, loss = 0.72731968
Iteration 180, loss = 0.70355632
Iteration 181, loss = 0.71409505
Iteration 182, loss = 0.72808870
Iteration 183, loss = 0.71016430
Iteration 184, loss = 0.72074484
Iteration 

Iteration 28, loss = 0.90274028
Iteration 29, loss = 0.89865436
Iteration 30, loss = 0.91678169
Iteration 31, loss = 0.89737250
Iteration 32, loss = 0.89848331
Iteration 33, loss = 0.88972957
Iteration 34, loss = 0.88904898
Iteration 35, loss = 0.91738952
Iteration 36, loss = 0.91214452
Iteration 37, loss = 0.88736542
Iteration 38, loss = 0.88752816
Iteration 39, loss = 0.87793416
Iteration 40, loss = 0.88038542
Iteration 41, loss = 0.87831487
Iteration 42, loss = 0.87148343
Iteration 43, loss = 0.87952958
Iteration 44, loss = 0.89480215
Iteration 45, loss = 0.88222501
Iteration 46, loss = 0.87370242
Iteration 47, loss = 0.88502627
Iteration 48, loss = 0.87028723
Iteration 49, loss = 0.92934234
Iteration 50, loss = 0.86920658
Iteration 51, loss = 0.86819782
Iteration 52, loss = 0.88101116
Iteration 53, loss = 0.86053869
Iteration 54, loss = 0.89353563
Iteration 55, loss = 0.86895730
Iteration 56, loss = 0.86786042
Iteration 57, loss = 0.85264094
Iteration 58, loss = 0.85590973
Iteratio

array([0.65469062, 0.6953047 , 0.687     , 0.62662663, 0.63426854])

In [11]:
mlp

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(8, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

**Does L2 penalizing improve anything?**

In [12]:
mlp = MLPClassifier(hidden_layer_sizes=(50,2), max_iter=350,
                   alpha=.01)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.62874251, 0.62937063, 0.63      , 0.62962963, 0.63026052])

In [13]:
mlp = MLPClassifier(hidden_layer_sizes=(50,2), max_iter=350,
                   alpha=.1)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.62874251, 0.62937063, 0.63      , 0.62962963, 0.63026052])

In [15]:
mlp = MLPClassifier(hidden_layer_sizes=(50,2), max_iter=350,
                   alpha=.5)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.62874251, 0.62937063, 0.63      , 0.62962963, 0.63026052])

**Apparently not.**

**Try treating Date as a continuous variable not vategorical, which may capture differences between artistic eras better.**

In [51]:
artworks5 = artworks.sample(5100)
artworks5 = artworks5.dropna(axis=0)
# Final column drops and NA drop.
X = artworks5.drop(['Department', 'DateAcquired', 'Artist', 'Nationality'], 1)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5069 entries, 121305 to 92230
Data columns (total 7 columns):
Gender          5069 non-null object
Date            5069 non-null object
URL             5069 non-null bool
ThumbnailURL    5069 non-null bool
Height (cm)     5069 non-null float64
Width (cm)      5069 non-null float64
YearAcquired    5069 non-null int64
dtypes: bool(2), float64(2), int64(1), object(2)
memory usage: 207.9+ KB


In [52]:
# Create dummies separately.
nationalities = pd.get_dummies(artworks5.Nationality)
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities], axis=1)
Y = artworks5.Department
X.shape, Y.shape

((5069, 255), (5069,))

In [54]:
mlp = MLPClassifier(hidden_layer_sizes=(50,2), max_iter=500,
                   alpha=.5)
mlp.fit(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([0.62992126, 0.63017751, 0.63017751, 0.63079961, 0.63142292])