In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import requests
import io

In [2]:
url="http://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv"
s=requests.get(url).content
df=pd.read_csv(io.StringIO(s.decode('utf-8'))).iloc[:,1:]

In [3]:
artworks = df

In [4]:
artworks.columns

Index(['Artist', 'ConstituentID', 'ArtistBio', 'Nationality', 'BeginDate',
       'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions', 'CreditLine',
       'AccessionNumber', 'Classification', 'Department', 'DateAcquired',
       'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL', 'Circumference (cm)',
       'Depth (cm)', 'Diameter (cm)', 'Height (cm)', 'Length (cm)',
       'Weight (kg)', 'Width (cm)', 'Seat Height (cm)', 'Duration (sec.)'],
      dtype='object')

In [5]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [6]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [7]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [8]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

Great. Let's do some more miscellaneous cleaning.

In [55]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)
X_short = X.drop(['URL','ThumbnailURL'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
X_long = pd.concat([X, artists], axis=1)

Y = artworks.Department

In [10]:
print(len(X))
print(len(Y))

105335
105335


In [11]:
print(len(artworks.columns))

11


In [12]:
nn_test = {}

In [13]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(10,10,20))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [14]:
mlp.score(X, Y)

0.6831442540466132

In [15]:
from sklearn.model_selection import cross_val_score
nn_test['10,10,20'] = cross_val_score(mlp, X, Y, cv=5)

In [16]:
mlp2 = MLPClassifier(hidden_layer_sizes=(10,5,10))
mlp2.fit(X, Y)

print(mlp.score(X, Y))

nn_test['10,5,10'] = cross_val_score(mlp2, X, Y, cv=5)

0.6831442540466132


In [17]:
mlp3 = MLPClassifier(hidden_layer_sizes=(1000,10))
mlp3.fit(X, Y)

print(mlp3.score(X, Y))

nn_test['1000,10'] = cross_val_score(mlp3, X, Y, cv=5)

0.5231404566383443


In [25]:
mlp4 = MLPClassifier(alpha= 0.01,hidden_layer_sizes=(20,5,10), activation='logistic')
mlp4.fit(X, Y)

print(mlp4.score(X, Y))

nn_test['20,5,10'] = cross_val_score(mlp4, X, Y, cv=5)

0.6553757060806


In [32]:
mlp5 = MLPClassifier(hidden_layer_sizes=(100,20))
mlp5.fit(X, Y)

print(mlp5.score(X, Y))

nn_test['100,20'] = cross_val_score(mlp5, X, Y, cv=5)

0.6733659277543077


In [34]:
mlp6 = MLPClassifier(hidden_layer_sizes=(10,10,10,20), activation='logistic')
mlp6.fit(X, Y)

print(mlp6.score(X, Y))

nn_test['10,10,10,20'] = cross_val_score(mlp6, X, Y, cv=5)

0.5907438173446623


In [36]:
mlp7 = MLPClassifier(hidden_layer_sizes=(10,10,10,20))
mlp7.fit(X, Y)

print(mlp7.score(X, Y))

nn_test['10,10,10,20r'] = cross_val_score(mlp7, X, Y, cv=5)

0.6574168130251103


In [38]:
mlp8 = MLPClassifier(hidden_layer_sizes=(5,10,20,))
mlp8.fit(X, Y)

print(mlp8.score(X, Y))

nn_test['5,10,20'] = cross_val_score(mlp8, X, Y, cv=5)

0.6695780130061233


In [41]:
mlp9 = MLPClassifier(hidden_layer_sizes=(10,10,20,5))
mlp9.fit(X, Y)

print(mlp9.score(X, Y))

nn_test['10,10,20,5'] = cross_val_score(mlp9, X, Y, cv=5)

0.6666160345564153


In [51]:
mlp10 = MLPClassifier(hidden_layer_sizes=(10,10,20))
mlp10.fit(X_short, Y)

print(mlp10.score(X_short, Y))

nn_test['10,10,20_n'] = cross_val_score(mlp10, X_short, Y, cv=5)

0.6788531827028054


In [53]:
mlp11 = MLPClassifier(hidden_layer_sizes=(1000,))
mlp11.fit(X, Y)

print(mlp11.score(X, Y))

nn_test['1000,'] = cross_val_score(mlp11, X, Y, cv=5)

0.6944985047704941


In [57]:
mlp12 = MLPClassifier(hidden_layer_sizes=(10,10,20))
mlp12.fit(X_long, Y)

print(mlp12.score(X_long, Y))

nn_test['10,10,20_l'] = cross_val_score(mlp12, X_long, Y, cv=5)

0.9219632600749988


In [58]:
scores = pd.DataFrame(nn_test)
for col in scores.columns:
    print(col, scores[col].mean())

10,10,10,20 0.541499317545054
10,10,10,20r 0.5573733934150875
10,10,20 0.5819123991786077
10,10,20,5 0.5792827777832649
10,10,20_l 0.6833198299245504
10,10,20_n 0.5711852471704415
10,5,10 0.5206605713829884
100,20 0.5485144682722052
1000, 0.516906436079921
1000,10 0.5231404589957966
20,5,10 0.5453637558427537
5,10,20 0.5520272502186478


NameError: name 'done' is not defined