# As a first step we do some exploratory data analysis

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
print (os.getcwd())

C:\Users\alexa\Downloads


In [3]:
df = pd.read_csv('Artworks.csv', encoding = 'utf8')

In [4]:
df.shape

(138151, 29)

In [5]:
df.head()

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,,,,40.6401,,,29.8451,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,,,,34.3,,,31.8,,
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",(),(1944),(0),(Male),1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,,,,50.8,,,50.8,,
4,"Villa, project, outside Vienna, Austria, Exter...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/media/W1siZiIsIjEyNiJdLFsi...,,,,38.4,,,19.1,,


In [6]:
df.describe()

Unnamed: 0,ObjectID,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
count,138151.0,10.0,13839.0,1462.0,120355.0,742.0,290.0,119434.0,0.0,2140.0
mean,97170.256618,44.86802,16.353863,23.094845,37.456124,89.687579,1283.674965,37.973398,,6156.488
std,81950.72057,28.631604,54.49596,44.626483,49.604159,329.428165,12017.50424,67.277097,,137125.0
min,2.0,9.9,0.0,0.635,0.0,0.0,0.09,0.0,,0.0
25%,36671.5,23.5,0.0,7.7788,17.938786,17.1,5.7267,17.5,,120.0
50%,73896.0,36.0,0.317501,13.6525,27.8,26.7,20.1851,25.400051,,433.0
75%,141636.5,71.125,9.84251,24.98095,43.9,79.7,77.6785,44.2,,1620.0
max,419289.0,83.8,1808.483617,914.4,9140.0,8321.0566,185067.585957,9144.0,,6283065.0


In [7]:
relevant= df[['Nationality', 'Medium', 'Date']]

In [8]:
relevant['Medium'].nunique()

21250

# We see here that the 'Medium' column has 21250 distinct values (out of 138.151 total values). In order to be able to investigate any changes in the materials used, we therefore plan to count for each artwork the number of words used to describe the Medium. This gives us a rough indication of the degree of difficulty in conveying what an artwork is or what an artwork consists of, and can therefore be applied as a proxy for the complexity of specific artworks. 

In [9]:
pd.set_option('display.max_colwidth', 1000)

# In order to add the wordcounts of medium per each artwork we first convert a specific column of the csv file into a dictionary

In [10]:
dictionary = df['Medium'].to_dict()

# Next, we remove any punctuation from the dictionary value

In [11]:
def remove_punc(text):
    punctuation = '!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`'
    return ''.join(character for character in text
                   if character not in punctuation)

for key, value in dictionary.items():
    try:
        dictionary[key] = remove_punc(dictionary[key])
    except:
        pass

# Then, we import the nltk library, and count the tokenized values to each dictionary key. The results are stored in a seperate dictionary for now under the header 'wordcount'. Note: NaN values remain NaN values.

In [12]:
from nltk.tokenize import word_tokenize

In [13]:
dictionary2 = {'Wordcount' : []}
print(dictionary2)

{'Wordcount': []}


In [14]:
for key, value in dictionary.items():
    try:
        dictionary2['Wordcount'].append(len(word_tokenize(value)))
    except:
        dictionary2['Wordcount'].append('Nan')
        
print(dictionary2)

{'Wordcount': [7, 6, 10, 6, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6, 8, 14, 8, 6, 8, 4, 6, 4, 4, 4, 4, 4, 4, 4, 7, 6, 9, 6, 9, 9, 9, 9, 9, 4, 9, 9, 9, 9, 9, 9, 9, 7, 1, 8, 7, 5, 7, 5, 6, 3, 7, 5, 6, 7, 6, 'Nan', 9, 3, 3, 9, 3, 3, 6, 6, 6, 6, 9, 7, 3, 6, 6, 3, 3, 4, 3, 7, 7, 7, 6, 7, 7, 4, 4, 4, 4, 4, 4, 6, 9, 10, 12, 8, 3, 5, 7, 3, 7, 1, 5, 7, 3, 4, 5, 4, 7, 5, 7, 7, 7, 4, 7, 5, 7, 7, 7, 7, 7, 4, 4, 4, 7, 4, 6, 4, 1, 7, 7, 7, 5, 17, 7, 6, 5, 7, 7, 9, 7, 11, 4, 5, 5, 4, 5, 5, 5, 4, 4, 5, 7, 2, 2, 2, 2, 2, 6, 3, 3, 4, 5, 4, 5, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 12, 3, 10, 5, 4, 4, 4, 4, 4, 3, 4, 3, 4, 4, 6, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 3, 6, 3, 4, 4, 4, 4, 4, 4, 3, 6, 6, 6, 6, 5, 12, 16, 16, 1, 7, 5, 6, 12, 4, 6, 8, 3, 8, 3, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 5, 7, 4, 8, 4, 4, 12, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 4, 5, 6, 7, 7, 7, 5, 3, 4, 5, 15, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 8, 6, 5, 6, 1

# Next, we convert the newly created dictionary into a dataframe as well, and merge this second dataframe with the existing dataframe.

In [15]:
df3 = pd.DataFrame(dictionary2)

In [16]:
join_df = df.join(df3)

In [17]:
print(join_df)

                                                                                                                Title  \
0                                          Ferdinandsbrücke Project, Vienna, Austria (Elevation, preliminary version)   
1       City of Music, National Superior Conservatory of Music and Dance, Paris, France, View from interior courtyard   
2                                                       Villa near Vienna Project, Outside Vienna, Austria, Elevation   
3                    The Manhattan Transcripts Project, New York, New York, Introductory panel to Episode 1: The Park   
4                                                       Villa, project, outside Vienna, Austria, Exterior perspective   
...                                                                                                               ...   
138146                                                                                                       Untitled   
138147                          

# Additionally, we remove any columns from the dataframe that are not of particular interest to our project. 

In [18]:
del join_df['Circumference (cm)']

In [19]:
del join_df['Depth (cm)']

In [20]:
del join_df['Diameter (cm)']

In [21]:
del join_df['Height (cm)']

In [22]:
del join_df['Length (cm)']

In [23]:
del join_df['Weight (kg)']

In [24]:
del join_df['Width (cm)']

In [25]:
del join_df['Seat Height (cm)']

In [26]:
del join_df['Duration (sec.)']

In [27]:
del join_df['ConstituentID']

In [28]:
del df['AccessionNumber']
del df['DateAcquired']
del df['Cataloged']
del df['ObjectID']
del df['ThumbnailURL']

# We now have the first version of our dataset, altered specifally to our project. The first version is stored in a separate csv file. 

In [29]:
print(join_df)

                                                                                                                Title  \
0                                          Ferdinandsbrücke Project, Vienna, Austria (Elevation, preliminary version)   
1       City of Music, National Superior Conservatory of Music and Dance, Paris, France, View from interior courtyard   
2                                                       Villa near Vienna Project, Outside Vienna, Austria, Elevation   
3                    The Manhattan Transcripts Project, New York, New York, Introductory panel to Episode 1: The Park   
4                                                       Villa, project, outside Vienna, Austria, Exterior perspective   
...                                                                                                               ...   
138146                                                                                                       Untitled   
138147                          

In [30]:
join_df.to_csv("C:/Users/alexa/OneDrive/Documenten/Collecting Data & Tools and Methods/MoMaDatasetVersion1.csv")

NameError: name 'join_df' is not defined