In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, OPTICS, MeanShift, estimate_bandwidth, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from itertools import cycle
import collections
import re

# Read the five CSV exports (paths are relative to the working directory) and
# bind each one to a DataFrame of the same name, in the order listed.
artists, artworks, user_events, user_favorites, venues = (
    pd.read_csv(csv_path)
    for csv_path in (
        'artists.csv',
        'artworks.csv',
        'user_events.csv',
        'user_favorites.csv',
        'venues.csv',
    )
)

In [2]:
# Distinct identifiers, in order of first appearance in their source tables.
userIDs = user_events['userId'].unique().tolist()
artworkIDs = artworks['artworkId'].unique().tolist()
artistIDs = artworks['artistId'].unique().tolist()
artwork_types = artworks['category'].unique().tolist()

# Number of event rows that reference each artwork / each user.
artwork_count = collections.Counter(user_events['artworkId'])
user_count = collections.Counter(user_events['userId'])

# artworkId -> artistId and artworkId -> category lookups.
# A single grouped pass replaces re-filtering `artworks` twice per artwork.
# `.min()` picks the smallest value per artwork, which is what the previous
# `np.unique(...)[0]` selected (np.unique returns sorted values).
# Rows whose artworkId is missing are left out of the lookups (groupby drops
# NaN keys) instead of raising IndexError.
_by_artwork = artworks.groupby('artworkId', sort=False)
artwork_artist_dict = _by_artwork['artistId'].min().to_dict()
artwork_type_dict = _by_artwork['category'].min().to_dict()

# Keep the 1st, 3rd and 4th most frequent event names.
# NOTE(review): the 2nd most frequent event is skipped on purpose by these
# hard-coded ranks; confirm which event that is. Fewer than four distinct
# event names still raises IndexError, as before.
_top_events = collections.Counter(user_events['eventName']).most_common(4)
events = [_top_events[rank][0] for rank in (0, 2, 3)]

# Only these events are used when building the interaction matrices.
important_events = user_events[user_events['eventName'].isin(events)]

In [3]:
def artwork_X(artworkIDs, userIDs, events=None):
    """Build a binary artwork-by-user interaction matrix.

    Parameters
    ----------
    artworkIDs : list
        Row labels of the result.
    userIDs : list
        Column labels of the result.
    events : pandas.DataFrame, optional
        Event log with 'userId' and 'artworkId' columns. When omitted, the
        notebook-level ``important_events`` frame is used.

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed by ``artworkIDs`` with ``userIDs`` as columns;
        a cell is 1.0 when the log contains at least one row for that
        (user, artwork) pair and 0.0 otherwise.
    """
    if events is None:
        events = important_events

    matrix = np.zeros((len(artworkIDs), len(userIDs)))
    X = pd.DataFrame(matrix, columns=userIDs, index=artworkIDs)

    known_artworks = set(artworkIDs)
    # Group the log once instead of re-filtering it for every user.
    visited_by_user = events.groupby('userId')['artworkId'].unique().to_dict()

    for user_id in userIDs:
        # Users without any logged event keep an all-zero column.
        hits = [a for a in visited_by_user.get(user_id, ()) if a in known_artworks]
        if hits:
            # One column-wise write per user instead of one write per cell.
            X.loc[hits, user_id] = 1

    return X

In [4]:
def artist_X(artistIDs, userIDs, events=None, artwork_to_artist=None):
    """Build a binary artist-by-user interaction matrix.

    Parameters
    ----------
    artistIDs : list
        Row labels of the result.
    userIDs : list
        Column labels of the result.
    events : pandas.DataFrame, optional
        Event log with 'userId' and 'artworkId' columns. When omitted, the
        notebook-level ``important_events`` frame is used.
    artwork_to_artist : dict, optional
        Mapping artworkId -> artistId. When omitted, the notebook-level
        ``artwork_artist_dict`` is used. Artworks missing from the mapping
        are ignored.

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed by ``artistIDs`` with ``userIDs`` as columns;
        a cell is 1.0 when the user has at least one logged event on an
        artwork mapped to that artist and 0.0 otherwise.
    """
    if events is None:
        events = important_events
    if artwork_to_artist is None:
        artwork_to_artist = artwork_artist_dict

    matrix = np.zeros((len(artistIDs), len(userIDs)))
    X = pd.DataFrame(matrix, columns=userIDs, index=artistIDs)

    known_artists = set(artistIDs)
    # Group the log once instead of re-filtering it for every user.
    visited_by_user = events.groupby('userId')['artworkId'].unique().to_dict()

    for user_id in userIDs:
        # Translate the user's artworks to artists, skipping unmapped artworks.
        artists_seen = {
            artwork_to_artist[a]
            for a in visited_by_user.get(user_id, ())
            if a in artwork_to_artist
        }
        hits = [artist for artist in artists_seen if artist in known_artists]
        if hits:
            # One column-wise write per user instead of one write per cell.
            X.loc[hits, user_id] = 1

    return X

In [5]:
# def type_X(artwork_types,userIDs):
    
#     matrix = np.zeros((len(artwork_types),len(userIDs)))
#     X = pd.DataFrame(matrix, columns = userIDs, index = artwork_types)
    
#     for ID1 in userIDs:
#         artworks_visited = important_events[important_events['userId'] == ID1]['artworkId'].unique().tolist()
#         types = []
#         for a in artworks_visited:
#             if a in artwork_type_dict.keys():
#                 types.append(artwork_type_dict[a])
#         for ID2 in artwork_types:
#             if ID2 in types:
#                 X.loc[ID2,ID1] = 1
    
#     return X

In [23]:
def pca_and_cluster_2D(x, algorithm):
    """Cluster the rows of ``x`` and plot them on two principal components.

    The clustering is fitted on ``x`` as given. Standardisation and PCA are
    used only to obtain the 2-D coordinates for the scatter plot.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Data to cluster, one row per item.
    algorithm : str
        One of 'K-Means', 'OPTICS', 'Mean Shift', 'Agglomerative', 'DBSCAN'.

    Returns
    -------
    pandas.DataFrame
        Columns 'PC1' and 'PC2' (plot coordinates) and 'group' (cluster
        label), one row per row of ``x`` in the same order, with a fresh
        integer index.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names. Previously this
        surfaced as a KeyError on the missing 'group' column.
    """
    supported = ('K-Means', 'OPTICS', 'Mean Shift', 'Agglomerative', 'DBSCAN')
    if algorithm not in supported:
        raise ValueError(
            "Unknown algorithm {!r}; expected one of {}".format(algorithm, supported)
        )

    X = StandardScaler().fit_transform(x)
    pca = PCA(n_components=2)
    pcs = pca.fit_transform(X)

    df = pd.DataFrame(data=pcs, columns=['PC1', 'PC2'])

    if algorithm == 'K-Means':
        model = KMeans(n_clusters=4, init='k-means++', random_state=1)
    elif algorithm == 'OPTICS':
        model = OPTICS(min_samples=5)
    elif algorithm == 'Mean Shift':
        # Estimate the bandwidth on the same data MeanShift is fitted on.
        # It used to be estimated on the PCA coordinates, whose distance
        # scale is unrelated to distances between rows of `x`.
        bandwidth = estimate_bandwidth(x, quantile=0.4)
        model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'Agglomerative':
        model = AgglomerativeClustering()
    else:  # 'DBSCAN'
        model = DBSCAN(eps=0.7, min_samples=5)

    df['group'] = model.fit_predict(x)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 Component PCA', fontsize=20)

    targets = np.unique(df['group'])
    colors = cycle('bgrcmy')

    for target, color in zip(targets, colors):
        if target == -1:
            # -1 is the noise label produced by OPTICS / DBSCAN.
            color = 'black'
        indicesToKeep = df['group'] == target
        # Attach the label to the artist itself so the legend does not rely
        # on the plotting order matching the order of `targets`.
        ax.scatter(df.loc[indicesToKeep, 'PC1'],
                   df.loc[indicesToKeep, 'PC2'],
                   color=color,
                   s=50,
                   label=str(target))
    ax.legend()
    ax.grid()

    return df

In [17]:
def pca_and_cluster_3D(x, algorithm):
    """Cluster the rows of ``x`` and plot them on three principal components.

    The clustering is fitted on ``x`` as given. Standardisation and PCA are
    used only to obtain the 3-D coordinates for the scatter plot.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Data to cluster, one row per item.
    algorithm : str
        One of 'K-Means', 'OPTICS', 'Mean Shift', 'Agglomerative', 'DBSCAN'.

    Returns
    -------
    pandas.DataFrame
        Columns 'PC1', 'PC2', 'PC3' (plot coordinates) and 'group' (cluster
        label), one row per row of ``x`` in the same order, with a fresh
        integer index.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names. Previously this
        surfaced as a KeyError on the missing 'group' column.
    """
    supported = ('K-Means', 'OPTICS', 'Mean Shift', 'Agglomerative', 'DBSCAN')
    if algorithm not in supported:
        raise ValueError(
            "Unknown algorithm {!r}; expected one of {}".format(algorithm, supported)
        )

    X = StandardScaler().fit_transform(x)
    pca = PCA(n_components=3)
    pcs = pca.fit_transform(X)

    df = pd.DataFrame(data=pcs, columns=['PC1', 'PC2', 'PC3'])

    if algorithm == 'K-Means':
        model = KMeans(n_clusters=4, init='k-means++', random_state=1)
    elif algorithm == 'OPTICS':
        # scikit-learn documents min_samples as "int > 1 or float between 0
        # and 1"; the previous value of 1 is outside the integer range.
        # Use the same setting as pca_and_cluster_2D.
        model = OPTICS(min_samples=5)
    elif algorithm == 'Mean Shift':
        # Estimate the bandwidth on the same data MeanShift is fitted on.
        # It used to be estimated on the PCA coordinates, whose distance
        # scale is unrelated to distances between rows of `x`.
        bandwidth = estimate_bandwidth(x, quantile=0.3)
        model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'Agglomerative':
        model = AgglomerativeClustering()
    else:  # 'DBSCAN'
        model = DBSCAN(eps=0.7, min_samples=5)

    df['group'] = model.fit_predict(x)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    targets = np.unique(df['group'])
    colors = cycle('bgrcmy')

    for target, color in zip(targets, colors):
        if target == -1:
            # -1 is the noise label produced by OPTICS / DBSCAN.
            color = 'black'
        indicesToKeep = df['group'] == target
        # Attach the label to the artist itself so the legend does not rely
        # on the plotting order matching the order of `targets`.
        ax.scatter(df.loc[indicesToKeep, 'PC1'],
                   df.loc[indicesToKeep, 'PC2'],
                   df.loc[indicesToKeep, 'PC3'],
                   color=color,
                   s=50,
                   label=str(target))
    ax.legend()
    ax.grid()

    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_zlabel('Principal Component 3', fontsize=15)

    plt.show()

    return df

### Binary & All Artworks

In [8]:
# Binary artwork-by-user matrix: one row per artwork ID, one column per user ID.
X = artwork_X(artworkIDs,userIDs)

In [9]:
# Binary artist-by-user matrix: one row per artist ID, one column per user ID.
artistX = artist_X(artistIDs,userIDs)

In [25]:
# Display the artwork-by-user matrix (the saved output below is a truncated view).
X

Unnamed: 0,3F527B47CD5853F7D2C9A38D9CF09EF6F49F5FCC,2DC42BF61112AA0D52B26B64DE3AC7CC58B4A0B9,1149C0A7E1D38A5996A8750E67BFA617582578FC,374DB0BC48BFF4C1E322D565F4BF96155CB35CBD,640BD4A1001D9B9DB953241075244AD88ACD4167,1F2FA0BB66B3DEFFAB1A6665C450EEF981D003D9,D1941E173AB87A4CF271AE5C01C6154BEEEF68BF,C648E2603EAFD2650C40FD0A4743B0681FA1477D,C776E472E72DF418A79417E7DEB434DDC3D729F6,48DD682022147AC8F5D8C0239364377F3373E80A,...,1A832591E6022032A30A698FAE04FA713E038518,B6DC3CC5C38133A1BBBAB293BA79E18053BC9100,F47026CC936F70A9A1FCAD36C7A418B2083AD0EB,E16DDE3EA2E2121BB2BFB49A07A5C323DFE0BD82,378BE8BDEE9DDBCE99531497F7290A1D086A54B2,1C17166B7FD13FDC3E43F28AF8085A5D3D498F27,EF86C598E55CA05FAEDED18C78D3D9BDE367024E,3BC2576A1625FC1DC4847978140FBD6484BFBFF0,162004C19F274927E39EE9FB031B94AA8D7061A2,77B2E66AED84128838CDAF876FA317DAD79A2A27
MET_96_28,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MET_1975_1_186,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MET_1987_47_1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MET_1993_132,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
MET_66_65_1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MET_14_40_648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MET_14_40_640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MET_32_130_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MET_1975_1_197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# typeX = type_X(artwork_types,userIDs)

### Clustering (OPTICS)

In [24]:
# Cluster the artwork rows of X with OPTICS and plot them on two principal components.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the last run did not finish.
artwork_clusters = pca_and_cluster_2D(X,'OPTICS')

KeyboardInterrupt: 

In [None]:
# Same OPTICS clustering and 2-D plot, applied to the artist-by-user matrix.
artist_clusters = pca_and_cluster_2D(artistX,'OPTICS')

In [None]:
# type_clusters = pca_and_cluster_2D(typeX,'K-Means')

### Putting all groups together

In [None]:
# One row per distinct (artworkId, artistId, category) combination.
group_df = artworks[['artworkId','artistId','category']].drop_duplicates()

# Rows of artist_clusters are in the order of artistIDs (the index that
# artist_X gave the clustered matrix), so the IDs can be attached by position.
artist_clusters['ID'] = artistIDs

# artistId -> cluster label, built once. This replaces filtering
# artist_clusters again for every row of group_df.
artist_to_group = dict(zip(artistIDs, artist_clusters['group'].tolist()))
artist_group = [artist_to_group[artist_id] for artist_id in group_df['artistId']]

# type_clusters['type'] = artwork_types
# type_group = []
# for c in group_df['category']:
#     type_group.append(type_clusters[type_clusters['type'] == c]['group'].tolist()[0])

In [None]:
# Attach the artwork cluster label by artworkId rather than by row position.
# artwork_clusters has one row per entry of artworkIDs, while group_df can
# hold several rows for the same artwork (different artist/category), in
# which case a positional assignment fails with a length mismatch.
artwork_to_group = dict(zip(artworkIDs, artwork_clusters['group'].tolist()))
group_df['artwork group'] = group_df['artworkId'].map(artwork_to_group)
group_df['artist group'] = artist_group
# group_df['type group'] = type_group

### By artistID

In [None]:
# For every artist (in artistIDs order), the sorted distinct artwork-cluster
# labels found among that artist's rows of group_df.
unique_a = [
    np.unique(group_df.loc[group_df['artistId'] == artist_id, 'artwork group'])
    for artist_id in artistIDs
]
# Disabled type-cluster counterpart:
# unique_t = [
#     np.unique(group_df.loc[group_df['artistId'] == artist_id, 'type group'])
#     for artist_id in artistIDs
# ]

In [None]:
# One row per distinct artist, carrying the artist's own cluster label and the
# artwork clusters its works fall into. Both value lists are attached by
# position.
unique_df = (
    group_df[['artistId']]
    .drop_duplicates()
    .assign(**{
        'artist group': artist_clusters['group'].tolist(),
        'unique artwork groups': unique_a,
        # 'unique type groups': unique_t,
    })
)

### By artwork group

In [None]:
# Distinct artwork-cluster labels, in order of first appearance in group_df.
group_numbers = group_df['artwork group'].unique().tolist()

# One boolean row mask per cluster label, reused for every column collected.
group_masks = [group_df['artwork group'] == num for num in group_numbers]

artwork_list = [group_df.loc[mask, 'artworkId'].tolist() for mask in group_masks]
artist_list = [group_df.loc[mask, 'artistId'].tolist() for mask in group_masks]
# type_list = [group_df.loc[mask, 'category'].tolist() for mask in group_masks]

# One row per artwork cluster, with its member artworks and their artists.
artwork_group_df = pd.DataFrame(group_numbers, columns = ['group number']).assign(
    artworks=artwork_list,
    artists=artist_list,
    # types=type_list,
)

In [None]:
def _count_description_words(table, id_column, ids):
    """Count the words in the 'description' of each listed ID.

    For every entry of ``ids`` the first row of ``table`` whose ``id_column``
    equals it is looked up; its description is split on whitespace and each
    token has all non-word characters and underscores removed before it is
    counted. IDs with no matching row or a non-string description (e.g. NaN)
    contribute nothing. An ID listed several times is counted several times.

    Returns a plain dict of word -> count, in order of first occurrence.
    """
    counts = collections.Counter()
    for item_id in ids:
        descriptions = table.loc[table[id_column] == item_id, 'description']
        if descriptions.empty:
            # Unknown ID: skip it instead of failing on descriptions[0].
            continue
        text = descriptions.iloc[0]
        if not isinstance(text, str):
            continue
        # Raw string: same pattern as before, without the invalid-escape warning.
        counts.update(re.sub(r'[\W_]+', '', token) for token in text.split())
    return dict(counts)


# NOTE(review): the names say "g0", but the artists come from row 1 and the
# artworks from row 2 of artwork_group_df. The indices are kept as they were;
# confirm which group is intended.
g0_artists = _count_description_words(artists, 'artistId', artwork_group_df['artists'][1])
g0_artworks = _count_description_words(artworks, 'artworkId', artwork_group_df['artworks'][2])

In [None]:
# Re-order both word-count dicts from most to least frequent; words with the
# same count keep their existing relative order.
g0_artists = dict(sorted(g0_artists.items(), key=lambda pair: pair[1], reverse=True))
g0_artworks = dict(sorted(g0_artworks.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Display the artwork-description word counts.
g0_artworks

### By artist group

In [None]:
# Iterate over the labels that actually occur in the 'artist group' column.
# The previous loop reused `group_numbers`, which holds the *artwork* cluster
# labels, so artist groups with other labels were dropped and artwork-only
# labels produced empty rows.
artist_group_numbers = group_df['artist group'].unique().tolist()

artwork_list = []
artist_list = []
type_list = []

for num in artist_group_numbers:
    members = group_df[group_df['artist group'] == num]
    artwork_list.append(members['artworkId'].tolist())
    artist_list.append(members['artistId'].tolist())
    type_list.append(members['category'].tolist())

# One row per artist cluster, with its artworks, artists and categories.
artist_group_df = pd.DataFrame(artist_group_numbers, columns = ['group number'])

artist_group_df['artworks'] = artwork_list
artist_group_df['artists'] = artist_list
artist_group_df['types'] = type_list

In [None]:
# Display the per-artist-cluster summary table.
artist_group_df

### By type group

In [None]:
# The 'type group' column is only added to group_df when the type clustering
# is enabled; that code is currently commented out, so reading the column
# unconditionally raised KeyError. Without it, an empty summary table with the
# expected columns is produced so the notebook still runs top to bottom.
if 'type group' in group_df.columns:
    # Use the labels that occur in this column, not the artwork-cluster labels.
    type_group_numbers = group_df['type group'].unique().tolist()
else:
    print("'type group' column not found in group_df; type summary left empty.")
    type_group_numbers = []

artwork_list = []
artist_list = []
type_list = []

for num in type_group_numbers:
    members = group_df[group_df['type group'] == num]
    artwork_list.append(members['artworkId'].tolist())
    artist_list.append(members['artistId'].tolist())
    type_list.append(members['category'].tolist())

# One row per type cluster, with its artworks, artists and categories.
type_group_df = pd.DataFrame(type_group_numbers, columns = ['group number'])

type_group_df['artworks'] = artwork_list
type_group_df['artists'] = artist_list
type_group_df['types'] = type_list

In [None]:
# Display the per-type-cluster summary table.
type_group_df