In [1]:
#if the below packages are not installed, uncomment and install first
# import sys
# !{sys.executable} -m pip install pyvis
# !{sys.executable} -m pip install jsonpickle

from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pyvis as pv

# Source tables, one CSV per entity.
artists = pd.read_csv('artists.csv')  # Artist info, i.e. name / date of birth / date of death
artworks = pd.read_csv('artworks.csv')  # Artwork info, i.e. title / date / category
user_favorites = pd.read_csv('user_favorites.csv')  # Loaded for reference; not referenced by the cells below
venues = pd.read_csv('venues.csv')  # Only one venue; this file is not used in the data analysis

# Interaction log, ordered by its 'timestamp' column as soon as it is read.
user_events = pd.read_csv('user_events.csv').sort_values('timestamp')

# Pre-processing

In [2]:
# Unique user IDs, in the order they first appear in user_events
IDs = user_events['userId'].unique()

# Unique artwork IDs from the artworks table, as a plain list
artworkIDs = artworks['artworkId'].unique().tolist()

# Parse the timestamp strings into datetimes for the whole column at once.
# This replaces a loop over a hard-coded row count (44937), which failed as soon
# as the CSV had a different number of rows. Unlike strptime on each value, the
# conversion is also safe to re-run once the column already holds datetimes.
user_events['timestamp'] = pd.to_datetime(user_events['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

# Analysis Functions

In [3]:
def country_interactions(user_events):
    '''
    Count the interactions (rows) recorded for each country.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Event log with a "userCountry" column; every row is one interaction.

    Returns
    -------
    dict
        Maps each distinct "userCountry" value to the number of rows carrying it,
        ordered from most to fewest interactions. Countries with equal counts keep
        the order in which they first appear in the frame.
    '''
    # Single pass over the column instead of one list.count() scan per country.
    # Counter keeps first-appearance order, which the stable sort uses for ties.
    tally = Counter(user_events['userCountry'].tolist())

    return dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))

In [4]:
# NOTE: this rebinds the name `country_interactions` from the function to its result (a dict),
# so re-running this cell fails until the cell defining the function has been run again.
country_interactions = country_interactions(user_events)

In [5]:
def city_interactions(user_events):
    '''
    Count the interactions (rows) recorded for each city.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Event log with a "userCity" column; every row is one interaction.

    Returns
    -------
    dict
        Maps each distinct "userCity" value to the number of rows carrying it,
        ordered from most to fewest interactions. Cities with equal counts keep
        the order in which they first appear in the frame.
    '''
    # Single pass over the column instead of one list.count() scan per city.
    # Counter keeps first-appearance order, which the stable sort uses for ties.
    tally = Counter(user_events['userCity'].tolist())

    return dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))

In [6]:
# NOTE: this rebinds the name `city_interactions` from the function to its result (a dict),
# so re-running this cell fails until the cell defining the function has been run again.
city_interactions = city_interactions(user_events)

In [7]:
def user_interactions(user_events):
    '''
    Count the interactions (rows) recorded for each user.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Event log with a "userId" column; every row is one interaction.

    Returns
    -------
    dict
        Maps each distinct "userId" value to the number of rows carrying it,
        ordered from most to fewest interactions. Users with equal counts keep
        the order in which they first appear in the frame.
    '''
    # Single pass over the column instead of one list.count() scan per user.
    # Counter keeps first-appearance order, which the stable sort uses for ties.
    tally = Counter(user_events['userId'].tolist())

    return dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))

In [8]:
# NOTE: this rebinds the name `user_interactions` from the function to its result (a dict),
# so re-running this cell fails until the cell defining the function has been run again.
user_interactions = user_interactions(user_events)

In [9]:
def user_country_count(user_events):
    '''
    Count how many users come from each country.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Event log with "userId" and "userCountry" columns.

    Returns
    -------
    dict
        Maps a country to the number of users attributed to it, ordered from most
        to fewest users. Each user is attributed to exactly one country: the first
        distinct "userCountry" value found among that user's rows. Countries with
        equal counts keep the order in which they are first met when users are
        visited in groupby (sorted userId) order.
    '''
    # One entry per user: the first distinct country seen for that user.
    countries_per_user = user_events.groupby('userId')['userCountry'].unique().tolist()
    user_countries = [seen[0] for seen in countries_per_user]

    # Single pass instead of a list.count() scan for every user in the list.
    tally = Counter(user_countries)

    return dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))

In [10]:
# Number of users per country, as a dict ordered from most to fewest users.
user_countries = user_country_count(user_events)

In [11]:
def artwork_count(user_events,artworks):
    '''
    Count the interactions recorded for every artwork in the catalogue.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Event log with an "artworkId" column; every row is one interaction.
    artworks : pandas.DataFrame
        Artwork catalogue with an "artworkId" column.

    Returns
    -------
    dict
        Maps every distinct "artworkId" of `artworks` to the number of rows of
        `user_events` carrying it (0 when the artwork never appears), ordered from
        most to fewest interactions. Artworks with equal counts keep their order
        in `artworks`. Event IDs missing from `artworks` are ignored.
    '''
    unique_artworkIDs = artworks['artworkId'].unique().tolist()

    # Tally all events once instead of scanning the event list per artwork.
    event_tally = Counter(user_events['artworkId'].tolist())

    # Counter returns 0 for IDs it has never seen, so unvisited artworks are kept.
    counts = {ID: event_tally[ID] for ID in unique_artworkIDs}

    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

In [12]:
# NOTE: this rebinds the name `artwork_count` from the function to its result (a dict),
# so re-running this cell fails until the cell defining the function has been run again.
# The resulting dict is what later cells pass to create_network().
artwork_count = artwork_count(user_events,artworks)

# Graphing Function

In [13]:
def create_sessions(user_events,max_time):
    '''
    Split each user's events into sessions of consecutive artwork views.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Event log with "userId", "artworkId" and "timestamp" columns. The
        timestamps must already be datetimes, and the rows are assumed to be in
        chronological order (the function does not sort them).
    max_time : int or float
        Maximum gap, in minutes, between two consecutive events of the same user
        for them to belong to the same session (the gap must be strictly smaller).

    Returns
    -------
    list of list
        One list of artwork IDs per session, in viewing order. Users are visited in
        order of first appearance in the frame. An event with no neighbour within
        `max_time` does not form a session, so every session has at least two events.
    '''
    sessions = list()

    # sort=False keeps users in order of first appearance; rows keep their order.
    for _, events in user_events.groupby('userId', sort=False):
        stamps = events['timestamp'].tolist()
        viewed = events['artworkId'].tolist()
        sesh = list()

        for i in range(1, len(stamps)):
            # total_seconds() covers the whole gap; `.seconds` drops whole days,
            # so a gap of 1 day 5 min would wrongly look like 5 min.
            gap_minutes = (stamps[i] - stamps[i-1]).total_seconds() / 60

            if gap_minutes < max_time:
                if not sesh:
                    # A session opens with the event preceding the first short gap.
                    sesh.append(viewed[i-1])
                sesh.append(viewed[i])
            elif sesh:
                # Gap too long: close the running session.
                sessions.append(sesh)
                sesh = list()

        # Flush the session still open at the end of this user's events.
        if sesh:
            sessions.append(sesh)

    return sessions

In [14]:
# Build the list of sessions from the event log; 30 is passed as max_time.
sessions = create_sessions(user_events,30)

In [15]:
def create_wm(sessions,artworkIDs):
    '''
    Build the artwork-to-artwork transition count matrix.

    sessions is a list of artwork ID sequences and artworkIDs supplies the labels
    of both axes. Every pair of consecutive, different artworks (a, b) inside a
    session adds 1 to the cell at row a, column b. Returns a DataFrame of floats
    with len(artworkIDs) rows and columns.
    '''
    size = len(artworkIDs)
    weight_matrix = pd.DataFrame(np.zeros((size, size)), index = artworkIDs, columns = artworkIDs)

    for sesh in sessions:
        # A session that only ever shows one artwork has no transition to record.
        if len(set(sesh)) <= 1:
            continue

        for current, following in zip(sesh, sesh[1:]):
            if current == following:
                continue
            weight_matrix.loc[current, following] += 1

    return weight_matrix


In [16]:
# Transition counts between artworks, with artworkIDs labelling both rows and columns.
weight_mat = create_wm(sessions,artworkIDs)

In [17]:
def create_network(artworkIDs,artwork_count,weight_matrix,min_weight):
    '''
    Build a directed graph of the artwork transitions heavier than a threshold.

    For every ordered pair (ID1, ID2) of artworkIDs whose weight_matrix entry is
    strictly greater than min_weight, both artworks are added as nodes and an edge
    ID1 -> ID2 is added with value = weight ** (1/4).

    Each node carries three attributes: group (the artwork's category), title, and
    size = artwork_count[ID] ** (1/3). Category and title are read from the
    notebook-level `artworks` dataframe, which is not passed in as a parameter.

    Returns the networkx DiGraph.
    '''
    def node_attributes(ID):
        # The first catalogue row matching this ID supplies category and title.
        record = artworks[artworks['artworkId'] == ID]
        return {
            'group': record['category'].tolist()[0],
            'title': record['title'].tolist()[0],
            'size': artwork_count[ID]**(1/3),
        }

    G = nx.DiGraph()

    for ID1 in artworkIDs:
        for ID2 in artworkIDs:
            weight = weight_matrix.loc[ID1,ID2]
            if not (weight > min_weight):
                continue

            G.add_node(ID1, **node_attributes(ID1))
            G.add_node(ID2, **node_attributes(ID2))
            G.add_edge(ID1, ID2, value=weight**(1/4))

    return G

In [19]:
# Build the transition graphs at different thresholds (last argument); an edge is kept
# only when its weight is strictly greater than the threshold.
# `artwork_count` here is the dict of interaction counts, not the function of the same name.
G10 = create_network(artworkIDs,artwork_count,weight_mat,10)
G5 = create_network(artworkIDs,artwork_count,weight_mat,5) # Similar to 10
# Lower thresholds are left disabled; the original notes on them are kept below.
#G2 = create_network(artworkIDs,artwork_count,weight_mat,2) # Main cluster plus couple outside 
#G0 = create_network(artworkIDs,artwork_count,weight_mat,0) # TOO LARGE DO NOT PLOT

In [21]:
from pyvis.network import Network

# Keep only the largest connected component of G5, ignoring edge direction.
# nx.connected_component_subgraphs() was removed in NetworkX 2.4, so the
# subgraph is built from the node set returned by nx.connected_components().
undirected = G5.to_undirected()
largest_nodes = max(nx.connected_components(undirected), key=len)
largest = undirected.subgraph(largest_nodes).copy()

# NOTE(review): `largest` is undirected, so the arrowheads enabled in the options
# below may not reflect the original edge directions of G5 - confirm.
g = Network(height = "100%", width = "100%")

g.from_nx(largest)

# vis.js options: small node labels, semi-transparent straight edges with
# arrowheads, and Barnes-Hut physics settings for the layout.
g.set_options("""
var options = {
  "nodes": {
    "font": {
      "size": 4,
      "face": "tahoma"
    }
  },
  "edges": {
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    },
    "arrowStrikethrough": false,
    "color": {
      "inherit": true,
      "opacity": 0.6
    },
    "smooth": false
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2400,
      "springLength": 115,
      "damping": 0.5,
      "avoidOverlap": 1
    },
    "maxVelocity": 36,
    "minVelocity": 0.75
  }
}
""")

# Writes the interactive visualisation to testing.html.
g.show("testing.html")