<a href="https://colab.research.google.com/github/amazzoli/ComponentSystemsData/blob/main/tutorials/1_Load_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
# Cloning the repository in the colab folder structure.
# The cell magic must be the first line of the cell, so this comment lives
# below it. The directory check makes re-running the cell a no-op instead of
# a failing `git clone` into an already existing folder.
if [ ! -d ComponentSystemsData/.git ]; then
  git clone https://github.com/amazzoli/ComponentSystemsData.git
fi

Cloning into 'ComponentSystemsData'...


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, sys

sys.path.append('/content/ComponentSystemsData/py_utils/')
import plot_utils as pu

In [8]:
from scipy.sparse import coo_matrix, lil_matrix

class ComponentSystem(object):

  """
  Class storing all the properties of a component system and computing the
  main statistical properties.

  Attributes:
  - objects: pandas table with the list of objects, as passed to __init__.
  - components: pandas table with the list of components, as passed to __init__.
  - sparse_mat: sparse count matrix with components along the rows and
    objects along the columns.
  - consistent: boolean, set only after check_consistency() has been called.
  """

  def __init__(self, objects, components, count_sparse, check_consistency=False):
    """
    Args:
    - objects: pandas table with the list of objects.
    - components: pandas table with the list of components.
    - count_sparse: pandas table with the sparse counts (i,j,n coordinate form),
      read through its 'object_id', 'component_id' and 'count' columns.
    - check_consistency: if True, check_consistency() is run at the end of the
      construction.
    """

    self.objects = objects
    self.components = components

    # Constructing the sparse lil_matrix. It is probably more efficient to pass
    # from the coo_matrix given the way we store the values in count_sparse.
    # The coo_matrix is built as (object_id, component_id) and then transposed,
    # so the stored matrix is indexed as [component_id, object_id].
    row_col_indexes = count_sparse['object_id'], count_sparse['component_id']
    self.sparse_mat = coo_matrix((count_sparse['count'].values, row_col_indexes))
    self.sparse_mat = self.sparse_mat.tolil().transpose()

    if check_consistency:
      self.check_consistency()


  def check_consistency(self):
    """
    Check if the identifiers and the summation of rows and columns in the objects
    and component tables are consistent with the sparse count matrix.

    The outcome is stored in self.consistent and a message is printed for every
    failed check (or a single confirmation message if all of them pass).
    """

    # Objects are along the columns: summing over the rows gives their sizes
    sp_sizes = np.asarray(self.sparse_mat.sum(axis=0)).ravel()
    objects_ok = self._totals_match_table(
      sp_sizes, self.objects, 'size',
      'The number of objects in the sparse matrix is different than in the objects table',
      'Sizes reported in the table do not match the ones in the sparse matrix'
    )

    # Components are along the rows: summing over the columns gives their abundances
    sp_abundances = np.asarray(self.sparse_mat.sum(axis=1)).ravel()
    components_ok = self._totals_match_table(
      sp_abundances, self.components, 'abundance',
      'The number of components in the sparse matrix is different than in the components table',
      'Abundances reported in the table do not match the ones in the sparse matrix'
    )

    self.consistent = objects_ok and components_ok

    if self.consistent:
      print('The tables are consistent')


  @staticmethod
  def _totals_match_table(sp_totals, table, column, count_msg, value_msg):
    """
    Compare the totals computed from the sparse matrix with a table column.

    Args:
    - sp_totals: 1d array of totals, where position i refers to identifier i.
    - table: pandas table whose index contains the identifiers.
    - column: name of the table column holding the expected totals.
    - count_msg: message printed if the number of identifiers does not match.
    - value_msg: message printed if at least one total does not match.

    Returns True if both checks pass, False otherwise.
    """

    matches = True

    # Entries with a zero total are not checked: they leave no trace in the
    # sparse matrix, so they cannot be told apart from missing identifiers
    non_zero = table[table[column] > 0]

    # Checking the number of identifiers (an empty table expects an empty matrix)
    expected_len = max(non_zero.index) + 1 if len(non_zero) > 0 else 0
    if len(sp_totals) != expected_len:
      print(count_msg)
      matches = False

    # Checking the totals, identifier by identifier. Identifiers that are
    # absent from the table are expected to have a zero total
    table_totals = non_zero[column].reindex(np.arange(len(sp_totals)), fill_value=0)
    if np.any(sp_totals != table_totals.values):
      print(value_msg)
      matches = False

    return matches




# Root of the cloned repository inside the Colab file system
repo_folder = '/content/ComponentSystemsData/'

# Metadata table, indexed by its first column. load_system() validates the
# requested label against this index
metadata = pd.read_csv(
  repo_folder + 'metadata.tsv',
  sep='\t',
  index_col=0,
)


def load_system(label):
  """
  Build the ComponentSystem of the dataset identified by `label`.

  Args:
  - label: dataset label, which has to appear in the index of the metadata
    table (read from metadata.tsv).

  Returns a ComponentSystem built from the objects, components and sparse
  count tables stored in the data folder of the dataset.

  Raises ValueError if the label is not in the metadata index.
  """

  if label not in metadata.index:
    raise ValueError('Label {} not found in metadata.tsv'.format(label))

  dataset_dir = repo_folder + 'datasets/{}/data/'.format(label)

  def read_table(file_name, **options):
    # All the tables of a dataset are tab-separated files in the same folder
    return pd.read_csv(dataset_dir + file_name, sep='\t', **options)

  return ComponentSystem(
    read_table('objects.tsv', index_col=0),
    read_table('components.tsv', index_col=0),
    read_table('count_sparse.zip', compression='zip'),
  )

In [9]:
# Load the component system stored under the 'legos' label of metadata.tsv
cs = load_system('legos')

In [29]:
# Show the row of the objects table at integer position 26000
# (iloc is positional, so this is not necessarily the object with index 26000)
cs.objects.iloc[26000]

Unnamed: 0,14628
set_id,8165-1
name,Monster Jumper
year,2009.0
n_themes,1
theme1,Racers
theme2,
theme3,
size,96
vocabulary,41


In [25]:
# Column 20000 of the count matrix, as a dense 1d array. The matrix is indexed
# as [component_id, object_id], so these are the counts of every component in
# the object with id 20000
a = cs.sparse_mat[:,20000].toarray().T[0]
# Identifiers (row positions) of the components with a positive count
parts = np.flatnonzero(a > 0)
# Rows of the components table for those identifiers
cs.components.loc[parts]

Unnamed: 0_level_0,name,part_id,color_id,color,category,material,abundance,occurrence
sparse_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6298,Legs Short with Bright Light Orange Feet Pattern,16709pat38,15,White,Minifig Lower Body,Plastic,6,6
13141,Minifig Hipwear Tail Duck Costume,24779,15,White,Minifig Hipwear,Flexible Plastic,13,12
42158,"Minifig Head Special, Duck with Forehead Tuft,...",41594pr2735,15,White,Minifig Heads,Plastic,6,6
42160,"Hat / Cap Large, 5 Seams, with Pin Attachment",41597,10,Bright Green,Minifig Headwear,Plastic,2,2
79460,"Torso, Bright Green Arms, White Hands [Plain]",973c06h27,10,Bright Green,Minifig Upper Body,Plastic,2,2


In [17]:
# Dense version of the whole count matrix.
# NOTE(review): toarray() materializes every entry, zeros included, which can
# require a lot of memory for large systems; consider converting only a slice
cs.sparse_mat.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])