Skip to content

Commit

Permalink
Add Michael Lee's collection of similarity matrices
Browse files Browse the repository at this point in the history
  • Loading branch information
dekuenstle committed Jul 30, 2021
1 parent 6839a9f commit dba3829
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 0 deletions.
1 change: 1 addition & 0 deletions cblearn/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ._things_similarity import fetch_things_similarity
from ._imagenet_similarity import fetch_imagenet_similarity
from ._car_similarity import fetch_car_similarity
from ._similarity_matrix import fetch_similarity_matrix

from ._triplet_simulation import make_all_triplets
from ._triplet_simulation import make_random_triplets
Expand Down
129 changes: 129 additions & 0 deletions cblearn/datasets/_similarity_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import logging
import logging
import os
import zipfile
from os.path import join
from pathlib import Path
from typing import Optional, Union
from urllib.request import urlretrieve

import joblib
import numpy as np
import scipy.io
from sklearn.datasets import _base
from sklearn.utils import Bunch

# Remote zip archive ('all.zip') that bundles the whole similarity-matrix collection.
# It is passed to `_base._fetch_remote` by `fetch_similarity_matrix`; the URL points to
# OSF storage of project `ey9vp`.
# NOTE(review): the checksum is presumably the SHA256 digest that scikit-learn's
# `_fetch_remote` verifies after download -- confirm against the sklearn version in use.
ARCHIVE = _base.RemoteFileMetadata(
filename='all.zip',
url='https://files.osf.io/v1/resources/ey9vp/providers/osfstorage/'
'5e7a7065d2927f006fdd1cf9?action=download&direct&version=1',
checksum=('8c799cdebb00192ecb63f3e28c6eeee0e2f64fcb8dad3bc68982e551f2ae5b1c'))

# Module-level logger; used to report the archive download in `fetch_similarity_matrix`.
logger = logging.getLogger(__name__)

# Dataset names accepted by `fetch_similarity_matrix`.
# Each name is looked up as `<name>.mat` inside the downloaded archive and cached as
# `<name>.pkz`; the list is also printed verbatim in the error for an unknown name.
AVAILABLE_SIMILARITIES = [
'fruit2_romney', 'nonsense_romney', 'furniture_romney', 'kinship_kimrosenberg', 'rectangle_kruschke',
'vegetables2_romney', 'animalpictures5', 'auditory', 'druguse', 'faces11', 'fruits', 'dotpatterns',
'furniture2_romney', 'bodies_viken', 'textures', 'sport_romney', 'bankwiring', 'morsenumbers',
'faces_busey', 'letters', 'vehicles_romney', 'vehicles2_romney', 'birds_romney', 'fruit_romney', 'risks',
'morseall', 'texturemit_heaps', 'cartoonfaces', 'country_robinsonhefner', 'congress', 'phonemes',
'toys_romney', 'colour', 'countriessim', 'faces5', 'tools_romney', 'lines_cohen', 'abstractnumbers',
'countriesdis', 'animalnames11', 'faces_steyvers', 'weapons2_romney', 'texturebrodatz_heaps',
'fish_romney', 'flowerpots', 'sizeangle_treat', 'clothing2_romney', 'weapons_romney', 'clothing_romney',
'animalnames5', 'vegetables_romney', 'animalpictures11']


def _optional_matrix(value):
    """Return ``value`` as an ndarray, or ``None`` if no matrix is present.

    ``np.array(None)`` is a 0-d object array, not ``None``. Such a placeholder
    (e.g. read back from a cache file that stored it) is mapped to ``None`` as well,
    so callers can rely on a plain ``is None`` check.
    """
    if value is None:
        return None
    matrix = np.asarray(value)
    if matrix.ndim == 0 and matrix.dtype == object and matrix.item() is None:
        return None
    return matrix


def _scalar(value):
    """Return the single element of a size-1 array (plain scalars pass through)."""
    # `scipy.io.loadmat` wraps MATLAB scalars in 2-d arrays; calling int()/float()
    # directly on an array with ndim > 0 is deprecated in recent NumPy versions.
    return np.asarray(value).item()


def fetch_similarity_matrix(name: str, data_home: Optional[os.PathLike] = None, download_if_missing: bool = True
                            ) -> Union[Bunch, np.ndarray]:
    """ Load human similarity judgements, aggregated to a similarity matrix.

        This function provides access to the following similarity matrices:
        `fruit2_romney, nonsense_romney, furniture_romney, kinship_kimrosenberg, rectangle_kruschke, vegetables2_romney,
        animalpictures5, auditory, druguse, faces11, fruits, dotpatterns, furniture2_romney, bodies_viken,
        textures, sport_romney, bankwiring, morsenumbers, faces_busey, letters, vehicles_romney, vehicles2_romney,
        birds_romney, fruit_romney, risks, morseall, texturemit_heaps, cartoonfaces, country_robinsonhefner, congress,
        phonemes, toys_romney, colour, countriessim, faces5, tools_romney, lines_cohen, abstractnumbers, countriesdis,
        animalnames11, faces_steyvers, weapons2_romney, texturebrodatz_heaps, fish_romney, flowerpots, sizeangle_treat,
        clothing2_romney, weapons_romney, clothing_romney, animalnames5, vegetables_romney, animalpictures11`.

        See :ref:`similarity_matrix_dataset` for a detailed description.

        On the first call the whole archive is downloaded and every dataset in it is
        cached as ``<name>.pkz``; the archive itself is deleted afterwards.
        Later calls read the requested dataset from the cache.

        >>> dataset = fetch_similarity_matrix('colour')  # doctest: +REMOTE_DATA
        >>> dataset.labels[:2].tolist()  # doctest: +REMOTE_DATA
        ['434', '445']
        >>> dataset.similarity.shape  # doctest: +REMOTE_DATA
        (14, 14)

        Args:
            name: Name of the similarity dataset
            data_home : optional, default: None
                Specify another download and cache folder for the datasets. By default
                all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
            download_if_missing : optional, default=True
                If False, raise an IOError when the dataset is not cached locally
                instead of downloading the archive.

        Returns:
            dataset : :class:`~sklearn.utils.Bunch`
                Dictionary-like object, with the following attributes.

                similarity : ndarray, shape (n_objects, n_objects)
                    Symmetric matrix of normalized object similarities.
                    None for some datasets.
                proximity : ndarray, shape (n_objects, n_objects)
                    Symmetric matrix of normalized pairwise proximities.
                    None for some datasets.
                n_objects: int
                    Number of objects
                labels : (n_objects,)
                    Single word describing each object
                sigma: float
                    Uncertainty of the similarity values.
                    Not available for all datasets (NaN in that case).
                DESCR : string
                    Description of the dataset.

        Raises:
            ValueError: If name is not one of the available similarity datasets.
            IOError: If the data is not locally available, but download_if_missing=False
    """
    if name not in AVAILABLE_SIMILARITIES:
        raise ValueError(f"Unexpected similarity name = {name}. Use one of {AVAILABLE_SIMILARITIES}.")

    data_home = Path(_base.get_data_home(data_home=data_home))
    data_home.mkdir(parents=True, exist_ok=True)

    basepath = Path(_base._pkl_filepath(data_home, 'similarity_collection/'))
    filepath = basepath.joinpath(f'{name}.pkz')
    if not filepath.exists():
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        logger.info('Downloading similarity matrix collection from %s to %s', ARCHIVE.url, data_home)

        archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home)
        try:
            basepath.mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(archive_path) as zf:
                # Convert every dataset of the archive in one pass, so that the
                # archive has to be downloaded only once for the whole collection.
                for _this_name in AVAILABLE_SIMILARITIES:
                    with zf.open(f'{_this_name}.mat', 'r') as f:
                        _raw = scipy.io.loadmat(f)
                    _this_dict = {
                        'similarity': _optional_matrix(_raw.get('s')),
                        'proximity': _optional_matrix(_raw.get('d')),
                        'n_objects': int(_scalar(_raw['n'])),
                        'labels': np.array(_raw['labs'], dtype=str),
                        'sigma': float(_scalar(_raw.get('sigma_emp', np.nan))),
                    }
                    _this_filepath = basepath.joinpath(f'{_this_name}.pkz')
                    joblib.dump(_this_dict, _this_filepath, compress=6)
                    if name == _this_name:
                        data_dict = _this_dict
        finally:
            # Do not leave the downloaded archive behind, even if the conversion failed.
            os.remove(archive_path)
    else:
        data_dict = dict(joblib.load(filepath))
        # Cache files may hold a 0-d object array in place of a missing matrix.
        for _key in ('similarity', 'proximity'):
            data_dict[_key] = _optional_matrix(data_dict.get(_key))

    # The description contains non-ASCII characters; do not depend on the locale encoding.
    descr_path = Path(__file__).parent.joinpath('descr', 'similarity_matrix_dataset.rst')
    fdescr = descr_path.read_text(encoding='utf-8')

    return Bunch(**data_dict,
                 DESCR=fdescr)
91 changes: 91 additions & 0 deletions cblearn/datasets/descr/similarity_matrix_dataset.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
.. _similarity_matrix_dataset:

Similarity Judgement Matrix datasets
------------------------------------

`This collection`_ provides similarity matrices from human similarity judgments on
various different stimuli. The collection was aggregated and published by Michael Lee.

.. _This collection: https://osf.io/ey9vp/


**Data Sets:**

These are Michael Lee's descriptions of the datasets (with minor modifications):

abstractnumbers
Human judgments of the numbers 0-9. From research described in Shepard, R. N., Kilpatrick, D. W., & Cunningham, J. P. (1975). The internal representation of numbers. Cognitive Psychology, 7, 82-138 (with thanks to Josh Tenenbaum).
auditory
Auditory confusions of 25 letters (all excluding ‘o’) and the numbers 0-9. From research reported in Kuennapas, T., & Janson, A-J. (1969). Multidimensional Similarity of Letters. Perceptual and Motor Skills, 28, 3-12.
bankwiring
A sociologist’s judgment of the relationships between 14 bank wiring workers. From research reported in Roethlisberger, F. J., & Dickson, W. J. (1939). Management and the worker. Cambridge, MA: Harvard University Press.
colour
Human judgments of 14 colours, specified by their wavelengths. From research reported in Ekman, G. (1954). Dimensions of color vision. The Journal of Psychology, 38, 467-474.
congress
Voting patterns of 14 members of congress on environmental bills. From raw data presented in Romesburg, H. C. (1984). Cluster analysis for researchers. Belmont, CA: Lifetime Learning Publications.
dotpatterns
Human judgments of 17 dot patterns. From research reported in Glushko, R. J. (1975). Pattern goodness and redundancy revisited: Multidimensional scaling and hierarchical cluster analysis. Perception & Psychophysics, 17(2), 158-162.
druguse
Reported adolescent use of 13 drug types. From research reported in Huba, G. L., Wingard, J. A., & Bentler, P. M. (1981). A comparison of two latent variable causal models for adolescent drug use. Journal of Personality and Social Psychology, 40(1), 180-193.
flowerpots
Human judgments of 16 drawings of flowerpots. From research reported in Gati, I., & Tversky, A. (1982). Representations of qualitative and quantitative dimensions. Journal of Experimental Psychology: Human Perception and Performance, 8(2), 325-340.
fruits
Human judgments of 21 fruits. From research reported in Tversky, A., & Hutchinson, J. W. (1986). Nearest Neighbor Analysis of Psychological Spaces. Psychological Review, 93(1), 3-22.
letters
Kindergarten children’s judgment of perceptual similarity of the 26 capital letters. From research reported in Gibson, E. J., Osser, H., Schiff, W., & Smith, J. (1963). An analysis of critical features of letters, tested by a confusion matrix. Cooperative Research Project No. 639, U.S. Office of Education.
morseall and morsenumbers
Confusion of all Morse code signals (numerals and letters), and of the Morse code numerals only. From research reported in Rothkopf, E. Z. (1957). A measure of stimulus similarity and errors in some paired-associate learning tasks. Journal of Experimental Psychology, 53, 94-101.
phonemes
Auditory confusion of 16 consonant phonemes. From research reported in Miller, G. A., & Nicely, P. E. (1955). An analysis of perceptual confusions among some English consonants. Journal of the Acoustical Society of America, 27, 338-352.
risks
Human judgments of 18 risks. From research reported in Johnson, E. J., & Tversky, A. (1984). Representations of Perceptions of Risks. Journal of Experimental Psychology: General, 113(1), 55-70.
rectangles
Human judgments of 16 rectangles. From research described in Chapter 15 of Borg, I., & Lingoes, J. (1987). Multidimensional similarity structure analysis. New York: Springer Verlag.

The following datasets also contain an empirical estimate of the precision of the similarity measurements:

country_robinsonhefner
Human judgments (in 1967) of 17 countries. From research reported in Robinson, J. P., & Hefner, R (1967).
Multidimensional Differences in Public and Academic Perceptions of Nations. Journal of Personality and Social Psychology, 7(3), 251-259.
rectangle_kruschke
Human judgments of 8 rectangles with interior line segments.
From research reported in Kruschke, J. K. (1993). Human category learning: Implications for backpropagation models. Connection Science, 5, 3-36.
kinship_kimrosenberg
Human judgments of 15 kinship terms. From research reported in Rosenberg, S., & Kim, M. P. (1975).
The Method of Sorting as a Data-Generating Procedure in Multivariate Research. Multivariate Behavioral Research, 10, 489-502.
romney name datasets:
Human judgments of 21 bird names, 21 clothing names, 21 different clothing names, 21 fish names,
21 fruit names, 21 different fruit names, 21 furniture names, 21 different furniture names,
21 semantically unrelated words, 21 sport names, 21 tool names, 21 toy names,
21 vegetable names, 21 different vegetable names, 21 vehicle names, 21 different vehicle names,
21 weapon names, 21 different weapon names.
All from research reported in Romney, A. K., Brewer, D. D., & Batchelder, W. H. (1993).
Predicting Clustering from Semantic Structure. Psychological Science, 4(1), 28-34, with thanks to Devon Brewer.

`birds_romney, clothing_romney, clothing2_romney, fish_romney, fruit_romney, fruit2_romney,
furniture_romney, furniture2_romney, nonsense_romney, sport_romney, tools_romney, toys_romney,
vegetables_romney, vegetables2_romney, vehicles_romney, vehicles2_romney, weapons_romney, and weapons2_romney`.
lines_cohen, faces_busey, faces_steyvers, sizeangle_treat, and bodies_viken
Human judgments of 9 lines of different lengths, 60 faces, 7 ‘morphed’ faces, 9 shapes varying in size and angle, 24 bodies varying in “affect and body size”. Mark Steyvers kindly provided Michael Lee with all of these.
texturebrodatz_heaps and texturemit_heaps
Human judgments of 30 Brodatz textures, and 24 MIT textures. Both from research reported in Heaps, C., & Handel, S. (1999). Similarity and Features of Natural Textures. Journal of Experimental Psychology: Human Perception and Performance, 25(2), 299-320.
cartoonfaces, countriessim, and countriesdis
Human judgments of 10 cartoon faces, and forced-choice judgments of 16 countries in a similarity condition and a dissimilarity condition. From the research described in Navarro, D.J., & Lee, M.D. (2004). Common and distinctive features in stimulus representation: A modified version of the contrast model. Psychonomic Bulletin & Review, 11(6), 961–974, and Navarro, D.J., & Lee, M.D. (2002). Commonalities and distinctions in featural stimulus representations. In W.G. Gray & C. D. Schunn, (Eds.), Proceedings of the 24th Annual Conference of the Cognitive Science Society, pp. 685-690. Mahwah, NJ: Erlbaum.
animalpictures5, animalpictures11, and animalpictures21,
Human judgments of 21 animals (presented as pictures on a 5 point scale), of 21 animals (presented as pictures on a 5 point scale),
of 21 animals (presented as pictures on an 11 point scale).
From (as yet; probably never-to-be) unreported research Michael Lee did a while back.
animalnames5, animalnames11
Human judgments of 21 animals (presented as words on a 5 point scale), and of 21 animals (presented as words on an 11 point scale).
From (as yet; probably never-to-be) unreported research Michael Lee did a while back.
faces5 and faces11
Human judgements of 25 faces (5 point scale), and of 25 faces (11 point scale).
From (as yet; probably never-to-be) unreported research Michael Lee did a while back.


Please cite the dataset's paper if you use it in publications.

These datasets can be downloaded using :func:`cblearn.datasets.fetch_similarity_matrix` with the
corresponding ``name`` parameter. Triplet trials can be generated by using 1 - the similarity matrix as a precomputed
distance matrix: ``cblearn.datasets.make_random_triplets(1 - data.similarity, distance='precomputed')``.

1 change: 1 addition & 0 deletions docs/references/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Loaders
datasets.fetch_musician_similarity
datasets.fetch_vogue_cover_similarity
datasets.fetch_things_similarity
datasets.fetch_similarity_matrix


Simulations
Expand Down

0 comments on commit dba3829

Please sign in to comment.