-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Michael Lee's collection of similarity matrices
- Loading branch information
1 parent
6839a9f
commit dba3829
Showing
4 changed files
with
222 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import logging | ||
import logging | ||
import os | ||
import zipfile | ||
from os.path import join | ||
from pathlib import Path | ||
from typing import Optional, Union | ||
from urllib.request import urlretrieve | ||
|
||
import joblib | ||
import numpy as np | ||
import scipy.io | ||
from sklearn.datasets import _base | ||
from sklearn.utils import Bunch | ||
|
||
# Metadata of the single zip archive (hosted on OSF) that bundles every
# similarity matrix; it is handed to `_base._fetch_remote` for the download.
ARCHIVE = _base.RemoteFileMetadata(
    filename='all.zip',
    url=(
        'https://files.osf.io/v1/resources/ey9vp/providers/osfstorage/'
        '5e7a7065d2927f006fdd1cf9?action=download&direct&version=1'
    ),
    checksum='8c799cdebb00192ecb63f3e28c6eeee0e2f64fcb8dad3bc68982e551f2ae5b1c',
)
|
||
logger = logging.getLogger(__name__)


# Dataset names accepted by `fetch_similarity_matrix`. Each name is also the
# stem of a `<name>.mat` file inside the downloaded archive. The order is kept
# as is because it determines the extraction order and the error message text.
AVAILABLE_SIMILARITIES = [
    'fruit2_romney', 'nonsense_romney', 'furniture_romney', 'kinship_kimrosenberg',
    'rectangle_kruschke', 'vegetables2_romney', 'animalpictures5', 'auditory',
    'druguse', 'faces11', 'fruits', 'dotpatterns',
    'furniture2_romney', 'bodies_viken', 'textures', 'sport_romney',
    'bankwiring', 'morsenumbers', 'faces_busey', 'letters',
    'vehicles_romney', 'vehicles2_romney', 'birds_romney', 'fruit_romney',
    'risks', 'morseall', 'texturemit_heaps', 'cartoonfaces',
    'country_robinsonhefner', 'congress', 'phonemes', 'toys_romney',
    'colour', 'countriessim', 'faces5', 'tools_romney',
    'lines_cohen', 'abstractnumbers', 'countriesdis', 'animalnames11',
    'faces_steyvers', 'weapons2_romney', 'texturebrodatz_heaps', 'fish_romney',
    'flowerpots', 'sizeangle_treat', 'clothing2_romney', 'weapons_romney',
    'clothing_romney', 'animalnames5', 'vegetables_romney', 'animalpictures11',
]
|
||
|
||
def _optional_matrix(value):
    """Return ``value`` as an ndarray, or ``None`` if the matrix is absent."""
    if value is None:
        return None
    matrix = np.asarray(value)
    # Caches written by earlier versions of this function stored a missing
    # matrix as ``np.array(None)``; map that placeholder back to ``None``.
    if matrix.ndim == 0 and matrix.dtype == object and matrix.item() is None:
        return None
    return matrix


def _dataset_from_mat(raw):
    """Convert the variables of one loaded ``.mat`` file into the cached dataset dictionary."""
    return {
        'similarity': _optional_matrix(raw.get('s')),
        'proximity': _optional_matrix(raw.get('d')),
        # `.item()` extracts the scalar explicitly; calling int()/float()
        # directly on an array with ndim > 0 is deprecated since NumPy 1.25.
        'n_objects': int(np.asarray(raw['n']).item()),
        'labels': np.array(raw['labs'], dtype=str),
        'sigma': float(np.asarray(raw.get('sigma_emp', np.nan)).item()),
    }


def fetch_similarity_matrix(name: str, data_home: Optional[os.PathLike] = None, download_if_missing: bool = True
                            ) -> Union[Bunch, np.ndarray]:
    """ Load human similarity judgements, aggregated to a similarity matrix.

        This function provides access to the following similarity matrices:
        `fruit2_romney, nonsense_romney, furniture_romney, kinship_kimrosenberg, rectangle_kruschke,
        vegetables2_romney, animalpictures5, auditory, druguse, faces11, fruits, dotpatterns, furniture2_romney,
        bodies_viken, textures, sport_romney, bankwiring, morsenumbers, faces_busey, letters, vehicles_romney,
        vehicles2_romney, birds_romney, fruit_romney, risks, morseall, texturemit_heaps, cartoonfaces,
        country_robinsonhefner, congress, phonemes, toys_romney, colour, countriessim, faces5, tools_romney,
        lines_cohen, abstractnumbers, countriesdis, animalnames11, faces_steyvers, weapons2_romney,
        texturebrodatz_heaps, fish_romney, flowerpots, sizeangle_treat, clothing2_romney, weapons_romney,
        clothing_romney, animalnames5, vegetables_romney, animalpictures11`.

        On the first call the whole archive is downloaded and every dataset in it is
        converted to a compressed ``<name>.pkz`` cache file, so later calls for any
        name are served from the cache.

        See :ref:`similarity_matrix_dataset` for a detailed description.

        >>> dataset = fetch_similarity_matrix('colour')  # doctest: +REMOTE_DATA
        >>> dataset.labels[:2].tolist()  # doctest: +REMOTE_DATA
        ['434', '445']
        >>> dataset.similarity.shape  # doctest: +REMOTE_DATA
        (14, 14)

        Args:
            name: Name of the similarity dataset
            data_home : optional, default: None
                Specify another download and cache folder for the datasets. By default
                all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
            download_if_missing : optional, default=True
                If False, raise an IOError when the data is not cached locally
                instead of downloading it.

        Returns:
            dataset : :class:`~sklearn.utils.Bunch`
                Dictionary-like object, with the following attributes.

                similarity : ndarray, shape (n_objects, n_objects)
                    Symmetric matrix of normalized object similarities.
                    None for some datasets.
                proximity : ndarray, shape (n_objects, n_objects)
                    Symmetric matrix of normalized pairwise proximities.
                    None for some datasets.
                n_objects: int
                    Number of objects
                labels : (n_objects,)
                    Single word describing each object
                sigma: float
                    Uncertainty of the similarity values.
                    Not available for all datasets (NaN in that case).
                DESCR : string
                    Description of the dataset.

        Raises:
            ValueError: If name is not one of the available similarity datasets.
            IOError: If the data is not locally available, but download_if_missing=False
    """
    if name not in AVAILABLE_SIMILARITIES:
        raise ValueError(f"Unexpected similarity name = {name}. Use one of {AVAILABLE_SIMILARITIES}.")

    data_home = Path(_base.get_data_home(data_home=data_home))
    data_home.mkdir(parents=True, exist_ok=True)

    basepath = Path(_base._pkl_filepath(data_home, 'similarity_collection/'))
    filepath = basepath.joinpath(f'{name}.pkz')
    if not filepath.exists():
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        logger.info('Downloading similarity matrix data from %s to %s', ARCHIVE.url, data_home)

        archive_path = _base._fetch_remote(ARCHIVE, dirname=data_home)
        try:
            basepath.mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(archive_path) as zf:
                # Convert every dataset in one pass so the archive is only needed once.
                for this_name in AVAILABLE_SIMILARITIES:
                    with zf.open(f'{this_name}.mat', 'r') as mat_file:
                        this_dict = _dataset_from_mat(scipy.io.loadmat(mat_file))
                    joblib.dump(this_dict, basepath.joinpath(f'{this_name}.pkz'), compress=6)
                    if this_name == name:
                        data_dict = this_dict
        finally:
            # Remove the downloaded archive even if the extraction failed.
            os.remove(archive_path)
    else:
        data_dict = joblib.load(filepath)
        # Normalize placeholders for missing matrices found in older cache files.
        for key in ('similarity', 'proximity'):
            data_dict[key] = _optional_matrix(data_dict.get(key))

    # The description contains non-ASCII punctuation, so do not rely on the locale encoding.
    descr_path = Path(__file__).parent.joinpath('descr', 'similarity_matrix_dataset.rst')
    fdescr = descr_path.read_text(encoding='utf-8')

    return Bunch(**data_dict,
                 DESCR=fdescr)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
.. _similarity_matrix_dataset: | ||
|
||
Similarity Judgement Matrix datasets | ||
------------------------------------ | ||
|
||
`This collection`_ provides similarity matrices from human similarity judgments on | ||
various different stimuli. The collection was aggregated and published by Michael Lee. | ||
|
||
.. _This collection: https://osf.io/ey9vp/ | ||
|
||
|
||
**Data Sets:** | ||
|
||
These are Michael Lee's descriptions of the datasets (with minor modifications): | ||
|
||
abstractnumbers | ||
Human judgments of the numbers 0-9. From research described in Shepard, R. N., Kilpatrick, D. W., & Cunningham, J. P. (1975). The internal representation of numbers. Cognitive Psychology, 7, 82-138 (with thanks to Josh Tenenbaum). | ||
auditory | ||
Auditory confusions of 25 letters (all excluding ‘o’) and the numbers 0-9. From research reported in Kuennapas, T., & Janson, A-J. (1969). Multidimensional Similarity of Letters. Perceptual and Motor Skills, 28, 3-12. | ||
bankwiring | ||
A sociologist’s judgment of the relationships between 14 bank wiring workers. From research reported in Roethlisberger, F. J., & Dickson, W. J. (1939). Management and the worker. Cambridge, MA: Harvard University Press. | ||
colour | ||
Human judgments of 14 colours, specified by their wavelengths. From research reported in Ekman, G. (1954). Dimensions of color vision. The Journal of Psychology, 38, 467-474. | ||
congress | ||
Voting patterns of 14 members of congress on environmental bills. From raw data presented in Romesburg, H. C. (1984). Cluster analysis for researchers. Belmont, CA: Lifetime Learning Publications. | ||
dotpatterns | ||
Human judgments of 17 dot patterns. From research reported in Glushko, R. J. (1975). Pattern goodness and redundancy revisited: Multidimensional scaling and hierarchical cluster analysis. Perception & Psychophysics, 17(2), 158-162. | ||
druguse | ||
Reported adolescent use of 13 drug types. From research reported in Huba, G. L., Wingard, J. A., & Bentler, P. M. (1981). A comparison of two latent variable causal models for adolescent drug use. Journal of Personality and Social Psychology, 40(1), 180-193. | ||
flowerpots | ||
Human judgments of 16 drawings of flowerpots. From research reported in Gati, I., & Tversky, A. (1982). Representations of qualitative and quantitative dimensions. Journal of Experimental Psychology: Human Perception and Performance, 8(2), 325-340. | ||
fruits | ||
Human judgments of 21 fruits. From research reported in Tversky, A., & Hutchinson, J. W. (1986). Nearest Neighbor Analysis of Psychological Spaces. Psychological Review, 93(1), 3-22. | ||
letters | ||
Kindergarten children’s judgment of perceptual similarity of the 26 capital letters. From research reported in Gibson, E. J., Osser, H., Schiff, W., & Smith, J. (1963). An analysis of critical features of letters, tested by a confusion matrix. Cooperative Research Project No. 639, U.S. Office of Education. | ||
morseall and morsenumbers | ||
Confusion of Morse code signals for numerals and letters (morseall), and for numerals only (morsenumbers). From research reported in Rothkopf, E. Z. (1957). A measure of stimulus similarity and errors in some paired-associate learning tasks. Journal of Experimental Psychology, 53, 94-101. | ||
phonemes | ||
Auditory confusion of 16 consonant phonemes. From research reported in Miller, G. A., & Nicely, P. E. (1955). An analysis of perceptual confusions among some English consonants. Journal of the Acoustical Society of America, 27, 338-352. | ||
risks | ||
Human judgments of 18 risks. From research reported in Johnson, E. J., & Tversky, A. (1984). Representations of Perceptions of Risks. Journal of Experimental Psychology: General, 113(1), 55-70. | ||
rectangles | ||
Human judgments of 16 rectangles. From research described in Chapter 15 of Borg, I., & Lingoes, J. (1987). Multidimensional similarity structure analysis. New York: Springer Verlag. | ||
|
||
The following datasets also contain an empirical estimate of the precision of the similarity measurements: | ||
|
||
country_robinsonhefner | ||
Human judgments (in 1967) of 17 countries. From research reported in Robinson, J. P., & Hefner, R (1967). | ||
Multidimensional Differences in Public and Academic Perceptions of Nations. Journal of Personality and Social Psychology, 7(3), 251-259. | ||
rectangle_kruschke | ||
Human judgments of 8 rectangles with interior line segments. | ||
From research reported in Kruschke, J. K. (1993). Human category learning: Implications for backpropagation models. Connection Science, 5, 3-36. | ||
kinship_kimrosenberg | ||
Human judgments of 15 kinship terms. From research reported in Rosenberg, S., & Kim, M. P. (1975). | ||
The Method of Sorting as a Data-Generating Procedure in Multivariate Research. Multivariate Behavioral Research, 10, 489-502. | ||
romney name datasets: | ||
Human judgments of 21 bird names, 21 clothing names, 21 different clothing names, 21 fish names, | ||
21 fruit names, 21 different fruit names, 21 furniture names, 21 different furniture names, | ||
21 semantically unrelated words, 21 sport names, 21 tool names, 21 toy names, | ||
21 vegetable names, 21 different vegetable names, 21 vehicle names, 21 different vehicle names, | ||
21 weapon names, 21 different weapon names. | ||
All from research reported in Romney, A. K., Brewer, D. D., & Batchelder, W. H. (1993). | ||
Predicting Clustering from Semantic Structure. Psychological Science, 4(1), 28-34, with thanks to Devon Brewer. | ||
|
||
`birds_romney, clothing_romney, clothing2_romney, fish_romney, fruit_romney, fruit2_romney, | ||
furniture_romney, furniture2_romney, nonsense_romney, sport_romney, tools_romney, toys_romney, | ||
vegetables_romney, vegetables2_romney, vehicles_romney, vehicles2_romney, weapons_romney, and weapons2_romney`. | ||
lines_cohen, faces_busey, faces_steyvers, sizeangle_treat, and bodies_viken | ||
Human judgments of 9 lines of different lengths, 60 faces, 7 ‘morphed’ faces, 9 shapes varying in size and angle, 24 bodies varying in “affect and body size”. Mark Steyvers kindly provided Michael Lee with all of these. | ||
texturebrodatz_heaps and texturemit_heaps | ||
Human judgments of 30 Brodatz textures, and 24 MIT textures. Both from research reported in Heaps, C., & Handel, S. (1999). Similarity and Features of Natural Textures. Journal of Experimental Psychology: Human Perception and Performance, 25(2), 299-320. | ||
cartoonfaces, countriessim, and countriesdis | ||
Human judgments of 10 cartoon faces, and forced-choice judgments of 16 countries in a similarity condition and a dissimilarity condition. From the research described in Navarro, D.J., & Lee, M.D. (2004). Common and distinctive features in stimulus representation: A modified version of the contrast model. Psychonomic Bulletin & Review, 11(6), 961–974, and Navarro, D.J., & Lee, M.D. (2002). Commonalities and distinctions in featural stimulus representations. In W.G. Gray & C. D. Schunn, (Eds.), Proceedings of the 24th Annual Conference of the Cognitive Science Society, pp. 685-690. Mahwah, NJ: Erlbaum. | ||
animalpictures5, animalpictures11, and animalpictures21, | ||
Human judgments of 21 animals (presented as pictures on a 5 point scale), of 21 animals (presented as pictures on a 5 point scale), | ||
of 21 animals (presented as pictures on an 11 point scale). | ||
From (as yet; probably never-to-be) unreported research Michael Lee did a while back. | ||
animalnames5, animalnames11 | ||
Human judgments of 21 animals (presented as words on a 5 point scale), of 21 animals (presented as words on an 11 point scale). | ||
From (as yet; probably never-to-be) unreported research Michael Lee did a while back. | ||
faces5 and faces11 | ||
Human judgements of 25 faces (5 point scale), and of 25 faces (11 point scale). | ||
From (as yet; probably never-to-be) unreported research Michael Lee did a while back. | ||
|
||
|
||
Please cite the dataset's paper if you use it in publications. | ||
|
||
These datasets can be downloaded using :func:`cblearn.datasets.fetch_similarity_matrix` with the | ||
corresponding name parameter. Triplet trials can be generated by using 1 - the similarity matrix as a precomputed | ||
distance matrix: ``cblearn.datasets.make_random_triplets(1 - data.similarity, distance='precomputed')``. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters