In [20]:
from sklearn.datasets import get_data_home
from os import remove, makedirs
from os.path import dirname, exists, join
from gzip import GzipFile
import numpy as np
import scipy.sparse as sp
import joblib

In [23]:
from sklearn.datasets._base import _pkl_filepath
from sklearn.datasets._base import _fetch_remote
from sklearn.datasets._base import RemoteFileMetadata
from sklearn.datasets._svmlight_format_io import load_svmlight_files
from sklearn.utils import shuffle as shuffle_
from sklearn.utils import Bunch
from sklearn.utils.validation import _deprecate_positional_args

In [21]:
# Dimensions of the RCV1-v2 dataset; they match the figures printed in the
# DESCR output further down (804414 samples, 47236 features, 103 classes).
N_SAMPLES = 804414
N_FEATURES = 47236
N_CATEGORIES = 103
# Per that same description, the first 23149 samples form the training split.
# NOTE(review): N_TRAIN is not referenced by any cell visible here.
N_TRAIN = 23149
# Root of scikit-learn's local data cache; the RCV1 files are kept in its
# "RCV1" subfolder.
data_home = get_data_home()
rcv1_dir = join(data_home, "RCV1")

In [24]:
# Destinations of the four cached pickles inside rcv1_dir. The helper adjusts
# the file name for the running Python version (the dump outputs below show a
# "_py3" suffix, e.g. sample_id_py3.pkl).
samples_path, sample_id_path, sample_topics_path, topics_path = (
    _pkl_filepath(rcv1_dir, pickle_name)
    for pickle_name in (
        "samples.pkl",
        "sample_id.pkl",
        "sample_topics.pkl",
        "topics_names.pkl",
    )
)

In [25]:
# Holds the opened gzip handles that are later handed to load_svmlight_files.
files = []

In [32]:
# Open every RCV1-v2 vector archive: the four test parts first and the
# training part last. The stacking cell below reads the training matrix and
# ids from Xy[8] / Xy[9], i.e. from the fifth file, so all five handles must
# be present in exactly this order.
# Rebinding `files` (rather than appending one archive per manual run) keeps
# the cell reproducible on a fresh kernel and idempotent on re-run; building
# the paths from rcv1_dir removes the user-specific absolute path.
files = [
    GzipFile(filename=join(rcv1_dir, archive_name))
    for archive_name in (
        "lyrl2004_vectors_test_pt0.dat.gz",
        "lyrl2004_vectors_test_pt1.dat.gz",
        "lyrl2004_vectors_test_pt2.dat.gz",
        "lyrl2004_vectors_test_pt3.dat.gz",
        "lyrl2004_vectors_train.dat.gz",
    )
]

In [33]:
# Display the queued archive handles as a sanity check before parsing.
files

[<gzip _io.BufferedReader name='C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\lyrl2004_vectors_test_pt0.dat.gz' 0x1d66ea3ccc8>,
 <gzip _io.BufferedReader name='C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\lyrl2004_vectors_test_pt1.dat.gz' 0x1d66e384808>,
 <gzip _io.BufferedReader name='C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\lyrl2004_vectors_test_pt2.dat.gz' 0x1d66f0890c8>,
 <gzip _io.BufferedReader name='C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\lyrl2004_vectors_test_pt3.dat.gz' 0x1d66f075d88>,
 <gzip _io.BufferedReader name='C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\lyrl2004_vectors_train.dat.gz' 0x1d66efaf408>]

In [34]:
# Parse all queued archives in a single call so every part is loaded with the
# same N_FEATURES columns. The next cell indexes the result as alternating
# (matrix, ids) entries, one pair per archive.
try:
    Xy = load_svmlight_files(files, n_features=N_FEATURES)
finally:
    # File-like inputs are not closed by the loader, so release the gzip
    # handles here instead of keeping them open for the kernel's lifetime.
    for handle in files:
        handle.close()

In [35]:
# Stack the per-archive results with the training part (fifth file, slots 8
# and 9 of Xy) first, followed by the four test parts in order.
X = sp.vstack([Xy[slot] for slot in (8, 0, 2, 4, 6)]).tocsr()
# Ids are concatenated in the same archive order and stored as uint32.
sample_id = np.hstack(tuple(Xy[slot] for slot in (9, 1, 3, 5, 7))).astype(
    np.uint32, copy=False
)

In [36]:
# Persist the stacked feature matrix and the sample ids to the cache paths.
# The second call is the cell's last expression, so its return value (the
# list of files written) is what the notebook displays.
joblib.dump(X, samples_path, compress=9)
joblib.dump(sample_id, sample_id_path, compress=9)

['C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\sample_id_py3.pkl']

In [37]:
# Running state for the topics parser: counters start at -1 so the first
# category / document seen lands on index 0.
# NOTE(review): the cell that parses the topics archive re-creates these same
# variables before looping, so this cell is redundant when run in order.
n_cat = n_doc = doc_previous = -1
y = np.zeros(shape=(N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
sample_id_bis = np.zeros(shape=N_SAMPLES, dtype=np.int32)
category_names = dict()

In [39]:
# Reload the feature matrix and the sample ids from the pickles written above.
X, sample_id = (
    joblib.load(pickle_path) for pickle_path in (samples_path, sample_id_path)
)

In [38]:
# Location of the topics (qrels) archive inside the RCV1 cache folder.
# Derived from rcv1_dir instead of a user-specific absolute Windows path so
# the notebook also runs on other machines and accounts.
topics_archive_path = join(rcv1_dir, "rcv1v2.topics.qrels.gz")

In [55]:
# Start from a clean state so the cell can be re-run on its own: counters at
# -1 make the first category / document land on index 0.
n_cat = n_doc = doc_previous = -1
y = np.zeros(shape=(N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
sample_id_bis = np.zeros(shape=N_SAMPLES, dtype=np.int32)
category_names = dict()

# Each usable line of the archive has three space-separated fields:
# category, document id, and a third field that is ignored.
with GzipFile(filename=topics_archive_path, mode='rb') as qrels:
    for raw_line in qrels:
        fields = raw_line.decode("ascii").split(" ")
        if len(fields) != 3:
            # Skip anything that does not have exactly three fields.
            continue

        cat, doc, _ = fields

        # Give every newly seen category the next free column of y.
        if cat not in category_names:
            n_cat += 1
            category_names[cat] = n_cat

        # A change of document id starts a new row; this relies on all lines
        # of one document being contiguous in the file.
        doc = int(doc)
        if doc != doc_previous:
            doc_previous = doc
            n_doc += 1
            sample_id_bis[n_doc] = doc

        y[n_doc, category_names[cat]] = 1

In [56]:
def _find_permutation(a, b):
    """find the permutation from a to b"""
    t = np.argsort(a)
    u = np.argsort(b)
    u_ = _inverse_permutation(u)
    return t[u_]

def _inverse_permutation(p):
    """inverse permutation p"""
    n = p.size
    s = np.zeros(n, dtype=np.int32)
    i = np.arange(n, dtype=np.int32)
    np.put(s, p, i)  # s[p] = i
    return s

In [57]:
# Rows of X follow the order of sample_id, while rows of y follow
# sample_id_bis; realign y so that both share the order of sample_id.
permutation = _find_permutation(sample_id_bis, sample_id)
y = y[permutation]

In [58]:
# Store the category names in an array, in the same order as the columns of y.
categories = np.empty(N_CATEGORIES, dtype=object)
for name, column in category_names.items():
    categories[column] = name

In [60]:
# Sort the categories lexicographically, apply the same column order to y,
# and keep the label matrix as sparse CSR.
order = np.argsort(categories)
categories, y = categories[order], sp.csr_matrix(y[:, order])

In [61]:
# Persist the sparse label matrix and the sorted category names to the cache.
# The second call is the cell's last expression, so its return value (the
# list of files written) is what the notebook displays.
joblib.dump(y, sample_topics_path, compress=9)
joblib.dump(categories, topics_path, compress=9)

['C:\\Users\\Articuly\\scikit_learn_data\\RCV1\\topics_names_py3.pkl']

In [62]:
from sklearn.datasets import fetch_rcv1
# Load RCV1 through the public scikit-learn API.
# NOTE(review): presumably this picks up the pickles written above instead of
# downloading the dataset again — confirm.
news = fetch_rcv1()

In [69]:
# Print the dataset description attached to the loaded object.
print(news.DESCR)

.. _rcv1_dataset:

RCV1 dataset
------------

Reuters Corpus Volume I (RCV1) is an archive of over 800,000 manually 
categorized newswire stories made available by Reuters, Ltd. for research 
purposes. The dataset is extensively described in [1]_.

**Data Set Characteristics:**

    Classes                              103
    Samples total                     804414
    Dimensionality                     47236
    Features           real, between 0 and 1

:func:`sklearn.datasets.fetch_rcv1` will load the following 
version: RCV1-v2, vectors, full sets, topics multilabels::

    >>> from sklearn.datasets import fetch_rcv1
    >>> rcv1 = fetch_rcv1()

It returns a dictionary-like object, with the following attributes:

``data``:
The feature matrix is a scipy CSR sparse matrix, with 804414 samples and
47236 features. Non-zero values contains cosine-normalized, log TF-IDF vectors.
A nearly chronological split is proposed in [1]_: The first 23149 samples are
the training set. The last 7812

In [73]:
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import fetch_lfw_pairs

In [74]:
# Resolve the scikit-learn data cache root again and point at the folder
# used for the Labeled Faces in the Wild files.
data_home = get_data_home()
lfw_home = join(data_home, "lfw_home")

In [76]:
# Folder inside lfw_home named after the funneled image set.
# NOTE(review): not referenced by any cell visible here.
data_folder_path = join(lfw_home, "lfw_funneled")

In [81]:
# Location of the funneled LFW archive inside lfw_home. Derived from lfw_home
# instead of a user-specific absolute Windows path so the notebook also runs
# on other machines and accounts.
archive_path = join(lfw_home, "lfw-funneled.gz")

In [82]:
import tarfile

# Unpack the gzip-compressed LFW tarball into lfw_home. The context manager
# closes the tar handle once extraction finishes; the bare
# tarfile.open(...).extractall(...) chain left it open.
# NOTE(review): extractall() trusts the member paths stored in the archive,
# so only run this on an archive obtained from a trusted source.
with tarfile.open(archive_path, "r:gz") as tar:
    tar.extractall(path=lfw_home)

In [83]:
# Load the LFW "pairs" variant of the dataset.
pairs = fetch_lfw_pairs()

In [87]:
# Print the dataset description attached to the loaded pairs object.
print(pairs.DESCR)

.. _labeled_faces_in_the_wild_dataset:

The Labeled Faces in the Wild face recognition dataset
------------------------------------------------------

This dataset is a collection of JPEG pictures of famous people collected
over the internet, all details are available on the official website:

    http://vis-www.cs.umass.edu/lfw/

Each picture is centered on a single face. The typical task is called
Face Verification: given a pair of two pictures, a binary classifier
must predict whether the two images are from the same person.

An alternative task, Face Recognition or Face Identification is:
given the picture of the face of an unknown person, identify the name
of the person by referring to a gallery of previously seen pictures of
identified persons.

Both Face Verification and Face Recognition are tasks that are typically
performed on the output of a model trained to perform Face Detection. The
most popular model for Face Detection is called Viola-Jones and is
implemented in the OpenC

In [88]:
# Load the LFW "people" variant of the dataset.
people = fetch_lfw_people()

In [None]:
# Print the dataset description attached to the loaded people object.
print(people.DESCR)