In [1]:
%pylab inline

import pandas as pd
from sklearn import metrics
import scipy.linalg 

import sys
sys.path.insert(0, "..")
import pictureweb.distributed.sharded_matrix as sharded_matrix
import pictureweb.distributed.distributed as D 
import pictureweb.utils.misc as misc
from pictureweb.conv.coates_ng_help import grab_patches
import boto3
import io
import concurrent.futures as fs
import itertools

Populating the interactive namespace from numpy and matplotlib


In [2]:
def convert_csv_to_matrix(fname, out_matrix_mmap, sidx, eidx, bucket="sift-lcs-fv-256"):
    """Parse one feature CSV and copy its rows into a slice of the output matrix.

    Every line of ``fname`` is split on commas: the first field is taken as
    the image name and all remaining fields are parsed as floats.

    Args:
        fname: Path of the CSV file to read.
        out_matrix_mmap: Object whose ``load()`` returns the 2-D array that is
            written to.
        sidx: First destination row (inclusive).
        eidx: Last destination row (exclusive); ``eidx - sidx`` must equal the
            number of lines in ``fname``.
        bucket: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        The image names (first CSV field of each line), in file order. An
        empty file yields ``[]`` and leaves the matrix untouched.

    Raises:
        ValueError: If a field cannot be parsed as a float, or the parsed rows
            do not fit the ``[sidx:eidx, :]`` slice.
    """
    out_matrix = out_matrix_mmap.load()
    # Close the file deterministically and split each line only once.
    with open(fname) as csv_file:
        rows = [line.split(",") for line in csv_file]
    if not rows:
        # An empty file would produce a shape-(0,) array that cannot be
        # broadcast into the (0, ncols) destination slice.
        return []
    image_names = [row[0] for row in rows]
    # float() tolerates the trailing newline left on the last field.
    features = np.array([[float(value) for value in row[1:]] for row in rows])
    np.copyto(out_matrix[sidx:eidx, :], features)
    return image_names
    
def convert_csvs_to_matrix_parallel(fnames, out_matrix, bucket="sift-lcs-fv-256", workers=32):
    """Fill ``out_matrix`` from several CSV files using a pool of processes.

    Files are assigned consecutive, non-overlapping row ranges in the order
    they appear in ``fnames``; the size of each range is the file's line
    count. One ``convert_csv_to_matrix`` task is submitted per file.

    Args:
        fnames: Iterable of CSV file paths.
        out_matrix: Handle passed to each worker as ``out_matrix_mmap``.
        bucket: Forwarded unchanged to ``convert_csv_to_matrix``.
        workers: Number of worker processes in the pool.

    Returns:
        ``(out_matrix, image_names)`` where ``image_names[i]`` is the name
        parsed from the line that was written to row ``i``.

    Raises:
        Whatever exception a worker raised, re-raised when its result is
        collected.
    """
    futures = []
    with fs.ProcessPoolExecutor(workers) as executor:
        sidx = 0
        for fname in fnames:
            # Count lines up front so each task knows its destination rows.
            with open(fname) as csv_file:
                num_lines = sum(1 for _ in csv_file)
            eidx = sidx + num_lines
            futures.append(
                executor.submit(
                    convert_csv_to_matrix,
                    fname=fname,
                    out_matrix_mmap=out_matrix,
                    sidx=sidx,
                    eidx=eidx,
                    bucket=bucket,
                )
            )
            sidx = eidx
    # Leaving the ``with`` block waits for every task to finish.
    # Concatenate per-file name lists in submission order so names line up
    # with matrix rows. (Transposing them with zip(*...) would interleave the
    # files and truncate to the shortest one.)
    image_names = list(itertools.chain.from_iterable(f.result() for f in futures))
    return out_matrix, image_names
        
        

In [4]:
# Allocate the training feature matrix as a file-backed memmap at /dev/shm/matrix.
# mode="w+" creates the backing file or overwrites an existing one, so re-running
# this cell discards previously written contents.
# 1281167 x 65536 float64 values is about 672 GB of backing storage.
X_train_mmap_data = np.memmap("/dev/shm/matrix", shape=(1281167, 65536), dtype="float64", mode="w+")

In [7]:
# Wrap the memmap in misc.MmapArray; this wrapper is what gets passed to the
# conversion function, whose workers call .load() on it to obtain the array.
# presumably the wrapper makes the array cheap to hand to worker processes — TODO confirm
X_train_mmap = misc.MmapArray(X_train_mmap_data, mode="r+")

In [8]:
import os

In [9]:
# Build the full path of every entry in /mnt/featuresTrain/.
# os.listdir returns entries in arbitrary order, and the conversion assigns
# matrix rows to files in the order of this list.
fnames = ["/mnt/featuresTrain/{0}".format(i) for i in os.listdir("/mnt/featuresTrain/")]

In [None]:
# Fill the shared matrix from all CSV files with 48 worker processes and time the call.
# NOTE(review): this rebinds `fnames` from the list of CSV paths to the returned
# image names, so re-running this cell would pass image names as file paths.
%time features_mmap, fnames = convert_csvs_to_matrix_parallel(fnames, out_matrix=X_train_mmap, workers=48)

In [170]:
# Display the backing memmap (NumPy abbreviates the repr of large arrays).
X_train_mmap_data

memmap([[ 0.        ,  0.        ,  0.        , ..., -0.0013374 ,
         0.00121143, -0.0011459 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.00506777,
        -0.00563276, -0.00386098],
       [ 0.        ,  0.        ,  0.        , ..., -0.00381275,
        -0.00012364,  0.00184895],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [167]:
# NOTE(review): `features` is not assigned in any cell visible here (the
# conversion cell binds `features_mmap`); presumably left over from earlier
# kernel state — confirm this survives Restart & Run All.
features[0]

array([ 0.        ,  0.        ,  0.        , ..., -0.00189959,
       -0.00322859,  0.00203046])