In [2]:
from scipy.io import loadmat,savemat
import numpy as np
import glob
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
types = ['kmno4', 'set1', 'set2', 'set5', 'set9']

# Extract data.

In [1]:
!tree data/

data/
├── kmno4
│   ├── kmno4_1.zip
│   └── kmno4_2.rar
├── set1
│   ├── set1_1.rar
│   └── set1_2.rar
├── set2
│   ├── set2_1.rar
│   └── set2_2.rar
├── set5
│   ├── set5_1.rar
│   └── set5_2.rar
└── set9
    ├── set9_1.rar
    └── set9_2.rar

5 directories, 10 files


In [None]:
!unzip data/kmno4/kmno4_1.zip

In [None]:
!unrar e data/kmno4/kmno4_2.rar data/kmo4/

In [None]:
%%bash
for i in 1 2 5 9;
do
for j in 1 2;
do
unrar e data/set$i/set$i\_$j.rar data/set$i/
done
done

# Merge scattering and transmittance data and seive uniques

In [4]:
def seive(chemtype,matfile):
    x=loadmat(matfile)
    chemid = types.index(chemtype)
    concentration_string=x['filename1'][0]
    if concentration_string=='outlier':
        return 0, 0
    elif not concentration_string.isnumeric():
        concentration_string = concentration_string[:2]
    concentration=int(concentration_string)
    scattering = x['scattering'][0][0][0]
    transmittance = x['direct'][0][0][0]
    merged = np.concatenate((scattering,transmittance), axis=1)
    uq_merged = np.unique(merged, axis=0)
    uq_count = uq_merged.shape[0]
    type_col = np.ones((uq_count,1))*chemid
    concentration_col = np.ones((uq_count,1))*concentration
    uq_merged = np.concatenate((uq_merged, type_col, concentration_col),axis=1)
    np.savetxt("seived/{}_{:03d}.csv".format(chemtype,concentration), uq_merged, delimiter=",")
    return uq_count, concentration

In [5]:
for chemtype in types:
    total=0
    for matfile in glob.glob('data/{}/*.mat'.format(chemtype)):
        try:
            uq_count, concentration=seive(chemtype,matfile)
            total+=uq_count
            print('%s Concentration = %d Count = %d'%(chemtype,concentration, uq_count))
        except Exception as e:
            print(e, matfile)
    print('%s total=%d'%(chemtype,total))

kmno4 Concentration = 35 Count = 32473
kmno4 Concentration = 40 Count = 34871
kmno4 Concentration = 185 Count = 36541
kmno4 Concentration = 95 Count = 28387
kmno4 Concentration = 115 Count = 30535
kmno4 Concentration = 140 Count = 33561
kmno4 Concentration = 145 Count = 31975
kmno4 Concentration = 125 Count = 31420
kmno4 Concentration = 65 Count = 32314
kmno4 Concentration = 110 Count = 32976
kmno4 Concentration = 165 Count = 18818
kmno4 Concentration = 105 Count = 30729
kmno4 Concentration = 155 Count = 32638
kmno4 Concentration = 45 Count = 35226
kmno4 Concentration = 160 Count = 24738
kmno4 Concentration = 120 Count = 30675
kmno4 Concentration = 180 Count = 32615
kmno4 Concentration = 175 Count = 17324
kmno4 Concentration = 90 Count = 28998
kmno4 Concentration = 170 Count = 26790
kmno4 Concentration = 70 Count = 32801
kmno4 Concentration = 30 Count = 33184
kmno4 Concentration = 50 Count = 32830
kmno4 Concentration = 190 Count = 33761
kmno4 Concentration = 55 Count = 30012
kmno4 Conc

In [6]:
!du -hs seived

2.2G	seived


# Shuffling data

> RAM intensive!

In [47]:
x=np.loadtxt(glob.glob('seived/*csv')[0],delimiter=',')

In [40]:
x.shape

(37663, 18)

In [43]:
np.random.shuffle(x)

In [44]:
y=np.unique(x,axis=0)

In [45]:
y.shape

(37663, 18)

In [46]:
y[:5]

array([[ 2.85806011e-02, -2.56563836e-03,  3.08240732e-02,
         2.89068085e-02,  2.77823068e-02,  2.78575462e-02,
         2.74313262e-02,  3.67847292e-02,  2.85806011e-02,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         2.77823068e-02,  3.80608951e-02,  2.74313262e-02,
         4.69869361e-02,  1.00000000e+00,  1.10000000e+02],
       [ 2.85806011e-02, -2.56563836e-03,  3.08240732e-02,
         2.89068085e-02,  2.77823068e-02,  2.78575462e-02,
         2.74313262e-02,  3.67847292e-02,  2.85806011e-02,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         3.79854788e-02,  2.78575462e-02,  2.74313262e-02,
         4.69869361e-02,  1.00000000e+00,  1.10000000e+02],
       [ 2.85806011e-02, -2.56563836e-03,  3.08240732e-02,
         2.89068085e-02,  2.77823068e-02,  2.78575462e-02,
         2.74313262e-02,  3.67847292e-02,  2.85806011e-02,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         3.79854788e-02,  2.78575462e-02,  3.76345569e

In [17]:
arr = np.array([1,2,3])

In [20]:
pp = np.array([0,2,1])

In [21]:
arr[pp]

array([1, 3, 2])

In [32]:
pp=np.arange(x.shape[0])

In [35]:
np.random.shuffle(pp)

In [36]:
pp[:5]

array([33208,  4192, 35312, 29557, 12025])

In [37]:
y=x[pp]

In [38]:
y[:5]

array([[ 2.85806011e-02, -2.56563836e-03,  8.17723574e-02,
         2.89068085e-02,  3.79854788e-02,  3.80608951e-02,
         3.76345569e-02,  3.67847292e-02,  2.85806011e-02,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         2.77823068e-02,  2.78575462e-02,  3.76345569e-02,
         3.67847292e-02,  1.00000000e+00,  1.10000000e+02],
       [ 3.44862905e-01,  7.63660205e-03,  9.19620142e-02,
         2.89068085e-02,  3.79854788e-02,  2.78575462e-02,
         3.76345569e-02,  4.69869361e-02,  2.36498859e+00,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         2.77823068e-02,  2.78575462e-02,  3.76345569e-02,
         3.67847292e-02,  1.00000000e+00,  1.10000000e+02],
       [ 2.85806011e-02,  7.63660205e-03,  5.12033869e-02,
         3.91079217e-02,  5.83918227e-02,  3.80608951e-02,
         3.76345569e-02,  3.67847292e-02,  3.87832560e-02,
         7.63660205e-03,  4.10137300e-02,  2.89068085e-02,
         4.89469534e+00,  2.78575462e-02,  3.76345569e

> Memory eating Cell bEllow!!!

In [3]:
for i,f in enumerate(glob.glob('seived/*csv')):
    if i==0:
        x = np.loadtxt(f, delimiter=',')
        
    else:
        x = np.concatenate((x,np.loadtxt(f, delimiter=',')), axis=0)
    print(x.shape)

(37663, 18)
(67609, 18)
(105258, 18)
(127309, 18)
(160019, 18)
(188331, 18)
(220411, 18)
(257173, 18)
(282636, 18)
(324226, 18)
(360186, 18)
(388093, 18)
(415751, 18)
(449558, 18)
(485815, 18)
(526538, 18)
(559722, 18)
(577500, 18)
(610924, 18)
(639737, 18)
(672375, 18)
(701373, 18)
(739300, 18)
(776931, 18)
(809615, 18)
(825840, 18)
(863804, 18)
(893245, 18)
(924379, 18)
(963131, 18)
(999810, 18)
(1035060, 18)
(1051519, 18)
(1091524, 18)
(1111435, 18)
(1155430, 18)
(1175680, 18)
(1210172, 18)
(1245398, 18)
(1266574, 18)
(1294290, 18)
(1336657, 18)
(1369470, 18)
(1373992, 18)
(1404527, 18)
(1440996, 18)
(1473310, 18)
(1509856, 18)
(1547922, 18)
(1578015, 18)
(1594211, 18)
(1624521, 18)
(1671086, 18)
(1695588, 18)
(1725311, 18)
(1744129, 18)
(1783496, 18)
(1813355, 18)
(1850149, 18)
(1872155, 18)
(1882281, 18)
(1921816, 18)
(1948911, 18)
(1986131, 18)
(2023392, 18)
(2060607, 18)
(2090912, 18)
(2125378, 18)
(2157851, 18)
(2191232, 18)
(2229373, 18)
(2242813, 18)
(2273542, 18)
(2312099, 1

`x` is in order. So we have to shuffle it.

In [4]:
shuffler = np.arange(x.shape[0])

In [5]:
np.random.shuffle(shuffler)

In [6]:
x = x[shuffler]

In [7]:
x[:5]

array([[ 2.85806011e-02,  7.63660205e-03,  3.08240732e-02,
         3.91079217e-02,  3.79854788e-02,  2.78575462e-02,
         2.74313262e-02,  3.67847292e-02,  2.85806011e-02,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         3.79854788e-02,  3.80608951e-02,  3.76345569e-02,
         4.69869361e-02,  1.00000000e+00,  6.50000000e+01],
       [ 2.85806011e-02,  7.63660205e-03,  3.08240732e-02,
         2.89068085e-02,  3.79854788e-02,  3.80608951e-02,
         3.76345569e-02,  3.67847292e-02,  3.87832560e-02,
        -2.56563836e-03,  3.08240732e-02,  2.89068085e-02,
         3.79854788e-02,  3.80608951e-02,  3.76345569e-02,
         4.69869361e-02,  0.00000000e+00,  1.95000000e+02],
       [ 6.20334588e-01, -2.56563836e-03,  3.08240732e-02,
         3.91079217e-02,  3.79854788e-02,  2.78575462e-02,
         3.76345569e-02,  3.67847292e-02,  6.27260543e+00,
         7.63660205e-03,  3.08240732e-02,  2.89068085e-02,
         3.79854788e-02,  3.80608951e-02,  3.76345569e

In [8]:
x.shape

(5059960, 18)

# Divide data into training, validation and testing and saving

In [19]:
total_data = x.shape[0]

In [23]:
train_size = int(total_data * 70 / 100)
train_size

3541972

In [24]:
val_size = int((total_data - train_size) / 2)
val_size

758994

In [25]:
test_size = val_size
test_size

758994

In [27]:
savemat('all.mat', {'All':x})

In [29]:
!du -hs shuffled

695M	shuffled


In [30]:
savemat('Final_data/data.mat', {'train':x[:train_size], 
                                'val':x[train_size:train_size+val_size], 
                                'test':x[train_size+val_size:]})

In [31]:
!du -hs Final_data

695M	Final_data


> First we considered rows of the data-stream individually. Now we set a window-size of N and take those rows together as input.

# Save data generator friendly

In [5]:
def seive(chemtype,matfile):
    x=loadmat(matfile)
    chemid = types.index(chemtype)
    concentration_string=x['filename1'][0]
    if concentration_string=='outlier':
        return 0, 0
    elif not concentration_string.isnumeric():
        concentration_string = concentration_string[:2]
    concentration=int(concentration_string)
    scattering = x['scattering'][0][0][0]
    transmittance = x['direct'][0][0][0]
    merged = np.concatenate((scattering,transmittance), axis=1)
    #uq_merged = np.unique(merged, axis=0)
    #uq_count = uq_merged.shape[0]
    count = merged.shape[0]
    #type_col = np.ones((uq_count,1))*chemid
    #type_col = np.ones((count,1))*chemid
    #concentration_col = np.ones((uq_count,1))*concentration
    #concentration_col = np.ones((count,1))*concentration
    #uq_merged = np.concatenate((uq_merged, type_col, concentration_col),axis=1)
    #merged = np.concatenate((merged, type_col, concentration_col),axis=1)
    #np.savetxt("seived/{}_{:03d}.csv".format(chemtype,concentration), uq_merged, delimiter=",")
    savemat('unsieved_concat/{}_{:03d}.mat'.format(chemtype, concentration),{'data':merged,
                                                                             'type':chemid, 
                                                                             'concentration':concentration})
    #return uq_count, concentration
    return count, concentration

In [6]:
for chemtype in types:
    total=0
    for matfile in glob.glob('data/{}/*.mat'.format(chemtype)):
        try:
            #uq_count, concentration=seive(chemtype,matfile)
            count, concentration=seive(chemtype,matfile)
            #total+=uq_count
            total+=count
            #print('%s Concentration = %d Count = %d'%(chemtype,concentration, uq_count))
            print('%s Concentration = %d Count = %d'%(chemtype,concentration, count))
        except Exception as e:
            print(e, matfile)
    print('%s total=%d'%(chemtype,total))

kmno4 Concentration = 35 Count = 56250
kmno4 Concentration = 40 Count = 56250
kmno4 Concentration = 185 Count = 56250
kmno4 Concentration = 95 Count = 56250
kmno4 Concentration = 115 Count = 56250
kmno4 Concentration = 140 Count = 56250
kmno4 Concentration = 145 Count = 56250
kmno4 Concentration = 125 Count = 56250
kmno4 Concentration = 65 Count = 56250
kmno4 Concentration = 110 Count = 56250
kmno4 Concentration = 165 Count = 56250
kmno4 Concentration = 105 Count = 56250
kmno4 Concentration = 155 Count = 56250
kmno4 Concentration = 45 Count = 56250
kmno4 Concentration = 160 Count = 56250
kmno4 Concentration = 120 Count = 56250
kmno4 Concentration = 180 Count = 56250
kmno4 Concentration = 175 Count = 56250
kmno4 Concentration = 90 Count = 56250
kmno4 Concentration = 170 Count = 56250
kmno4 Concentration = 70 Count = 56250
kmno4 Concentration = 30 Count = 56250
kmno4 Concentration = 50 Count = 56250
kmno4 Concentration = 190 Count = 56250
kmno4 Concentration = 55 Count = 56250
kmno4 Conc

In [4]:
m=loadmat('unsieved_concat/kmno4_045.mat')

In [5]:
m['data'].shape[]

(56250, 16)