In [1]:
import sys 
sys.path.append('/Users/lei/home/studyhall/smart-earth-sensing/lib') 
import warnings
warnings.filterwarnings('ignore') 
 
import os 
import re 
import pickle 
import numpy as np

import torch 
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset
 
from tqdm import tqdm 
from matplotlib import pyplot as plt 
 
from sklearn.model_selection import train_test_split 

from utils import * 
from dataset import * 
from h5reader import * 
from codedump import * 
from augmentation import * 

In [17]:
# Label vocabularies. Each ``*_maps`` value is a 2-tuple:
#   [0] class name -> integer id
#   [1] integer id -> class name
# The reverse table is derived from the forward one so the two directions
# cannot drift apart (the hand-written reverse tables used to spell the first
# class 'nomal', which did not round-trip with the 'normal' key).
_soil_name_to_id = {
    'normal': 0,
    'walk': 1,
    'dig': 2,
}
_wall_name_to_id = {
    'normal': 0,
    'shake': 1,
}

soil_maps = (
    _soil_name_to_id,
    {class_id: name for name, class_id in _soil_name_to_id.items()},
)

wall_maps = (
    _wall_name_to_id,
    {class_id: name for name, class_id in _wall_name_to_id.items()},
)

# --- Input locations -------------------------------------------------------
h5_dir_path = "/Users/lei/home/studyhall/smart-earth-sensing/zhoujie/data/h5s"
soil_spec_path = "/Users/lei/home/studyhall/smart-earth-sensing/zhoujie/data/specs_soil.txt"
wall_spec_path = "/Users/lei/home/studyhall/smart-earth-sensing/zhoujie/data/specs_wall.txt"

# --- Acquisition constants -------------------------------------------------
wall_locus_id = 700
soil_locus_id = 570
sr = 2000                    # presumably the sampling rate in Hz -- TODO confirm
measurements4halfsec = 1000  # samples per chunk (half a second if sr is in Hz)

# Number of chunks each recording contributes; the factor 60 assumes
# 60-second recordings -- TODO confirm against the h5 files.
chunks_per_recording = sr * 60 // measurements4halfsec

# --- Load the soil recordings and cut them into fixed-size chunks ----------
raw_recordings, recording_labels = h5s2raw(h5_dir_path, soil_locus_id, soil_maps, soil_spec_path)
chunked = np.array([list(div2chunks(raw, measurements4halfsec)) for raw in raw_recordings])

# Flatten (recording, chunk) into a single sample axis. The leading dimension
# is inferred (-1) instead of being pinned to one particular number of
# recordings, so the cell keeps working when recordings are added or removed.
feature_set = chunked.reshape(-1, measurements4halfsec, 1)

# Every chunk inherits the label of the recording it was cut from.
label_set = np.repeat(recording_labels, chunks_per_recording)

# Fail loudly here rather than training on misaligned features and labels.
if len(feature_set) != len(label_set):
    raise ValueError(
        "chunk/label count mismatch: %d feature chunks vs %d labels"
        % (len(feature_set), len(label_set))
    )

print(np.shape(feature_set))
print(np.shape(label_set))

100%|██████████| 3/3 [00:47<00:00, 15.94s/it]

(8040, 1000, 1)
(8040,)





In [18]:
# Expand every raw chunk into four variants -- the untouched signal plus a
# pitch-shifted, a time-shifted and a noise-injected copy -- each carrying the
# label of the chunk it was derived from.
# NOTE(review): the train/validation split happens after this step, so
# variants of one chunk may land on both sides of the split -- confirm whether
# that is intended.
augmented_feature_set, augmented_label_set = [], []
for idx in tqdm(range(len(feature_set))):
    audio = feature_set[idx]
    pitched = pitch_shift(audio=audio, sr=sr, pitch_factor=2)
    shifted = time_shift(audio=audio, shift_max=50, shift_direction='rand')
    noisy = noise_injection(audio=audio, noise_factor=0.01)

    variants = [audio, pitched, shifted, noisy]
    augmented_feature_set.extend(variants)
    augmented_label_set.extend([label_set[idx]] * len(variants))

print(np.shape(augmented_feature_set))
print(np.shape(augmented_label_set))

# Convert each waveform to MFCC features; the extra list wrapper adds a
# length-1 axis right after the sample axis.
mfcc_samples = [[raw2mfcc(raw)] for raw in tqdm(augmented_feature_set)]
feature_set = np.array(mfcc_samples)
label_set = augmented_label_set  # stays a plain Python list

print(np.shape(feature_set))
print(np.shape(label_set))

100%|██████████| 8040/8040 [23:40<00:00,  5.66it/s]


(32160, 1000, 1)
(32160,)


100%|██████████| 32160/32160 [01:27<00:00, 369.45it/s]


(32160, 1, 51, 64)
(32160,)


In [20]:
# Cache the (features, labels) corpus on disk so the preprocessing above does
# not have to be repeated in a fresh kernel.
soil_corpus_path = '../data/pk_files/soil_corpus.pkl'

# open(..., 'wb') does not create missing parent folders; make sure the target
# directory exists so the dump cannot fail after the long-running steps.
os.makedirs(os.path.dirname(soil_corpus_path), exist_ok=True)

with open(soil_corpus_path, 'wb') as f:
    pickle.dump((feature_set, label_set), f)

In [2]:
# Restore the cached (features, labels) pair.
# Only unpickle files produced by this notebook: pickle can execute arbitrary
# code when loading untrusted data.
with open('../data/pk_files/soil_corpus.pkl', 'rb') as corpus_file:
    soil_corpus = pickle.load(corpus_file)
feature_set, label_set = soil_corpus

In [3]:
def _as_float_tensors(samples):
    """Return a list holding one float32 tensor per sample."""
    return [torch.tensor(sample, dtype=torch.float32) for sample in samples]


def _as_one_hot(labels):
    """Return a list holding one float32 one-hot vector (3 classes) per label."""
    return [
        one_hot(torch.tensor(label, dtype=torch.long), num_classes=3).to(torch.float32)
        for label in labels
    ]


# 70/30 train/validation split with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    feature_set,
    label_set,
    test_size=0.3,
    random_state=52,
)

# Features become float32 tensors, labels become float32 one-hot vectors.
X_train, X_val = _as_float_tensors(X_train), _as_float_tensors(X_val)
y_train, y_val = _as_one_hot(y_train), _as_one_hot(y_val)

# Sanity check: sizes of each partition ...
print("X_train len:", len(X_train))
print("y_train len:", len(y_train))
print("X_val len:", len(X_val))
print("y_val len:", len(y_val))
# ... and the shape of the first sample / label in each.
for first_item in (X_train[0], X_val[0], y_train[0], y_val[0]):
    print(first_item.size())
 

X_train len: 22512
y_train len: 22512
X_val len: 9648
y_val len: 9648
torch.Size([1, 51, 64])
torch.Size([1, 51, 64])
torch.Size([3])
torch.Size([3])


In [4]:
# Wrap each partition in the project's dataset class. The arguments are the
# features, the labels, the number of labels, and None for the fourth argument.
data_train, data_val = (
    Zhoujie_Soil_Dataset(features, targets, len(targets), None)
    for features, targets in ((X_train, y_train), (X_val, y_val))
)

In [5]:
# Persist the ready-to-use (train, validation) dataset pair.
soil_datasets_path = '../data/pk_files/soil_datasets.pkl'

# open(..., 'wb') does not create missing parent folders; create the target
# directory up front so the dump cannot fail with FileNotFoundError.
os.makedirs(os.path.dirname(soil_datasets_path), exist_ok=True)

with open(soil_datasets_path, 'wb') as f:
    pickle.dump((data_train, data_val), f)