In [None]:
import tensorflow as tf

In [1]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import os

from functools import partial

import numpy as np
import pandas
import tensorflow as tf
import datetime

from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio

from util.config import Config
from util.text import text_to_char_array


def read_csvs(csv_files):
    source_data = None
    for csv in csv_files:
        file = pandas.read_csv(csv, encoding='utf-8', na_filter=False)
        #FIXME: not cross-platform
        csv_dir = os.path.dirname(os.path.abspath(csv))
        file['wav_filename'] = file['wav_filename'].str.replace(r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1))) # pylint: disable=cell-var-from-loop
        if source_data is None:
            source_data = file
        else:
            source_data = source_data.append(file)
    return source_data


def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

    return mfccs, tf.shape(mfccs)[0]


def audiofile_to_features(wav_filename,noise_filename=None):
    if noise_filename:
        samples = tf.read_file(wav_filename)
        noise_samples = tf.read_file(noise_filename)
        noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1)
        if len(decoded.audio)>len(noise_decoded) and decoded.sample_rate!=noise_decoded.sample_rate:
            decoded_audio=decoded.audio
        else:
            decoded_audio=decoded.audio+noise_decoded[:len(decoded.audio)]
        features, features_len = samples_to_mfccs(decoded_audio, decoded.sample_rate)
    else:
        samples = tf.read_file(wav_filename)
        decoded = contrib_audio.decode_wav(samples, desired_channels=1)
        features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate)

    return samples


def entry_to_features(wav_filename,noise_filename, transcript):
    # https://bugs.python.org/issue32117
    features, features_len = audiofile_to_features(wav_filename,noise_filename)
    return features, features_len, tf.SparseTensor(*transcript)


def to_sparse_tuple(sequence):
    r"""Creates a sparse representention of ``sequence``.
        Returns a tuple with (indices, values, shape)
    """
    indices = np.asarray(list(zip([0]*len(sequence), range(len(sequence)))), dtype=np.int64)
    shape = np.asarray([1, len(sequence)], dtype=np.int64)
    return indices, sequence, shape


def create_dataset(csvs, batch_size, cache_path=''):
    df = read_csvs(csvs)
    df.sort_values(by='wav_filesize', inplace=True)

    # Convert to character index arrays
    df['transcript'] = df['transcript'].apply(partial(text_to_char_array, alphabet=Config.alphabet))

    def generate_values():
        if "noise_filename" in df.columns:                        
            for _, row in df.iterrows():
                yield row.wav_filename,row.noise_filename,to_sparse_tuple(row.transcript)
        else:
            for _, row in df.iterrows():
                yield row.wav_filename,to_sparse_tuple(row.transcript)                                  

    # Batching a dataset of 2D SparseTensors creates 3D batches, which fail
    # when passed to tf.nn.ctc_loss, so we reshape them to remove the extra
    # dimension here.
    def sparse_reshape(sparse):
        shape = sparse.dense_shape
        return tf.sparse.reshape(sparse, [shape[0], shape[2]])

    def batch_fn(features, features_len, transcripts):
        features = tf.data.Dataset.zip((features, features_len))
        features = features.padded_batch(batch_size,
                                         padded_shapes=([None, Config.n_input], []))
        transcripts = transcripts.batch(batch_size).map(sparse_reshape)
        return tf.data.Dataset.zip((features, transcripts))

    num_gpus = len(Config.available_devices)

    dataset = (tf.data.Dataset.from_generator(generate_values,
                                              output_types=(tf.string, (tf.int64, tf.int32, tf.int64)))
                              .map(entry_to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                              .cache(cache_path)
                              .window(batch_size, drop_remainder=True).flat_map(batch_fn)
                              .prefetch(num_gpus))

    return dataset

def secs_to_hours(secs):
    hours, remainder = divmod(secs, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '%d:%02d:%02d' % (hours, minutes, seconds)


In [None]:
def audiofile_to_features(wav_filename,noise_filename=None):
    if noise_filename is not None:
        samples = tf.read_file(wav_filename)
        noise_samples = tf.read_file(noise_filename)
        decoded = contrib_audio.decode_wav(samples, desired_channels=1)
        noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1)
        '''
        if len(decoded.audio)>len(noise_decoded.audio) and decoded.sample_rate != noise_decoded.sample_rate:
            decoded_audio=decoded.audio
        else:
            decoded_audio = tf.add(decoded.audio+noise_decoded.audio[:len(decoded.audio.eval())])
            features, features_len = samples_to_mfccs(decoded_audio, decoded.sample_rate)
            print("failed if noise")
        '''
        print("SAMPLE RTATE SIZE::-------------")
        print(tf.size(decoded.sample_rate))
        print(tf.size(noise_decoded.sample_rate))

        if_true_cond = tf.cond(tf.size(decoded.audio) > tf.size(noise_decoded.audio), lambda : decoded.audio, lambda : tf.add(decoded.audio, tf.slice(noise_decoded.audio,[0,0],tf.unstack(tf.shape(decoded.audio)))))
        
        print(tf.shape(tf.slice(noise_decoded.audio,[0,0],tf.unstack(tf.shape(decoded.audio)))))
        print(tf.shape(decoded.audio))
        
        decoded_audio = tf.cond(tf.equal(decoded.sample_rate, noise_decoded.sample_rate), lambda: if_true_cond, lambda: decoded.audio)
        decoded_audio = tf.identity(decoded_audio, name="input_with_noise_audio")
        features, features_len = samples_to_mfccs(decoded_audio, decoded.sample_rate)
        
    else:
        samples = tf.read_file(wav_filename)
        decoded = contrib_audio.decode_wav(samples, desired_channels=1)
        features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate)

    return features, features_len

'/home/ubuntu/projects/datasets/LibriSpeech/test-clean-wav/7127-75947-0000.wav'

In [None]:
audiofile_to_features(read_csvs([csv_path])["wav_filename"].iloc[:1][0])

In [15]:

samples = tf.read_file("/home/ubuntu/projects/datasets/LibriSpeech/test-clean-wav/7127-75947-0000.wav")
decoded = contrib_audio.decode_wav(samples, desired_channels=1)
samples1 = tf.read_file("/home/ubuntu/projects/datasets/noise_wav/00noice.wav")
noise_decoded = contrib_audio.decode_wav(samples1, desired_channels=1)

print(tf.size(decoded.sample_rate))
print(tf.size(noise_decoded.sample_rate))

if_true_cond = tf.cond(tf.size(decoded.audio) > tf.size(noise_decoded.audio), lambda : decoded.audio, lambda : tf.add(decoded.audio, tf.slice(noise_decoded.audio,[0,0],tf.unstack(tf.shape(decoded.audio)))))

print(tf.shape(tf.slice(noise_decoded.audio,[0,0],tf.unstack(tf.shape(decoded.audio)))))
print(tf.shape(decoded.audio))

decoded_audio = tf.cond(tf.equal(decoded.sample_rate, noise_decoded.sample_rate), lambda: if_true_cond, lambda: decoded.audio)
decoded_audio = tf.identity(decoded_audio, name="input_with_noise_audio")


sess = tf.Session()
with sess.as_default():
    print(type(decoded.sample_rate.eval()),type(noise_decoded.sample_rate.eval()))
    print(np.dtype(decoded.audio.eval()[0][0]))
    print(len(decoded.audio.eval()),len(noise_decoded.audio.eval()),len(decoded_audio.eval()))
    print(decoded.audio.eval().ravel().reshape((len(noise_decoded.audio.eval().ravel()),1)))

Tensor("Size_1:0", shape=(), dtype=int32)
Tensor("Size_2:0", shape=(), dtype=int32)
Instructions for updating:
Colocations handled automatically by placer.
Tensor("Shape_1:0", shape=(2,), dtype=int32)
Tensor("Shape_2:0", shape=(2,), dtype=int32)


In [13]:
#from __future__ import print_function
#tf.Print(decoded.sample_rate)

sess = tf.Session()
with sess.as_default():
    print(type(decoded.sample_rate.eval()),type(noise_decoded.sample_rate.eval()))
    print(np.dtype(decoded.audio.eval()[0][0]))
    print(len(decoded.audio.eval()),len(noise_decoded.audio.eval()),len(decoded_audio.eval()))
    print(decoded.audio.eval().ravel().reshape((len(noise_decoded.audio.eval().ravel()),1)))
    
# with sess.as_default():
#     print(decoded1.sample_rate.eval())
#     print(type(decoded1.audio.eval()))
#     print(len(decoded1.audio.eval()))
#     print(decoded.audio.eval().ravel()+decoded1.audio.eval().ravel()[:len(decoded.audio.eval())])

<class 'numpy.int32'> <class 'numpy.int32'>
float32
287520 2850048 287520


ValueError: cannot reshape array of size 287520 into shape (2850048,1)

In [None]:
def float_samples_to_int16(y):
    """Convert floating-point numpy array of audio samples to int16."""
    if not issubclass(y.dtype.type, np.floating):
        raise ValueError('input samples not floating-point')
    return (y * np.iinfo(np.int16).max).astype(np.int16)

In [None]:
sess = tf.InteractiveSession()  
with sess.as_default():
    print(decoded1.sample_rate.eval())
    print(decoded1.audio.eval())
    print(len(decoded1.audio.eval()))
    print(decoded.audio.eval().ravel()+decoded1.audio.eval().ravel()[:len(decoded.audio.eval())])
    print(samples_to_wav_data(decoded.audio.eval().ravel()+decoded1.audio.eval().ravel()[:len(decoded.audio.eval())],16000))

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import tempfile
import librosa
import numpy as np
import scipy
import six
def samples_to_wav_data(samples, sample_rate):
    """Converts floating point samples to wav data."""
    wav_io = six.BytesIO()
    scipy.io.wavfile.write("ddddddddddddd.wav", sample_rate, float_samples_to_int16(samples))
    return wav_io.getvalue()

# Merge CVS with Audio augumentation 


In [1]:
import pandas as pd
import glob


In [2]:
root="/home/ubuntu/projects/datasets"
csv_paths=glob.glob(root+"/*.csv")
csv_paths

['/home/ubuntu/projects/datasets/cv-valid-dev.csv',
 '/home/ubuntu/projects/datasets/librivox-dev-other.csv',
 '/home/ubuntu/projects/datasets/cv-valid-test.csv',
 '/home/ubuntu/projects/datasets/ldc93s1.csv',
 '/home/ubuntu/projects/datasets/ted-test.csv',
 '/home/ubuntu/projects/datasets/librivox-train-other-500.csv',
 '/home/ubuntu/projects/datasets/voxforge-test.csv',
 '/home/ubuntu/projects/datasets/cv-valid-train.csv',
 '/home/ubuntu/projects/datasets/cv-other-dev.csv',
 '/home/ubuntu/projects/datasets/voxforge-dev.csv',
 '/home/ubuntu/projects/datasets/ted-dev.csv',
 '/home/ubuntu/projects/datasets/librivox-dev-clean.csv',
 '/home/ubuntu/projects/datasets/librivox-test-other.csv',
 '/home/ubuntu/projects/datasets/cv-other-train.csv',
 '/home/ubuntu/projects/datasets/cv-other-test.csv',
 '/home/ubuntu/projects/datasets/voxforge-train.csv',
 '/home/ubuntu/projects/datasets/cv-invalid.csv',
 '/home/ubuntu/projects/datasets/librivox-train-clean-360.csv',
 '/home/ubuntu/projects/data

In [19]:
train_csv=[]
test_csv=[]
val_csv=[]
for i in csv_paths:
    if "valid-train" in i:
        train_csv.append(i)
    if 'train-clean' in i:
        train_csv.append(i)
    if "ted-train" in i:
        train_csv.append(i)
#     if "voxforge-train" in i:
#         train_csv.append(i)
    if "test" in i:
        test_csv.append(i)
    if "dev" in i:
        val_csv.append(i)
train_csv

['/home/ubuntu/projects/datasets/cv-valid-train.csv',
 '/home/ubuntu/projects/datasets/librivox-train-clean-360.csv',
 '/home/ubuntu/projects/datasets/librivox-train-clean-100.csv',
 '/home/ubuntu/projects/datasets/ted-train.csv']

In [7]:
# reading csv file  
import random

def merge_csv_with_noise(csv_list,csv_type,noise_dir=None):

    for j in range(len(csv_list)):
        if j>0:
            df=pd.read_csv(csv_list[j])
            df1= df1.append(df, ignore_index=True)
        else:
            df1=pd.read_csv(csv_list[j])
    if noise_dir:
        noise_files=glob.glob(noise_dir+"/*.wav")
        df1['noise_filename'] = df1.apply(lambda x: random.choice(noise_files), axis=1)
    else:
        print("noise_dir not found: merge without noise file")
        
    df1.to_csv(csv_type+"_csv_final.csv", index=False)
    print(len(df1))
    #print(df1.iloc[:48])
    

    

In [20]:
noise_dir="/home/ubuntu/projects/datasets/noise_data_30_sec"
merge_csv_with_noise(train_csv,"train",noise_dir)

418810


In [None]:
df = pd.read_csv('/home/ubuntu/projects/datasets/ted-test.csv')
df

In [None]:
from pathlib import Path
for i in range(len(df)):
    my_file = Path(df.iloc[i]["wav_filename"])
    if my_file.is_file():
        pass
    else:
        print(i,df.iloc[0]["wav_filename"])

In [None]:
df.iloc[0]["wav_filename"]

# List of noise fiel

In [None]:
noise_files=glob.glob("/home/ubuntu/projects/datasets/noise_data_30_sec/*.wav")

In [None]:
def fxy(noise_files):
    return 
df1['noise_filename'] = df1.apply(lambda x: random.choice(noise_files), axis=1)

In [None]:
df1.isnull().sum(axis = 0)

In [None]:
 
import tensorflow as tf
import numpy as np
 
trainX = np.linspace(-1, 1, 101)
trainY = 3 * trainX + np.random.randn(*trainX.shape) * 0.33

In [None]:

X = tf.placeholder("float")
Y = tf.placeholder("float")

In [None]:

w = tf.Variable(0.0, name="weights")
y_model = tf.multiply(X, w)
 
cost = (tf.pow(Y-y_model, 2))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

In [None]:
init= tf.global_variables_initializer()

In [None]:
 with tf.Session() as sess:
    sess.run(init)
    for i in range(100):
        for (x, y) in zip(trainX, trainY):
            sess.run(train_op, feed_dict={X: x, Y: y})
    print(sess.run(w))

In [None]:
# x has shape [2, 3, 2]
x = tf.constant([[[1., 2.], [3., 4. ], [5. , 6. ]],
                 [[7., 8.], [9., 10.], [11., 12.]]])

# Extracts x[0, 1:2, :] == [[[ 3.,  4.]]]
res = tf.slice(x, [0, 1, 0], [1, 1, -1])


In [None]:
res

In [None]:
 with tf.Session() as sess:
        sess.run(res)
        print(res.eval())

In [None]:
wav_filename = '/home/ubuntu/projects/datasets/LibriSpeech/test-clean-wav/7127-75947-0000.wav'
noise_filename = "/home/ubuntu/projects/datasets/noise_wav/00noice.wav"
!sox --i "/home/ubuntu/projects/datasets/noise_wav/00noice.wav"

In [None]:
samples = tf.read_file(wav_filename)
noise_samples = tf.read_file(noise_filename)
decoded = contrib_audio.decode_wav(samples, desired_channels=1)
noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1)

In [None]:
#tf.size(decoded.audio) > tf.size(noise_decoded.audio) 

#tf.greater(tf.size(decoded.audio) , tf.size(noise_decoded.audio) )

# and tf.size(decoded.sample_rate) != tf.size(noise_decoded.sample_rate)
#decoded_audio = tf.cond(tf.size(decoded.audio) > tf.size(noise_decoded.audio), lambda : decoded.audio, lambda : tf.add(decoded.audio+noise_decoded.audio[:len(decoded.audio.eval())]))


#transcript = "and you know it"
#tf.SparseTensor(*transcript)
#transcript = tf.constant(transcript)
#tf.SparseTensor(*transcript)
tf.math.equal(decoded.sample_rate , noise_decoded.sample_rate)

In [1]:
import tensorflow as tf
from DeepSpeech import create_inference_graph
from util.config import  Config, initialize_globals
from util.flags import create_flags, FLAGS


In [2]:
sess = tf.InteractiveSession()

In [2]:
create_flags()
#initialize_globals()

In [3]:
initialize_globals()

UnrecognizedFlagError: Unknown command line flag 'f'

In [None]:
#with tf.Session(config=Config.session_config) as session:
inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

saver = tf.train.Saver()