In [14]:
import tensorflow as tf
import warnings
# Silence every Python warning from this point on so the cell outputs stay uncluttered.
# NOTE(review): this runs after `import tensorflow`, so warnings raised while
# importing TensorFlow are not affected by this filter.
warnings.filterwarnings('ignore')

<h3 style='color:green'>Retrieve review file paths into a TensorFlow dataset</h3>

In [15]:
# Collect every file that sits one level below a label folder (reviews/<label>/<file>).
# shuffle=False keeps the listing in a deterministic order instead of the
# randomly shuffled order list_files produces by default.
reviews_ds = tf.data.Dataset.list_files(
    file_pattern='reviews/*/*',
    shuffle=False,
)

In [16]:
# Print each matched path (as raw bytes) to confirm what the glob picked up.
for file_path in reviews_ds.as_numpy_iterator():
    print(file_path)

b'reviews/negative/neg_1.txt'
b'reviews/negative/neg_2.txt'
b'reviews/negative/neg_3.txt'
b'reviews/positive/pos_1.txt'
b'reviews/positive/pos_2.txt'
b'reviews/positive/pos_3.txt'


<h3 style='color:green'>Extract the review text from each file and the label from its folder name</h3>

In [17]:
import os
def extract_review_and_label(file_path):
    """Load one review file and derive its label from the parent folder name.

    Args:
        file_path: String tensor holding the path of a single review file,
            e.g. ``reviews/negative/neg_1.txt``.

    Returns:
        A ``(review, label)`` tuple of string tensors: the full contents of
        the file, and the second-to-last component of ``file_path`` after
        splitting on ``os.path.sep`` (the name of the containing folder).
    """
    review_text = tf.io.read_file(file_path)
    # The path is split on the platform separator; the folder name sits just
    # before the file name, i.e. at index -2.
    path_parts = tf.strings.split(file_path, os.path.sep)
    label = path_parts[-2]
    return review_text, label

In [18]:
# Turn every file path into a (review text, label) pair.
reviews_ds_1 = reviews_ds.map(extract_review_and_label)

# Preview the first 50 bytes of each review next to its label.
for review_bytes, label_bytes in reviews_ds_1.as_numpy_iterator():
    print("Review: ", review_bytes[:50])
    print("Label: ", label_bytes)

Review:  b"Basically there's a family where a little boy (Jak"
Label:  b'negative'
Review:  b'This show was an amazing, fresh & innovative idea '
Label:  b'negative'
Review:  b''
Label:  b'negative'
Review:  b'One of the other reviewers has mentioned that afte'
Label:  b'positive'
Review:  b'A wonderful little production. <br /><br />The fil'
Label:  b'positive'
Review:  b''
Label:  b'positive'


<h3 style='color:green'>Filter blank reviews</h3>

In [19]:
# Keep only reviews that still contain text after surrounding whitespace is
# stripped. Comparing against "" alone would let files holding just a newline
# or spaces slip through as "non-blank".
reviews_ds_2 = reviews_ds_1.filter(
    lambda review, label: tf.strings.length(tf.strings.strip(review)) > 0
)

# Preview the first 50 bytes of each remaining review next to its label.
for review, label in reviews_ds_2.as_numpy_iterator():
    print("Review: ", review[:50])
    print("Label: ", label)

Review:  b"Basically there's a family where a little boy (Jak"
Label:  b'negative'
Review:  b'This show was an amazing, fresh & innovative idea '
Label:  b'negative'
Review:  b'One of the other reviewers has mentioned that afte'
Label:  b'positive'
Review:  b'A wonderful little production. <br /><br />The fil'
Label:  b'positive'


<h3 style='color:green'>Perform map, filter and shuffle all in a single line of code</h3>

In [20]:
# The shuffle buffer must hold at least as many elements as the dataset for a
# uniform shuffle; a smaller buffer only mixes elements within a sliding
# window. The sample folder lists 6 files, so 16 comfortably covers it.
SHUFFLE_BUFFER_SIZE = 16

# map -> filter -> shuffle as one chained expression.
final_ds = (
    reviews_ds
    .map(extract_review_and_label)
    # Drop reviews that are empty or contain only whitespace.
    .filter(lambda review, label: tf.strings.length(tf.strings.strip(review)) > 0)
    .shuffle(SHUFFLE_BUFFER_SIZE)
)

# Preview the first 50 bytes of each shuffled review next to its label.
for review, label in final_ds.as_numpy_iterator():
    print("Review: ", review[:50])
    print("Label: ", label)

Review:  b'This show was an amazing, fresh & innovative idea '
Label:  b'negative'
Review:  b"Basically there's a family where a little boy (Jak"
Label:  b'negative'
Review:  b'One of the other reviewers has mentioned that afte'
Label:  b'positive'
Review:  b'A wonderful little production. <br /><br />The fil'
Label:  b'positive'
