In [9]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from tensorflow import keras as K
from keras import Input
from keras.models import Sequential 
from keras.layers import Dense, Embedding, LSTM, TextVectorization
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset Preparation

We are going to use a sentiment analysis dataset from huggingface called
`winvoker/turkish-sentiment-analysis-dataset` (original link [here]()). The
dataset is consists of ~500000 rows, 3 columns (`text`, `label`, and `dataset`)
and split into a ratio of 90:10 for the train and test splits.

Using the `datasets` library, we can import the dataset using `load_dataset()`.
This enables us to import the whole dataset or just a certain split of the
dataset itself (meaning either the training or testing split). We can also slice
the split to get a certain amount or a certain range of indices from the split
(more about slicing the dataset split [here]())

In [15]:
DATASET_NAME = 'winvoker/turkish-sentiment-analysis-dataset'
ds_train = load_dataset(DATASET_NAME,
                  split='train[:5%]')
ds_test = load_dataset(DATASET_NAME, 
                       split='test[:5%]')
print(type(ds_train), '\n', type(ds_test))
ds_test

<class 'datasets.arrow_dataset.Dataset'> 
 <class 'datasets.arrow_dataset.Dataset'>


Dataset({
    features: ['text', 'label', 'dataset'],
    num_rows: 2448
})

Above is an example of loading a slice of the dataset from the training split (5% of the training split, to be precise). As we can see, it is an `arrow_dataset.Dataset` type object. 

But if we import the whole dataset, we can see that it is infact a `dataset_dict.DatasetDict` object. This is because the `load_dataset()` imports each split as a dictionary index. 

In [6]:
ds = load_dataset(DATASET_NAME)
print(type(ds))
ds
del ds

<class 'datasets.dataset_dict.DatasetDict'>


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'dataset'],
        num_rows: 440679
    })
    test: Dataset({
        features: ['text', 'label', 'dataset'],
        num_rows: 48965
    })
})

For our purposes, we want to split the dataset into three (train, validation, and test). As such, we have to split the training dataset using a 90:10 ratio to create a validation set.
Then create a new `DatasetDict` object based on the new 80:10:10 split.

In [16]:
ds_train = ds_train.remove_columns(column_names='dataset')
ds_test = ds_test.remove_columns(column_names='dataset')

In [17]:
ds_temp = ds_train.train_test_split(test_size=.1)

tf_ds_train = ds_temp['train'].to_tf_dataset(columns='text', 
                                             label_cols='label', 
                                             batch_size=64, 
                                             shuffle=True)
tf_ds_validation = ds_temp['test'].to_tf_dataset(columns='text', 
                                                label_cols='label', 
                                                batch_size=64, 
                                                shuffle=True)
tf_ds_test = ds_test.to_tf_dataset(columns='text', 
                                    label_cols='label', 
                                    batch_size=64, 
                                    shuffle=True)

ds_new = DatasetDict({
    'train': tf_ds_train,
    'validation': tf_ds_validation,
    'test': tf_ds_test,
})
ds_new

DatasetDict({
    train: <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None))>
    validation: <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None))>
    test: <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None))>
})

In [18]:
max_features = 10000
max_sequence_length = 250

vectorization_layer = TextVectorization(
    standardize='lower_and_strip_punctuation', 
    max_tokens=max_features, 
    output_mode='int', 
    output_sequence_length=max_sequence_length
    )

In [40]:
text_train = ds_new['train'].map(lambda x, y: (x,y))
# text_train
vectorization_layer.adapt(text_train.map(lambda x, y: x))

In [43]:
train_ds = vectorization_layer(ds_new['train'])
validation_ds = vectorization_layer(ds_new['validation'])
test_ds = vectorization_layer(ds_new['test'])

TypeError: 'PrefetchDataset' object is not subscriptable

In [None]:
model = Sequential([
    Input(shape=(None, ), dtype="string"), 





    Dense(3, activation="softmax")
])