#### **Implementing Dataset and Dataloader**

Organizing and accessing our text data efficiently is crucial in text processing pipelines. \
We can implement this by extending PyTorch's `Dataset` class in our own `TextDataset` class.

In [33]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Map-style dataset that serves samples straight from an indexable container.

    Attributes:
        text: The wrapped container; each element is one sample.
    """

    def __init__(self, text):
        """Keep a reference to ``text``; the container is not copied."""
        self.text = text

    def __len__(self):
        """Return the number of samples, i.e. ``len(self.text)``."""
        sample_count = len(self.text)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx`` of the container."""
        sample = self.text[idx]
        return sample

Instantiating TextDataset with the encoded text and creating a dataloader.

In [None]:
# Wrap the encoded text in a TextDataset, then batch it two samples at a
# time with shuffling enabled.
dataset = TextDataset(encoded_text)
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
)

#### **Pre-processing using helper functions**

Preprocessing functions let us combine text preprocessing methods into a single helper function. \
We can customize the preprocessing function to include specific techniques depending on the problem.

In [49]:
from torchtext.data.utils import get_tokenizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
import regex as re

def preprocess_sentences(sentences, threshold=0):
    """Lower-case, tokenize, drop stop words, stem and frequency-filter sentences.

    Requires the NLTK 'stopwords' corpus to be available locally.

    Args:
        sentences: Iterable of sentence strings.
        threshold: A token is kept only if it occurs more than ``threshold``
            times within its own sentence (counted after stemming). The
            default of 0 keeps every token; 2 is a commonly used value.

    Returns:
        list[str]: One space-joined string of processed tokens per input
        sentence, in input order.
    """
    # None of these helpers depend on the sentence being processed, so they
    # are built once instead of on every loop iteration.
    tokenizer = get_tokenizer('basic_english')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    processed_sentences = []
    for sentence in sentences:
        tokens = tokenizer(sentence.lower())
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        # Frequencies are per sentence, not over the whole corpus.
        freq_dist = FreqDist(tokens)
        tokens = [token for token in tokens if freq_dist[token] > threshold]
        processed_sentences.append(' '.join(tokens))
    return processed_sentences

def encode_sentences(sentences):
    """Encode sentences as a dense token-count matrix.

    Args:
        sentences: Iterable of sentence strings handed to
            ``CountVectorizer.fit_transform``.

    Returns:
        tuple: ``(counts, vectorizer)`` where ``counts`` is the result of
        ``fit_transform(sentences).toarray()`` and ``vectorizer`` is the
        fitted ``CountVectorizer`` instance.
    """
    vectorizer = CountVectorizer()
    # A distinct local name avoids shadowing this function inside its own body.
    count_matrix = vectorizer.fit_transform(sentences).toarray()
    return count_matrix, vectorizer

def extract_sentences(data):
    """Return every sentence-like span found in ``data``.

    A span starts at an ASCII capital letter, runs over any characters other
    than '.', '!' or '?', and ends with one of those three characters.
    Text that does not fit this shape is skipped.

    Args:
        data: The text to scan.

    Returns:
        list[str]: The matched spans, in order of appearance.
    """
    sentence_pattern = r'[A-Z][^.!?]*[.!?]'
    return re.findall(sentence_pattern, data)

#### **Text Processing Pipeline**

In [50]:
def text_processing_pipeline(text, batch_size=2, shuffle=True):
    """Preprocess sentences, encode them as count vectors and wrap them in a DataLoader.

    Args:
        text: Iterable of sentence strings; it is passed straight to
            ``preprocess_sentences``, which iterates over it.
        batch_size: Number of encoded sentences per batch. Defaults to 2,
            the value that was previously hard-coded.
        shuffle: Whether the DataLoader shuffles the samples. Defaults to
            True, the value that was previously hard-coded.

    Returns:
        tuple: ``(dataloader, vectorizer)`` where ``dataloader`` is a
        ``DataLoader`` over the encoded sentences and ``vectorizer`` is the
        object returned alongside the encodings by ``encode_sentences``.
    """
    processed_sentences = preprocess_sentences(text)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = TextDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader, vectorizer

#### **Applying Text Processing Pipeline**

In [51]:
# Two-sentence toy corpus fed through the pipeline below.
# NOTE(review): the misspellings inside this literal ("contating", "contaninig")
# are runtime data; correcting them would change the tokens the pipeline sees.
text_data = "This is the first sentence text contating first sentence of the text data. And here is another one contaninig one other text sentence."
# Disabled alternative: split with the regex-based extract_sentences helper.
# sentences = extract_sentences(text_data)
# Naive split on ". "; the final piece keeps its trailing period.
sentences = text_data.split(". ")
# Disabled variant that would call the pipeline once per sentence string.
# dataloaders, vectorizer = [text_processing_pipeline(text) for text in sentences]
# text_processing_pipeline returns one DataLoader plus the vectorizer, so
# `dataloaders` holds a single loader despite its plural name.
dataloaders, vectorizer = text_processing_pipeline(sentences)

# Print at most the first 10 columns of the first row of the first batch.
# The pipeline builds its DataLoader with shuffle=True, so which sentence
# appears first may differ between runs.
print(next(iter(dataloaders))[0, :10])

tensor([0, 0, 1, 1, 2, 0, 2, 2])
