In [2]:
import json
import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import yaml
from IPython import display
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset

In [3]:
# Load the notebook's settings: parse hyperparameters.yaml (path is relative to
# the working directory) and keep the mapping stored under its top-level
# 'hyperparameters' key in `hp`. safe_load is used, so only plain YAML data
# types are constructed.
with open('hyperparameters.yaml') as f:
    hp = yaml.safe_load(f)['hyperparameters']

In [4]:
class ReviewDataset(Dataset):
    """Dataset over a review DataFrame that is partitioned by a ``split`` column.

    The frame must provide the columns ``review``, ``rating`` and ``split``;
    rows whose ``split`` value is ``'train'``, ``'val'`` or ``'test'`` form the
    three partitions. Exactly one partition is active at a time (see
    ``set_split``); ``len()`` and indexing operate on the active partition.
    """

    def __init__(self, df, vectorizer):
        """Partition ``df`` by split and activate the training partition.

        Args:
            df: DataFrame with ``review``, ``rating`` and ``split`` columns.
            vectorizer: object exposing ``vectorize(text)`` and a
                ``rating_vocab`` with ``lookup_token(rating)``.
        """
        print('ReviewDS class instantiated.')

        self.df = df
        self.vectorizer = vectorizer

        self.train_df = self.df[self.df["split"] == 'train']
        self.val_df = self.df[self.df["split"] == 'val']
        self.test_df = self.df[self.df["split"] == 'test']

        self.train_len = len(self.train_df)
        self.val_len = len(self.val_df)
        self.test_len = len(self.test_df)
        print(f'Train len: {self.train_len}\nVal len: {self.val_len}\nTest len: {self.test_len}')

        # Maps a split name to its (frame, row count) pair for set_split().
        self.lookup_dict = {
            'train': (self.train_df, self.train_len),
            'val': (self.val_df, self.val_len),
            'test': (self.test_df, self.test_len),
        }

        self.set_split('train')

    def set_split(self, split):
        """Select the active partition.

        Args:
            split: one of ``'train'``, ``'val'`` or ``'test'``.

        Raises:
            KeyError: if ``split`` is not a known partition name.
        """
        self.target_split = split
        self.target_df, self.target_len = self.lookup_dict[self.target_split]

    @classmethod
    def make_vectorizer(cls, csv):
        """Build a dataset from ``csv`` with a vectorizer fitted on its train rows.

        Args:
            csv: path (or buffer) accepted by ``pandas.read_csv``.

        Returns:
            A new dataset whose vectorizer was created from the training split only.
        """
        review_df = pd.read_csv(csv)
        # Column lookup by key, matching how __init__ reads the same column.
        train_df = review_df[review_df["split"] == 'train']
        return cls(review_df, ReviewVectorizer.from_df(train_df))

    @classmethod
    def load_vectorizer(cls, csv, vectorizer_path):
        """Build a dataset from ``csv`` using a vectorizer previously saved as JSON.

        Args:
            csv: path (or buffer) accepted by ``pandas.read_csv``.
            vectorizer_path: path of the JSON file written by ``save_vectorizer``.

        Returns:
            A new dataset wrapping the loaded frame and the restored vectorizer.
        """
        review_df = pd.read_csv(csv)
        with open(vectorizer_path) as f:
            vectorizer = ReviewVectorizer.from_serializable(json.load(f))
        return cls(review_df, vectorizer)

    def save_vectorizer(self, vectorizer_path):
        """Write the vectorizer's serializable form to ``vectorizer_path`` as JSON."""
        with open(vectorizer_path, 'w') as f:
            json.dump(self.vectorizer.to_serializable(), f)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self.vectorizer

    def get_batch(self, batch_size):
        """Return how many full batches of ``batch_size`` fit in the active split.

        Despite the name this returns a count, not a batch.
        """
        return len(self) // batch_size

    def __len__(self):
        """Return the number of rows in the active split."""
        return self.target_len

    def __getitem__(self, idx):
        """Return the ``idx``-th row of the active split as model inputs.

        Returns:
            dict with ``'X'`` (the vectorized review text) and ``'y'``
            (the index of the row's rating in the rating vocabulary).
        """
        row = self.target_df.iloc[idx]
        review_vector = self.vectorizer.vectorize(row['review'])
        rating_idx = self.vectorizer.rating_vocab.lookup_token(row['rating'])
        return {
            'X': review_vector,
            'y': rating_idx
        }
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the right device location.

    Args:
        dataset: dataset whose items are dicts of collatable values.
        batch_size: number of items per batch.
        shuffle: whether the DataLoader reshuffles the data on each pass.
        drop_last: whether a final batch smaller than batch_size is dropped.
        device: device every tensor of a batch is moved to.

    Yields:
        dict with the same keys as the dataset items, each value being the
        batched tensor moved to `device`.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        # Build a fresh dict so the collated batch itself is left untouched.
        yield {name: tensor.to(device) for name, tensor in data_dict.items()}

In [None]:
class ReviewVectorizer():
    """Turns review text into a binary bag-of-words vector and ratings into indices.

    Holds two vocabularies: ``review_vocab`` for the words of the review text
    and ``rating_vocab`` for the rating labels.
    """

    def __init__(self, review_vocab, rating_vocab):
        """Store the word vocabulary and the rating vocabulary."""
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    @staticmethod
    def _words(text):
        """Yield the space-separated pieces of ``text`` that are not punctuation.

        The ``not in string.punctuation`` test is a substring test, so it also
        drops the empty strings produced by consecutive spaces.
        """
        for word in text.split(' '):
            if word not in string.punctuation:
                yield word

    def vectorize(self, text):
        """Return a float32 vector with a 1 at the index of every word in ``text``.

        The vector has one slot per entry of ``review_vocab``; word indices
        come from ``review_vocab.lookup_token``.
        """
        encoding = np.zeros(len(self.review_vocab), dtype=np.float32)
        for word in self._words(text):
            encoding[self.review_vocab.lookup_token(word)] = 1
        return encoding

    @classmethod
    def from_df(cls, df, length=None):
        """Build a vectorizer from the ``review`` and ``rating`` columns of ``df``.

        Callers invoke this on the class with only a frame, so it is a
        classmethod and ``length`` is optional.

        Args:
            df: DataFrame with ``review`` (text) and ``rating`` columns.
            length: NOTE(review): the original body never used this argument;
                it is treated here as an optional cap on the number of distinct
                review words kept (most frequent first) -- confirm the intent.
                ``None`` keeps every word.

        Returns:
            A vectorizer whose review vocabulary has an unknown-token entry
            and whose rating vocabulary does not.
        """
        review_vocab = Vocabulary(UNK=True)
        rating_vocab = Vocabulary(UNK=False)
        # Sorted so rating indices do not depend on row order.
        for rating in sorted(set(df['rating'])):
            rating_vocab.add_token(rating)

        word_counts = Counter()
        for review in df['review']:
            word_counts.update(cls._words(review))
        # most_common(None) returns every word, most frequent first.
        for word, _ in word_counts.most_common(length):
            review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by ``to_serializable``."""
        return cls(
            review_vocab=Vocabulary.from_serializable(contents['review_vocab']),
            rating_vocab=Vocabulary.from_serializable(contents['rating_vocab']),
        )

    def to_serializable(self):
        """Return a dict holding the serializable form of both vocabularies."""
        return {
            'review_vocab': self.review_vocab.to_serializable(),
            'rating_vocab': self.rating_vocab.to_serializable(),
        }



In [None]:
class Vocabulary():
    """Two-way mapping between tokens and consecutive integer indices.

    With ``UNK`` enabled, an unknown-token entry is registered and
    ``lookup_token`` returns its index for tokens that were never added.
    """

    def __init__(self, token_to_idx=None, UNK=False, unk_token='<UNK>'):
        """Create a vocabulary, optionally from an existing mapping.

        Args:
            token_to_idx: existing token -> index mapping to start from; it is
                copied, never mutated. ``None`` starts empty.
            UNK: whether to register ``unk_token`` and fall back to it for
                unseen tokens.
            unk_token: the token that stands in for unseen tokens.
        """
        self.token_to_idx = dict(token_to_idx) if token_to_idx else {}
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}

        self.UNK = UNK
        self.unk_token = unk_token
        # -1 marks "no unknown token"; it is never handed out as an index.
        self.unk_index = self.add_token(unk_token) if UNK else -1

    def to_serializable(self):
        """Return a dict of constructor arguments that recreates this vocabulary.

        Note: when dumped to JSON, non-string token keys are written as strings.
        """
        return {
            'token_to_idx': self.token_to_idx,
            'UNK': self.UNK,
            'unk_token': self.unk_token,
        }

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vocabulary from the dict produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Register ``token`` if it is new and return its index."""
        if token in self.token_to_idx:
            return self.token_to_idx[token]
        index = len(self.token_to_idx)
        self.token_to_idx[token] = index
        self.idx_to_token[index] = token
        return index

    def add_tokens(self, tokens):
        """Register every token in ``tokens``; return their indices in order."""
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Return the index of ``token``.

        Unseen tokens map to the unknown-token index when ``UNK`` is enabled.

        Raises:
            KeyError: if ``token`` is unseen and ``UNK`` is disabled.
        """
        if self.UNK:
            return self.token_to_idx.get(token, self.unk_index)
        return self.token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``.

        Raises:
            KeyError: if no token has that index.
        """
        if index not in self.idx_to_token:
            raise KeyError(f'the index ({index}) is not in the Vocabulary')
        return self.idx_to_token[index]

    def __len__(self):
        """Return the number of registered tokens (including the unknown token)."""
        return len(self.token_to_idx)