# Recurrent Units and Embedding  🚮
    - Word2Vec & GloVe embeddings 🔥
    - LSTM and GRU units for learning 🧠
    - Attention modules provided 👀 
    - proprocess (doesn't help)

# Imports

In [1]:
# generic
import os
import glob

# Data management
import csv
import pandas as pd
import pickle
from torch.utils.data import Dataset

#generic nlp
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker
import string

# custom embeddings
import torchtext.vocab as vocab
from gensim.models import word2vec
import gensim

# Deep learning models
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Math and plots
import numpy as np
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

# Flag

In [2]:
# proprocessing
REGEX = True
SPELL_CHECK = False
STOP_WORDS = False
LEMMATIZE = False
STEM = False

# embedding type
WORD2VEC = False
GLOVE = True
TOKENIZED = False   # ngram space and back -> unorder and remove words out of set
WORD_CUTOFF = 0
MAX_CUTOFF = 1.0

# model params
HIDDEN_SIZE = 128
SENTENCE_SIZE = 200
CELL_TYPE ='LSTM'
NUM_LAYERS = 1
WEIGHT_FREEZE = False

# EMB hyper params
GLOVE_SIZE = 300
WINDOW_SIZE = 3
VEC_SIZE = 300
MIN_COUNT = WORD_CUTOFF

# GPU

In [3]:
device = torch.device("cuda")

# Data Loader

In [4]:
# filepaths
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'

# gloabal labels
labels = ['hockey', 'nba', 'leagueoflegends', 'soccer', \
          'funny', 'movies', 'anime', 'Overwatch', 'trees', \
          'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones', \
          'conspiracy', 'worldnews', 'wow', 'europe', 'canada', \
          'Music', 'baseball']

## Cleaning and Preprocessing

In [5]:
#load
comment_data = pd.read_csv(train_data)

#load
test_data = pd.read_csv(test_path)

In [6]:
tt = TweetTokenizer()

if REGEX:
    #clean
    comment_data['prep'] = comment_data['comments'].str.replace(r'[^\w\s]+', ' ')
    comment_data['prep'] = comment_data['prep'].str.lower()
    comment_data['prep'] = comment_data['prep'].str.replace('(\d+)', ' num ')
    comment_data['prep'] = comment_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')
    comment_data['prep'] = comment_data['prep'].str.replace(r'\s+', " ")
    comment_data['prep'] = comment_data['prep'].str.replace(" +", " ")
    
    #clean   
    test_data['prep'] = test_data['comments'].str.replace(r'[^\w\s]+', ' ')
    test_data['prep'] = test_data['prep'].str.lower()
    test_data['prep'] = test_data['prep'].str.replace('(\d+)', ' num ')
    test_data['prep'] = test_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')
    test_data['prep'] = test_data['prep'].str.replace(r'\s+', " ")
    test_data['prep'] = test_data['prep'].str.replace(" +", " ")
    

In [7]:
print(comment_data['prep'][0])

honestly buffalo is the correct answer i remember people somewhat joking that buffalo s mantra for starting goalies was win a game get traded i think edmonton s front office was a travesty for the better part of num years but buffalo s systematic destruction of the term competitive was much more responsible for the change to the draft lottery 


In [8]:
if SPELL_CHECK:
    #spellcheck
    spell = SpellChecker(distance=1)
    def spellcheck_col(row):
        row = tt.tokenize(row)
        return " ".join([spell.correction(word) for word in row])

    comment_data['prep'] = comment_data.prep.apply(spellcheck_col)
    test_data['prep'] = test_data.prep.apply(spellcheck_col)

if STOP_WORDS:
    # stopwords
    stop = stopwords.words('english')
    comment_data['prep'] = comment_data.prep.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    test_data['prep'] = test_data.prep.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

if LEMMATIZE:
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    def lemmatize_col(row):
        row = tt.tokenize(row)
        return " ".join([lemmatizer.lemmatize(w) for w in row])

    comment_data['prep'] = comment_data.prep.apply(lemmatize_col)
    test_data['prep'] = test_data.prep.apply(lemmatize_col)

if STEM:
    #stemmer
    stemmer = PorterStemmer()
    def stem_col(row):
        row = tt.tokenize(row)
        return " ".join([stemmer.stem(word) for word in row])

    comment_data['prep'] = comment_data.prep.apply(stem_col)
    test_data['prep'] = test_data.prep.apply(stem_col)


In [9]:
print(comment_data['prep'][0])

honestly buffalo is the correct answer i remember people somewhat joking that buffalo s mantra for starting goalies was win a game get traded i think edmonton s front office was a travesty for the better part of num years but buffalo s systematic destruction of the term competitive was much more responsible for the change to the draft lottery 


# Data Embedding

In [10]:
#split data
clean_data = comment_data['prep'].to_numpy()
clean_labels = comment_data['subreddits'].to_numpy()

training_data = clean_data[:65000]
testing_data = clean_data[65000:]
training_labels = clean_labels[:65000]
testing_labels = clean_labels[65000:]

print(training_data.shape)
print(testing_data.shape)
print(training_labels.shape)
print(testing_labels.shape)

(65000,)
(5000,)
(65000,)
(5000,)


In [11]:
# tokenize and remove min words on "training set"
tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, ngram_range=(1,1), min_df=WORD_CUTOFF, max_df=MAX_CUTOFF)
tfidf_vectorizer.fit(training_data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fdca05a7ba8>>,
                use_idf=True, vocabulary=None)

In [12]:
# filter out bad words for both sets
training_vec = tfidf_vectorizer.transform(training_data)
testing_vec = tfidf_vectorizer.transform(testing_data)

print(training_vec.shape)
print(testing_vec.shape)

(65000, 61247)
(5000, 61247)


In [13]:
# revert back to word space - unordered tokens
training_set = tfidf_vectorizer.inverse_transform(training_vec) 
testing_set = tfidf_vectorizer.inverse_transform(testing_vec)

#generate vocab space
items = []
for comment in training_set:
    items.extend(comment)
    
set_vocab = set(items)
word_to_ix = {word: i+1 for i, word in enumerate(set_vocab)} # leave zero index for null

print("vocab size:", len(set_vocab))

matrix_len = len(set_vocab)+1 # allow set buffer
weights_matrix = np.zeros((matrix_len, VEC_SIZE), dtype = np.float32) #


# w2v auto encoding
if WORD2VEC:
    
    sentences = []
    for comment in training_data:
        sentences.append(comment.split())
    
    word2Vec = gensim.models.Word2Vec(
        sentences, 
        size=VEC_SIZE, 
        window=WINDOW_SIZE, 
        min_count=MIN_COUNT, 
        iter=3
    )
    
    badwords = []    
    for i, word in enumerate(set_vocab):
        try: 
            weights_matrix[i+1] = word2Vec.wv[word]
        except KeyError:
            badwords.append(word)
            weights_matrix[i+1] = np.random.normal(scale=0.6, size=(VEC_SIZE, ))
    
    print(len(badwords))

# glove preembeddings
if GLOVE:
    
    glove = vocab.GloVe(name='6B', dim=GLOVE_SIZE)
    
    badwords = []
    weights_matrix = np.zeros((matrix_len, GLOVE_SIZE), dtype=np.float32)    
    for i, word in enumerate(set_vocab):
        try: 
            weights_matrix[i+1] = glove[word]
        except KeyError:
            badwords.append(word)
            weights_matrix[i+1] = np.random.normal(scale=0.6, size=(GLOVE_SIZE, ))
    
    print(len(badwords))

vocab size: 61247
0


In [14]:
# create objects for loaders:
training_obj = []
testing_obj = []

if TOKENIZED:
    
    ## Untokenize to proper format
    
    print("tokenizing space!")
    for i in range(len(training_set)):
        elem = TreebankWordDetokenizer().detokenize(training_set[i])
        training_obj.append((elem, training_labels[i]))

    
    for i in range(len(testing_set)):
        elem = TreebankWordDetokenizer().detokenize(testing_set[i])
        testing_obj.append((elem, testing_labels[i]))

else:
    for i in range(training_data.shape[0]):
        training_obj.append((training_data[i], training_labels[i]))

    
    for i in range(testing_data.shape[0]):
        testing_obj.append((testing_data[i], testing_labels[i]))

training_obj = np.asarray(training_obj)
testing_obj = np.asarray(testing_obj)

print(training_obj.shape)
print(testing_obj.shape)

print(training_obj[0])
print(testing_obj[0])

(65000, 2)
(5000, 2)
['honestly buffalo is the correct answer i remember people somewhat joking that buffalo s mantra for starting goalies was win a game get traded i think edmonton s front office was a travesty for the better part of num years but buffalo s systematic destruction of the term competitive was much more responsible for the change to the draft lottery '
 'hockey']
['lol hot or cold your choice isnt that what coffee drinkers do now run filter hot water over ground up coffee beans to get the flavor and caffine out then drink it my way just gets rid of the flavor and most of the water and still gives you the caffine fix buzz '
 'canada']


## Data Object

In [15]:
# disregard words not in corpus
def sentenceToVec(sentence, lut, mLength=SENTENCE_SIZE):
    vec = np.zeros((mLength), dtype=np.int)
    words = sentence.split()
    jdx = 0
    for idx, word in enumerate(words):
        if idx >= mLength:
            break
        try:
            vec[jdx] = lut[word]
            jdx += 1
        except:
            continue
            # null word out of set - keep 0
    return vec

In [16]:
# data loader fir train and test
class CommentData(Dataset):
    
    def __init__(self, comments, mapping, labels=labels):
        self.frames = comments
        self.labels = labels
        self.mapping = mapping

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        element, label = self.frames[idx]
        enc_element = sentenceToVec(element, lut=self.mapping)
        enc_label = self.encode(label)
        return (enc_element, enc_label)
    
    # one-hot encoding on element fetch
    def encode(self, label):
        location = self.labels.index(label)
        return location

# Models

In [17]:
# LSTM/GRU - tanh+softmax mixed attention
#https://arxiv.org/pdf/1703.03130.pdf
class SelfAttention(nn.Module):
    def __init__(self, num_layers, output_size, hidden_dim, weights, freeze, cell_style='LSTM'):
        super(SelfAttention, self).__init__()

        self.num_layers = num_layers
        self.style = cell_style
        self.hidden_dim = hidden_dim
        self.penalty = 0
        
        vocab_size, embedding_dim = weights_matrix.shape
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.load_state_dict({'weight': torch.Tensor(weights)})
        
        if freeze:
            self.word_embeddings.requires_grad = False
        
        if cell_style is 'LSTM':
            self.memory = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=num_layers)
        elif cell_style is 'GRU':
            self.memory = nn.GRU(embedding_dim, hidden_dim,  bidirectional=True, num_layers=num_layers)

        self.dropout = nn.Dropout(0.2)
        self.mixer1 = nn.Linear(2*hidden_dim, 124, bias=False)
        self.mixer2 = nn.Linear(124, 12, bias=False)
        self.linear = nn.Linear(12*2*hidden_dim, 1024)
        self.output = nn.Linear(1024, output_size)

    def selfAttention(self, mem_out):
        attention = self.mixer2(torch.tanh(self.mixer1(mem_out)))
        attention = attention.permute(0, 2, 1)
        attention = F.softmax(attention, dim=2)
        
        return attention

    def forward(self, sentences):

        embeds = self.word_embeddings(sentences)
        embeds = embeds.permute(1, 0, 2)
        
        if self.style is 'LSTM':
            state = (Variable(torch.zeros(2*self.num_layers, sentences.size(0), self.hidden_dim).cuda()),
                     Variable(torch.zeros(2*self.num_layers, sentences.size(0), self.hidden_dim).cuda()))
            output, (hidden_state, cell_state) = self.memory(embeds, state)
        
        elif self.style is 'GRU':
            state = Variable(torch.zeros(2*self.num_layers, sentences.size(0), self.hidden_dim).cuda())
            output, hidden_state = self.memory(embeds, state)
    
        output = output.permute(1, 0, 2)
        attention = self.selfAttention(output)
        hidden_state = torch.bmm(attention, output)

        scores = self.output(self.dropout(self.linear(hidden_state.view(-1, hidden_state.size()[1]*hidden_state.size()[2]))))
        
        # penalty factor
        AAT = torch.bmm(attention, attention.transpose(1,2)).cuda()
        I = (torch.eye(12).unsqueeze(0).repeat(sentences.size(0), 1, 1)).cuda()
        self.penalty = torch.norm(AAT - I) / sentences.size(0)
        
        return scores

In [18]:
net = SelfAttention(NUM_LAYERS, 20, HIDDEN_SIZE, weights_matrix, freeze=WEIGHT_FREEZE, cell_style=CELL_TYPE).to(device)
print(net)

SelfAttention(
  (word_embeddings): Embedding(61248, 300)
  (memory): LSTM(300, 128, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (mixer1): Linear(in_features=256, out_features=124, bias=False)
  (mixer2): Linear(in_features=124, out_features=12, bias=False)
  (linear): Linear(in_features=3072, out_features=1024, bias=True)
  (output): Linear(in_features=1024, out_features=20, bias=True)
)


# Optimizers

In [19]:
loss = nn.CrossEntropyLoss().to(device)
optimizer = optim.RMSprop(net.parameters(), lr=3e-4)

# Training Loop

In [20]:
epochs = 20

train = CommentData(training_obj, word_to_ix)
val = CommentData(testing_obj, word_to_ix)

train_loader = torch.utils.data.DataLoader(train, batch_size=256, num_workers=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val, batch_size=256, num_workers=8)
# train cycle here
for epoch in range(epochs):

    net.train()
    running_loss = 0.0
    correct = 0.
    total = 0.

    for i, (comment, target) in enumerate(train_loader):
        # tensor to device
        comment = comment.to(device=device, dtype=torch.int64)
        target = target.to(device=device, dtype=torch.int64)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = net(comment)
        error = loss(output, target) + 3*net.penalty
        error.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 0.3)
        optimizer.step()

        # print statistics
        running_loss += error.item()
        if i % 50 == 49:    # print every 50 mini-batches
            print('[%d, %5d] loss: %.5f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0

        # Get predictions
        preds = F.softmax(output, dim=1)
        preds_cls = preds.argmax(dim=1)

        # Count number of correct predictions
        correct_preds = torch.eq(preds_cls, target)
        correct += torch.sum(correct_preds).detach().cpu().item()
        total += len(correct_preds)

    train_acc = correct / total
    print("Epoch:", epoch+1,"Training Acc:",train_acc)

    net.eval()
    correct = 0.
    total = 0.

    for i, (comment, target) in enumerate(val_loader):

        comment = comment.to(device=device, dtype=torch.int64)
        target = target.to(device=device, dtype=torch.int64)
        output = net(comment)

        # Get predictions
        preds = F.softmax(output, dim=1)
        preds_cls = preds.argmax(dim=1)

        # Count number of correct predictions
        correct_preds = torch.eq(preds_cls, target)
        correct += torch.sum(correct_preds).detach().cpu().item()
        total += len(correct_preds)

    valid_acc = correct / total
    print("Epoch:", epoch+1,"Validation Acc:",valid_acc)

[1,    50] loss: 3.33828
[1,   100] loss: 2.95235
[1,   150] loss: 2.69866
[1,   200] loss: 2.60977
[1,   250] loss: 2.48757
Epoch: 1 Training Acc: 0.3329384615384615
Epoch: 1 Validation Acc: 0.4444
[2,    50] loss: 2.39592
[2,   100] loss: 2.33490
[2,   150] loss: 2.28044
[2,   200] loss: 2.27036
[2,   250] loss: 2.23764
Epoch: 2 Training Acc: 0.5012307692307693
Epoch: 2 Validation Acc: 0.505
[3,    50] loss: 2.13482
[3,   100] loss: 2.12726
[3,   150] loss: 2.12061
[3,   200] loss: 2.10979
[3,   250] loss: 2.08470
Epoch: 3 Training Acc: 0.5570923076923077
Epoch: 3 Validation Acc: 0.5294
[4,    50] loss: 1.97546
[4,   100] loss: 1.98080
[4,   150] loss: 1.97545
[4,   200] loss: 1.97389
[4,   250] loss: 1.95740
Epoch: 4 Training Acc: 0.5986923076923077
Epoch: 4 Validation Acc: 0.5432
[5,    50] loss: 1.84248
[5,   100] loss: 1.85083
[5,   150] loss: 1.83000
[5,   200] loss: 1.85713
[5,   250] loss: 1.86363
Epoch: 5 Training Acc: 0.6326615384615385
Epoch: 5 Validation Acc: 0.5506
[6,   

# K-FOLD TESTING SCRIPT
    - Adapted for flags best model - will not run if settings are changed

In [21]:
# leverages pandas for fast csv load but operates in numpy
class kFold():
    def __init__(self, data, numFolds=5):
        self.data = data
        self.numFolds = numFolds
        self.splits = []
        
    def generateSplits(self):
        #np.random.shuffle(self.data)
        
        folds = []
        splitPoint = self.data.shape[0] // (self.numFolds)  #breakpoint index jump
        
        for i in range(self.numFolds - 1):
            folds.append(self.data[i*splitPoint:(i+1)*splitPoint, :])
            
        folds.append(self.data[(i+1)*splitPoint:,:]) #get extra points in last batch
        
        # create split permutations 80/10/10
        foldDivisor = len(folds[0]) // 2
        for i in range(self.numFolds):
            train = []
            for k in range(self.numFolds):
                if i == k:
                    validation = folds[i][:foldDivisor] 
                    test = folds[i][foldDivisor:] 
                else:
                    train.append(folds[k])
            
            train = np.vstack(train) # adapt dims
            self.splits.append((train, validation, test))

In [22]:
def unpack(subset):
    data = []
    labels = []

    for x,y in subset:
        data.append(x)
        labels.append(y)

    data = np.array(data)
    labels = np.array(data)
    
    return (data, labels)

In [23]:
# leverages pandas for fast csv load but operates in numpy
class kFold():
    def __init__(self, data, numFolds=5):
        self.data = data
        self.numFolds = numFolds
        self.splits = []
        
    def generateSplits(self):
        np.random.shuffle(self.data)
        
        folds = []
        splitPoint = self.data.shape[0] // (self.numFolds)  #breakpoint index jump
        
        for i in range(self.numFolds - 1):
            folds.append(self.data[i*splitPoint:(i+1)*splitPoint, :])
            
        folds.append(self.data[(i+1)*splitPoint:,:]) #get extra points in last batch
        
        # create split permutations 80/10/10
        foldDivisor = len(folds[0]) // 2
        for i in range(self.numFolds):
            train = []
            for k in range(self.numFolds):
                if i == k:
                    validation = folds[i][:foldDivisor] 
                    test = folds[i][foldDivisor:] 
                else:
                    train.append(folds[k])
            
            train = np.vstack(train) # adapt dims
            self.splits.append((train, validation, test))

In [24]:
# gen tuples for splitting
kfold_data = []
for idx in range(clean_data.shape[0]):
    item = (clean_data[idx], clean_labels[idx])
    kfold_data.append(item)
kfold_data = np.asarray(kfold_data)

commentFolds = kFold(kfold_data)
commentFolds.generateSplits()
splits = commentFolds.splits

In [29]:
best_vals = [0,0,0,0,0]
best_test = [0,0,0,0,0]

for s, split in enumerate(splits):
    # unpacl split
    train, val, test = split
    
    # unpack training data for space generation
    training_data, _ = unpack(train)
    
    # tokenize and remove min words on "training set"
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, ngram_range=(1,1), min_df=WORD_CUTOFF)
    training_vec = tfidf_vectorizer.fit_transform(training_data)
    
    # revert back to word space - unordered tokens
    training_set = tfidf_vectorizer.inverse_transform(training_vec) 

    #generate vocab space
    items = []
    for comment in training_set:
        items.extend(comment)

    set_vocab = set(items)
    word_to_ix = {word: i+1 for i, word in enumerate(set_vocab)} # leave zero index for null

    # embed space
    matrix_len = len(set_vocab)+1 # allow set buffer
    glove = vocab.GloVe(name='6B', dim=GLOVE_SIZE)
    weights_matrix = np.zeros((matrix_len, GLOVE_SIZE), dtype=np.float32)    
    
    for i, word in enumerate(set_vocab):
        try: 
            weights_matrix[i+1] = glove[word]
        except KeyError:
            weights_matrix[i+1] = np.random.normal(scale=0.6, size=(GLOVE_SIZE, ))

    # clear GPU cache
    torch.cuda.empty_cache()
    
    # make model and optims
    net = SelfAttention(NUM_LAYERS, 20, HIDDEN_SIZE, weights_matrix, freeze=WEIGHT_FREEZE, cell_style=CELL_TYPE).to(device)
    loss = nn.CrossEntropyLoss().to(device)
    optimizer = optim.RMSprop(net.parameters(), lr=3e-4)
    
    epochs = 10
    
    # genrate loading obj    
    trainer = CommentData(train, word_to_ix)
    validator = CommentData(val, word_to_ix)
    tester = CommentData(test, word_to_ix)

    train_loader = torch.utils.data.DataLoader(trainer, batch_size=256, num_workers=8, shuffle=True)
    val_loader = torch.utils.data.DataLoader(validator, batch_size=128, num_workers=8)
    test_loader = torch.utils.data.DataLoader(tester, batch_size=128, num_workers=8)
    
    best_score = 0
    
    # train cycle
    for epoch in range(epochs):
        
        net.train()
        running_loss = 0.0
        correct = 0.
        total = 0.
        flag = False

        for i, (comment, target) in enumerate(train_loader):
            # tensor to device
            comment = comment.to(device=device, dtype=torch.int64)
            target = target.to(device=device, dtype=torch.int64)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            output = net(comment) + 3*net.penalty
            error = loss(output, target)
            error.backward()
            optimizer.step()

            # print statistics
            running_loss += error.item()
            if i % 5000 == 4999:    # print every 50 mini-batches
                print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / 5000))
                running_loss = 0.0

            # Get predictions
            preds = F.softmax(output, dim=1)
            preds_cls = preds.argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).detach().cpu().item()
            total += len(correct_preds)

        train_acc = correct / total
        print("Epoch:", epoch+1,"Training Acc:",train_acc)

        net.eval()
        correct = 0.
        total = 0.

        for i, (comment, target) in enumerate(val_loader):

            comment = comment.to(device=device, dtype=torch.int64)
            target = target.to(device=device, dtype=torch.int64)
            output = net(comment)

            # Get predictions
            preds = F.softmax(output, dim=1)
            preds_cls = preds.argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).detach().cpu().item()
            total += len(correct_preds)

        valid_acc = correct / total
        print("Epoch:", epoch+1,"Validation Acc:",valid_acc)
        
        #save best model
        if(valid_acc > best_score):
            flag = True
            best_score = valid_acc
            best_vals[s] = valid_acc
            torch.save(net.state_dict(), './net'+str(s)+'.pth.tar')
        
        net.eval()
        correct = 0.
        total = 0.

        for i, (comment, target) in enumerate(test_loader):

            comment = comment.to(device=device, dtype=torch.int64)
            target = target.to(device=device, dtype=torch.int64)
            output = net(comment)

            # Get predictions
            preds = F.softmax(output, dim=1)
            preds_cls = preds.argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).detach().cpu().item()
            total += len(correct_preds)

        test_acc = correct / total
        print("Epoch:", epoch+1,"Testing Acc:",test_acc)
        if flag:
            best_test[s] = test_acc

print("Finished Cross Val")

Epoch: 1 Training Acc: 0.32353571428571426
Epoch: 1 Validation Acc: 0.45071428571428573
Epoch: 1 Testing Acc: 0.44985714285714284
Epoch: 2 Training Acc: 0.4996607142857143
Epoch: 2 Validation Acc: 0.5064285714285715
Epoch: 2 Testing Acc: 0.4977142857142857
Epoch: 3 Training Acc: 0.555875
Epoch: 3 Validation Acc: 0.5331428571428571
Epoch: 3 Testing Acc: 0.527
Epoch: 4 Training Acc: 0.597625
Epoch: 4 Validation Acc: 0.5441428571428572
Epoch: 4 Testing Acc: 0.5358571428571428
Epoch: 5 Training Acc: 0.6340178571428572
Epoch: 5 Validation Acc: 0.5554285714285714
Epoch: 5 Testing Acc: 0.5415714285714286
Epoch: 6 Training Acc: 0.6719107142857143
Epoch: 6 Validation Acc: 0.5481428571428572
Epoch: 6 Testing Acc: 0.5417142857142857
Epoch: 7 Training Acc: 0.7092142857142857
Epoch: 7 Validation Acc: 0.5511428571428572
Epoch: 7 Testing Acc: 0.545
Epoch: 8 Training Acc: 0.7489107142857143
Epoch: 8 Validation Acc: 0.5455714285714286
Epoch: 8 Testing Acc: 0.5422857142857143
Epoch: 9 Training Acc: 0.78

In [30]:
print(np.max(best_vals))
print(np.mean(best_vals))
print(np.std(best_vals))
print(np.max(best_test))
print(np.mean(best_test))
print(np.std(best_test))

0.5598571428571428
0.5502857142857143
0.009407335392324063
0.5582857142857143
0.5456857142857142
0.0065205483680751955
