# Baseline Assumptions
    - All preprocessing is completed and has been stored in a .csv file
    - There exists no "bad" data such that an associated label is out of the labeled set
    - The vectors placed into the training set are of the same form as the test set
        - There exists no errors due to unseen words

# Imports

In [1]:
# generic
import os

# Data management
import csv
import pandas as pd
from torch.utils.data import Dataset

#nlp
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag_sents
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pymfe.mfe import MFE

# Deep learning 
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

# Math and plots
import numpy as np
import random
import matplotlib.pyplot as plt

# GPU

In [2]:
device = torch.device("cuda")

# Flags

In [3]:
NEW_MODEL = True
TRAIN = True
SPLIT_USE = 1

# Data Loader
    - Assumes data is preprossessed such that no transformation must be done on load
    - Does not load element as a tensor
    - Return a descriptor vector and an encoded vector

In [4]:
# filepaths
train_data = './Data/reddit_train.csv'
test_data = './Data/name_of_test.csv'

# gloabal labels
labels = ['hockey', 'nba', 'leagueoflegends', 'soccer', \
          'funny', 'movies', 'anime', 'Overwatch', 'trees', \
          'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones', \
          'conspiracy', 'worldnews', 'wow', 'europe', 'canada', \
          'Music', 'baseball']

## Cleaning and Preprocessing

In [5]:
#load
comment_data = pd.read_csv(train_data)

#clean
comment_data['prep'] = comment_data['comments'].str.replace(r'[^\w\s]+', '')
comment_data['prep'] = comment_data['prep'].str.lower()
comment_data['prep'] = comment_data['prep'].str.replace('(\d+)', ' num ')
comment_data['prep'] = comment_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')
comment_data['prep'] = comment_data['prep'].str.replace(r'\s+', " ")
comment_data['prep'] = comment_data['prep'].str.replace(" +", " ")

# lemmatization
lemmatizer = WordNetLemmatizer()
tt = TweetTokenizer()
def lemmatize_col(row):
    row = tt.tokenize(row)
    return ' '.join([lemmatizer.lemmatize(w) for w in row])
comment_data['prep'] = comment_data.prep.apply(lemmatize_col)

tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, stop_words="english", ngram_range=(1,1))
tfidf = tfidf_vectorizer.fit_transform(comment_data.prep)

In [6]:
#binning
comments = []
vector_idf = tfidf.toarray()
for idx, vec in enumerate(vector_idf):
    item = np.array((vec, comment_data['subreddits'][idx]))
    comments.append(item)
comments = np.asarray(comments)
print(comments[0][0].shape)

(68455,)


## Data Object

In [7]:
# data loader fir train and test
class CommentData(Dataset):
    
    def __init__(self, data, labels=labels):
        self.frames = comments
        self.labels = labels

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        element, label = self.frames[idx]
        enc_label = self.encode(label)
        return (element, enc_label)
    
    # one-hot encoding on element fetch
    def encode(self, label):
        location = self.labels.index(label)
        return location

## K-Fold 

In [8]:

# leverages pandas for fast csv load but operates in numpy
class kFold():
    def __init__(self, data, numFolds=5):
        self.data = data
        self.numFolds = numFolds
        self.splits = []
        
    def generateSplits(self):
        #np.random.shuffle(self.data)
        
        folds = []
        splitPoint = self.data.shape[0] // (self.numFolds)  #breakpoint index jump
        
        for i in range(self.numFolds - 1):
            folds.append(self.data[i*splitPoint:(i+1)*splitPoint, :])
            
        folds.append(self.data[(i+1)*splitPoint:,:]) #get extra points in last batch
        
        # create split permutations 80/10/10
        foldDivisor = len(folds) // 2
        for i in range(self.numFolds):
            train = []
            for k in range(self.numFolds):
                if i == k:
                    validation = folds[i][:foldDivisor] 
                    test = folds[i][foldDivisor:] 
                else:
                    train.append(folds[k])
            
            train = np.vstack(train) # adapt dims
            self.splits.append((train, validation, test))

## Model

In [9]:
class WhoReddit(nn.Module):

    def __init__(self):
        super(WhoReddit, self).__init__()
        
        # mini inception net block 1
        self.convA1 = nn.Conv1d(1, 64, 3, padding = 1)
        self.normA1 = nn.BatchNorm1d(64)
        self.reluA1 = nn.ReLU(True)
        self.poolA1 = nn.MaxPool1d(3, 3)
        
        self.convB1 = nn.Conv1d(1, 64, 5, padding = 2)
        self.normB1 = nn.BatchNorm1d(64)
        self.reluB1 = nn.ReLU(True)
        self.poolB1 = nn.MaxPool1d(3, 3)
        
        self.convC1 = nn.Conv1d(1, 64, 7, padding = 3)
        self.normC1 = nn.BatchNorm1d(64)
        self.reluC1 = nn.ReLU(True)
        self.poolC1 = nn.MaxPool1d(3, 3)
        
        self.blend1 = nn.Sequential(
            nn.Conv1d(3*64, 96, 3, padding = 1),
            nn.BatchNorm1d(96),
            nn.ReLU(True),
            nn.MaxPool1d(3,3),
            nn.Dropout(0.5)
        )
        
        # mini inception net block 2
        self.convA2 = nn.Conv1d(96, 128, 3, padding = 1)
        self.normA2 = nn.BatchNorm1d(128)
        self.reluA2 = nn.ReLU(True)
        self.poolA2 = nn.MaxPool1d(3, 3)
        
        self.convB2 = nn.Conv1d(96, 128, 5, padding = 2)
        self.normB2 = nn.BatchNorm1d(128)
        self.reluB2 = nn.ReLU(True)
        self.poolB2 = nn.MaxPool1d(3, 3)
        
        self.convC2 = nn.Conv1d(96, 128, 7, padding = 3)
        self.normC2 = nn.BatchNorm1d(128)
        self.reluC2 = nn.ReLU(True)
        self.poolC2 = nn.MaxPool1d(3, 3)
    
        self.blend2 = nn.Sequential(
            nn.Conv1d(3*128, 196, 3, padding = 1),
            nn.BatchNorm1d(196),
            nn.ReLU(True),
            nn.MaxPool1d(3,3)
        )
        
        # mini inception net block 3
        self.convA3 = nn.Conv1d(196, 256, 3, padding = 1)
        self.normA3 = nn.BatchNorm1d(256)
        self.reluA3 = nn.ReLU(True)
        self.poolA3 = nn.MaxPool1d(3, 3)
        
        # core modules
        self.convB3 = nn.Conv1d(196, 256, 5, padding = 2)
        self.normB3 = nn.BatchNorm1d(256)
        self.reluB3 = nn.ReLU(True)
        self.poolB3 = nn.MaxPool1d(3, 3)
        
        self.convC3 = nn.Conv1d(196, 256, 7, padding = 3)
        self.normC3 = nn.BatchNorm1d(256)
        self.reluC3 = nn.ReLU(True)
        self.poolC3 = nn.MaxPool1d(3, 3)
    
        self.merge = nn.Sequential(
            nn.Conv1d(3*256, 256, 3, padding = 1),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.MaxPool1d(3,3),
            nn.Dropout(0.2),
        )
        
        self.linear = nn.Linear(23808, 20)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        
        A = self.dropout(self.poolA1(self.reluA1(self.normA1(self.convA1(x)))))
        B = self.dropout(self.poolB1(self.reluB1(self.normB1(self.convB1(x)))))
        C = self.dropout(self.poolC1(self.reluC1(self.normC1(self.convC1(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend1(x)
        
        A = self.dropout(self.poolA2(self.reluA2(self.normA2(self.convA2(x)))))
        B = self.dropout(self.poolB2(self.reluB2(self.normB2(self.convB2(x)))))
        C = self.dropout(self.poolC2(self.reluC2(self.normC2(self.convC2(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend2(x)
        
        A = self.dropout(self.poolA3(self.reluA3(self.normA3(self.convA3(x)))))
        B = self.dropout(self.poolB3(self.reluB3(self.normB3(self.convB3(x)))))
        C = self.dropout(self.poolC3(self.reluC3(self.normC3(self.convC3(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.merge(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)

        return x


In [None]:
class VanillaWhoReddit(nn.Module):

    def __init__(self):
        super(VanillaWhoReddit, self).__init__()
        
        # mini inception net block 1
        self.convA1 = nn.Conv1d(1, 64, 3, padding = 1)
        self.normA1 = nn.BatchNorm1d(256)
        self.reluA1 = nn.ReLU(True)
        
        self.convA2 = nn.Conv1d(64, 64, 3, padding = 1)
        self.normA2 = nn.BatchNorm1d(64)
        self.reluA2 = nn.ReLU(True)
        
        self.convA3 = nn.Conv1d(64, 64, 3, padding = 1)
        self.normA3 = nn.BatchNorm1d(64)
        self.reluA3 = nn.ReLU(True)
        
        self.poolA = nn.MaxPool1d(4, 4)
        
        self.convB1 = nn.Conv1d(64, 128, 3, padding = 1)
        self.normB1 = nn.BatchNorm1d(128)
        self.reluB1 = nn.ReLU(True)
        
        self.convB2 = nn.Conv1d(128, 128, 3, padding = 1)
        self.normB2 = nn.BatchNorm1d(128)
        self.reluB2 = nn.ReLU(True)
        
        self.convB3 = nn.Conv1d(128, 128, 3, padding = 1)
        self.normB3 = nn.BatchNorm1d(128)
        self.reluB3 = nn.ReLU(True)
        
        self.poolB = nn.MaxPool1d(4, 4)
        
        self.convC1 = nn.Conv1d(128, 256, 3, padding = 1)
        self.normC1 = nn.BatchNorm1d(256)
        self.reluC1 = nn.ReLU(True)
        
        self.convC1 = nn.Conv1d(256, 256, 3, padding = 1)
        self.normC1 = nn.BatchNorm1d(256)
        self.reluC1 = nn.ReLU(True)
        
        self.convC1 = nn.Conv1d(256, 256, 3, padding = 1)
        self.normC1 = nn.BatchNorm1d(256)
        self.reluC1 = nn.ReLU(True)
        
        self.poolC = nn.MaxPool1d(4, 4)
        
        self.convD1 = nn.Conv1d(256, 512, 3, padding = 1)
        self.normD1 = nn.BatchNorm1d(512)
        self.reluD1 = nn.ReLU(True)
        
        self.convD2 = nn.Conv1d(512, 512, 3, padding = 1)
        self.normD2 = nn.BatchNorm1d(512)
        self.reluD2 = nn.ReLU(True)
        
        self.convD3 = nn.Conv1d(512, 512, 3, padding = 1)
        self.normD3 = nn.BatchNorm1d(512)
        self.reluD3 = nn.ReLU(True)
        
        self.poolD = nn.MaxPool1d(4, 4)
        
        self.convE1 = nn.Conv1d(256, 512, 3, padding = 1)
        self.normE1 = nn.BatchNorm1d(512)
        self.reluE1 = nn.ReLU(True)
        
        self.convE2 = nn.Conv1d(512, 512, 3, padding = 1)
        self.normE2 = nn.BatchNorm1d(512)
        self.reluE2 = nn.ReLU(True)
        
        self.convE3 = nn.Conv1d(512, 512, 3, padding = 1)
        self.normE3 = nn.BatchNorm1d(512)
        self.reluE3 = nn.ReLU(True)
        
        self.poolE = nn.MaxPool1d(4, 4)
        
        self.linear = nn.Linear(23808, 20)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        
        A = self.dropout(self.poolA1(self.reluA1(self.normA1(self.convA1(x)))))
        B = self.dropout(self.poolB1(self.reluB1(self.normB1(self.convB1(x)))))
        C = self.dropout(self.poolC1(self.reluC1(self.normC1(self.convC1(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend1(x)
        
        A = self.dropout(self.poolA2(self.reluA2(self.normA2(self.convA2(x)))))
        B = self.dropout(self.poolB2(self.reluB2(self.normB2(self.convB2(x)))))
        C = self.dropout(self.poolC2(self.reluC2(self.normC2(self.convC2(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend2(x)
        
        A = self.dropout(self.poolA3(self.reluA3(self.normA3(self.convA3(x)))))
        B = self.dropout(self.poolB3(self.reluB3(self.normB3(self.convB3(x)))))
        C = self.dropout(self.poolC3(self.reluC3(self.normC3(self.convC3(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.merge(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)

        return x


In [10]:
if NEW_MODEL:
    net = WhoReddit().to(device)
else:
    print('save a model') #todo: load network

print(net)

WhoReddit(
  (convA1): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (normA1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (reluA1): ReLU(inplace=True)
  (poolA1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (convB1): Conv1d(1, 64, kernel_size=(5,), stride=(1,), padding=(2,))
  (normB1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (reluB1): ReLU(inplace=True)
  (poolB1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (convC1): Conv1d(1, 64, kernel_size=(7,), stride=(1,), padding=(3,))
  (normC1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (reluC1): ReLU(inplace=True)
  (poolC1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (blend1): Sequential(
    (0): Conv1d(192, 96, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(96, eps=1e-05, momentum=0.1, af

### Loss and Optimizer

In [11]:
loss = nn.CrossEntropyLoss().type(device)
optimizer = optim.Adam(net.parameters(), lr=1e-3)

## Training

In [12]:
# split init
commentFolds = kFold(comments) 
commentFolds.generateSplits()
splits = commentFolds.splits

In [13]:
epochs = 5

In [14]:
for idx, split in enumerate(splits): # split
    
    train, val, test = split
    
    # Dataset obj
    train_set = CommentData(train)
    val_set = CommentData(val)
    test_set = CommentData(test)
    
    # Data loaders
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=24, num_workers=8)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=4, num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=4, num_workers=8)
    
    # train cycle here
    for epoch in range(epochs):
        
        net.train()
        running_loss = 0.0
        correct = 0.
        total = 0.
        
        for i, (comment, target) in enumerate(train_loader):
            
            # tensor to device
            comment = comment.to(device=device, dtype=torch.float32).unsqueeze(1)
            target = target.to(device=device, dtype=torch.int64)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            output = net(comment)
            error = loss(output, target)
            error.backward()
            optimizer.step()

            # print statistics
            running_loss += error.item()
            if i % 500 == 499:    # print every 50 mini-batches
                print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / 500))
                running_loss = 0.0

            # Get predictions
            preds = F.softmax(output, dim=1)
            preds_cls = preds.argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).cpu().item()
            total += len(correct_preds)

        train_acc = correct / total
        print("Epoch:", epoch+1,"Training Acc:",train_acc)

        net.eval()
        correct = 0.
        total = 0.
        
        for i, (comment, target) in enumerate(val_loader):

            comment = comment.to(device=device, dtype=torch.float32).unsqueeze(1)
            target = target.to(device=device, dtype=torch.int64)
            output = net(comment)

            # Get predictions
            preds = F.softmax(output, dim=1)
            preds_cls = preds.argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).cpu().item()
            total += len(correct_preds)

        valid_acc = correct / total
        print("Epoch:", epoch+1,"Validation Acc:",valid_acc)
        
        for i, (comment, target) in enumerate(test_loader):

            comment = comment.to(device=device, dtype=torch.float32).unsqueeze(1)
            target = target.to(device=device, dtype=torch.int64)
            output = net(comment)

            # Get predictions
            preds = F.softmax(output, dim=1)
            preds_cls = preds.argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).cpu().item()
            total += len(correct_preds)

        test_acc = correct / total
        print("Epoch:", epoch+1,"Test Acc:",test_acc)
        

    print('Finished Training')

    # terminate cycle
    if idx-1 >= FOLD_USE:
        break

[1,   500] loss: 3.15315
[1,  1000] loss: 2.58207
[1,  1500] loss: 2.43092
[1,  2000] loss: 2.33090
Epoch: 1 Training Acc: 0.2467


RuntimeError: CUDA out of memory. Tried to allocate 536.00 MiB (GPU 0; 11.17 GiB total capacity; 10.15 GiB already allocated; 499.31 MiB free; 199.14 MiB cached)