# Boi 

# Imports

In [1]:
# generic
import os

# Data management
import csv
import pandas as pd
import pickle
from torch.utils.data import Dataset

#nlp
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag_sents
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from pymfe.mfe import MFE

# Deep learning 
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

# Math and plots
import numpy as np
import random
import matplotlib.pyplot as plt

# GPU

In [2]:
# Prefer the GPU when one is available, otherwise fall back to the CPU so that
# every later `.to(device)` call still works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Flags

In [3]:
# Architecture switch: True selects the inception-style network, False the plain CNN.
NEW_MODEL = True
# Tag appended to the checkpoint file names written during training.
model_type = 'inception' if NEW_MODEL else 'vanilla'

TRAIN = True
# Compared against the fold index at the end of each fold of the training loop.
FOLD_USE = 1
# When True, the text cleaning / TF-IDF cells are skipped.
LOAD_FEATS = False

# Data Loader

In [4]:
# Locations of the raw CSV files.
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'

# Global label list; a label's position in this list is its class index.
labels = [
    'hockey', 'nba', 'leagueoflegends', 'soccer',
    'funny', 'movies', 'anime', 'Overwatch', 'trees',
    'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones',
    'conspiracy', 'worldnews', 'wow', 'europe', 'canada',
    'Music', 'baseball',
]

## Cleaning and Preprocessing

In [5]:
if not LOAD_FEATS:

    def _clean_comments(raw):
        """Normalise a Series of raw comment strings for vectorization.

        Steps, in order: strip punctuation, lower-case, replace digit runs with
        the token ``num``, replace everything from ``http`` to the end of the
        line with the token ``wasurl``, then collapse whitespace.

        ``regex=True`` is passed explicitly: since pandas 2.0 the default for
        ``Series.str.replace`` is a literal match, which would silently turn
        every pattern below into a no-op.
        """
        cleaned = raw.str.replace(r'[^\w\s]+', '', regex=True)
        cleaned = cleaned.str.lower()
        cleaned = cleaned.str.replace(r'(\d+)', ' num ', regex=True)
        cleaned = cleaned.str.replace(r'http(?<=http).*', ' wasurl ', regex=True)
        cleaned = cleaned.str.replace(r'\s+', " ", regex=True)
        cleaned = cleaned.str.replace(" +", " ", regex=True)
        return cleaned

    # load + clean the training comments
    comment_data = pd.read_csv(train_data)
    comment_data['prep'] = _clean_comments(comment_data['comments'])

    # load + clean the test comments (note: `test_data` is a DataFrame, `test_path` is the file path)
    test_data = pd.read_csv(test_path)
    test_data['prep'] = _clean_comments(test_data['comments'])

In [6]:
if not LOAD_FEATS:
    # Shared lemmatizer / tokenizer instances, reused for every comment.
    lemmatizer = WordNetLemmatizer()
    tt = TweetTokenizer()

    def lemmatize_col(row):
        """Tokenize one cleaned comment and return its lemmatized tokens joined by single spaces."""
        tokens = tt.tokenize(row)
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        return ' '.join(lemmas)

    comment_data['prep'] = comment_data['prep'].apply(lemmatize_col)
    test_data['prep'] = test_data['prep'].apply(lemmatize_col)

    # Train comments first, test comments second: the vectorizer is fitted on both together.
    full_set = pd.concat([comment_data['prep'], test_data['prep']])

    # TF-IDF over unigrams and bigrams
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=tt.tokenize,
        stop_words="english",
        ngram_range=(1, 2),
    )
    tfidf = tfidf_vectorizer.fit_transform(full_set)
    print(tfidf.shape)

In [20]:
# feature space reduction
if not LOAD_FEATS:
    # `tfidf` is only built when LOAD_FEATS is False, so the reduction is guarded the same way.
    # A fixed random_state makes the randomized SVD solver reproducible between runs.
    tsvd = TruncatedSVD(n_components=1000, random_state=0)
    reduced_features = tsvd.fit_transform(tfidf)
    print(reduced_features.shape)

(100000, 1000)


In [None]:
if not LOAD_FEATS:
    # Try progressively larger decompositions. `tsvd` / `reduced_features` end up
    # holding the last size that succeeded; a failed size is reported and skipped.
    for n_components in (2000, 3000, 4000, 5000):
        try:
            # feature space reduction
            candidate_svd = TruncatedSVD(n_components=n_components, random_state=0)
            candidate_features = candidate_svd.fit_transform(tfidf)
        except (MemoryError, ValueError) as exc:
            # Only the failures a too-large decomposition is expected to cause are
            # caught; KeyboardInterrupt and programming errors still propagate.
            print('TruncatedSVD with', n_components, 'components failed:', repr(exc))
        else:
            tsvd = candidate_svd
            reduced_features = candidate_features
            print(reduced_features.shape)

In [None]:
# Cache of the reduced feature matrix: written after a fresh computation,
# read back instead when LOAD_FEATS asks for precomputed features.
if LOAD_FEATS:
    reduced_features = np.load('./truncated.npy') # load
else:
    np.save('./truncated.npy', reduced_features) # save

In [19]:
# separation
train_stack = reduced_features[:70000]
test_stack = reduced_features[70000:]
print(train_stack.shape)
print(test_stack.shape)

(70000, 850)
(30000, 850)


In [None]:
#binning and adapting for pipeline
# Pair every training feature vector with its subreddit label in an (N, 2) object
# array: column 0 holds the vector, column 1 the label string. The object array is
# allocated explicitly because building it implicitly from ragged (vector, str)
# pairs raises ValueError on NumPy >= 1.24.
subreddit_labels = comment_data['subreddits'].to_numpy()
train_comments = np.empty((len(train_stack), 2), dtype=object)
for idx, vec in enumerate(train_stack):
    train_comments[idx, 0] = vec
    train_comments[idx, 1] = subreddit_labels[idx]
test_comments = test_stack

print(train_comments.shape)
print(test_comments.shape)

## Data Object

In [None]:
# dataset wrapper for labelled comments (train / validation / test folds)
class CommentData(Dataset):
    """Dataset of (feature vector, label) pairs that yields integer class indices."""

    def __init__(self, comments, labels=labels):
        self.frames, self.labels = comments, labels

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        features, subreddit = self.frames[idx]
        return (features, self.encode(subreddit))

    # label -> class index (position in self.labels), computed on element fetch
    def encode(self, label):
        return self.labels.index(label)

In [None]:
class TestData(Dataset):
    """Dataset over unlabelled items; each element is returned exactly as stored."""

    def __init__(self, test):
        # Indexable container of test items.
        self.frames = test

    def __len__(self):
        item_count = len(self.frames)
        return item_count

    def __getitem__(self, idx):
        item = self.frames[idx]
        return item

## K-Fold 

In [None]:

# leverages pandas for fast csv load but operates in numpy
class kFold():
    """Split a 2-D array into k contiguous folds and build train/validation/test triples.

    Args:
        data: 2-D array-like supporting ``.shape`` and ``[rows, :]`` slicing.
        numFolds: number of folds (must be at least 2 when splits are generated).

    Attributes:
        splits: list of ``(train, validation, test)`` tuples, one per fold,
            filled by :meth:`generateSplits`.
    """

    def __init__(self, data, numFolds=5):
        self.data = data
        self.numFolds = numFolds
        self.splits = []

    def generateSplits(self):
        """Populate ``self.splits`` with one ``(train, validation, test)`` tuple per fold.

        For fold ``i`` the held-out fold is cut in two: its first
        ``len(first fold) // 2`` rows are the validation set and the rest the
        test set; all other folds are stacked into the training set. With the
        default five folds this is the 80/10/10 split.

        Raises:
            ValueError: if ``numFolds`` is smaller than 2 (no training fold would remain).
        """
        if self.numFolds < 2:
            raise ValueError("numFolds must be at least 2 so that a training partition exists")

        #np.random.shuffle(self.data)

        # Start from an empty list so calling this method twice does not duplicate splits.
        self.splits = []

        splitPoint = self.data.shape[0] // self.numFolds  # breakpoint index jump

        folds = [
            self.data[k * splitPoint:(k + 1) * splitPoint, :]
            for k in range(self.numFolds - 1)
        ]
        # The last fold also takes the rows left over by the integer division.
        # Its start is computed directly instead of relying on a leaked loop variable.
        lastStart = (self.numFolds - 1) * splitPoint
        folds.append(self.data[lastStart:, :])

        # create split permutations 80/10/10
        foldDivisor = len(folds[0]) // 2
        for i, heldOut in enumerate(folds):
            validation = heldOut[:foldDivisor]
            test = heldOut[foldDivisor:]
            train = np.vstack([fold for k, fold in enumerate(folds) if k != i])  # adapt dims
            self.splits.append((train, validation, test))

## Model

In [None]:
def init_weights(m):
    """Xavier-uniform initialise the weight of ``m`` when it is exactly an ``nn.Conv1d``.

    Intended for ``Module.apply``; every other module type is left untouched.
    """
    if type(m) is not nn.Conv1d:
        return
    torch.nn.init.xavier_uniform_(m.weight)

In [None]:
#TODO: RECOMPUTE SIZE

class WhoReddit(nn.Module):
    """Three-stage multi-branch ("mini inception") 1-D CNN producing 20 class scores.

    Each stage runs three parallel Conv1d branches with kernel widths 3, 5 and 7
    (padding chosen so the length is preserved), each followed by batch norm,
    ReLU and a stride-3 max pool. The branch outputs are concatenated along the
    channel axis and fused by a further conv/norm/ReLU/pool block. The flattened
    result is mapped to 20 outputs by a single linear layer.
    """

    def __init__(self):
        super(WhoReddit, self).__init__()
        
        # mini inception net block 1: three branches on the single input channel
        self.convA1 = nn.Conv1d(1, 64, 3, padding = 1)
        self.normA1 = nn.BatchNorm1d(64)
        self.reluA1 = nn.ReLU(True)
        self.poolA1 = nn.MaxPool1d(3, 3)
        
        self.convB1 = nn.Conv1d(1, 64, 5, padding = 2)
        self.normB1 = nn.BatchNorm1d(64)
        self.reluB1 = nn.ReLU(True)
        self.poolB1 = nn.MaxPool1d(3, 3)
        
        self.convC1 = nn.Conv1d(1, 64, 7, padding = 3)
        self.normC1 = nn.BatchNorm1d(64)
        self.reluC1 = nn.ReLU(True)
        self.poolC1 = nn.MaxPool1d(3, 3)
        
        # fuses the 3*64 concatenated branch channels down to 96
        self.blend1 = nn.Sequential(
            nn.Conv1d(3*64, 96, 3, padding = 1),
            nn.BatchNorm1d(96),
            nn.ReLU(True),
            nn.MaxPool1d(3,3),
            nn.Dropout(0.5)
        )
        
        # mini inception net block 2
        self.convA2 = nn.Conv1d(96, 128, 3, padding = 1)
        self.normA2 = nn.BatchNorm1d(128)
        self.reluA2 = nn.ReLU(True)
        self.poolA2 = nn.MaxPool1d(3, 3)
        
        self.convB2 = nn.Conv1d(96, 128, 5, padding = 2)
        self.normB2 = nn.BatchNorm1d(128)
        self.reluB2 = nn.ReLU(True)
        self.poolB2 = nn.MaxPool1d(3, 3)
        
        self.convC2 = nn.Conv1d(96, 128, 7, padding = 3)
        self.normC2 = nn.BatchNorm1d(128)
        self.reluC2 = nn.ReLU(True)
        self.poolC2 = nn.MaxPool1d(3, 3)
    
        # fuses the 3*128 concatenated branch channels down to 196
        self.blend2 = nn.Sequential(
            nn.Conv1d(3*128, 196, 3, padding = 1),
            nn.BatchNorm1d(196),
            nn.ReLU(True),
            nn.MaxPool1d(3,3)
        )
        
        # mini inception net block 3
        self.convA3 = nn.Conv1d(196, 256, 3, padding = 1)
        self.normA3 = nn.BatchNorm1d(256)
        self.reluA3 = nn.ReLU(True)
        self.poolA3 = nn.MaxPool1d(3, 3)
        
        # block 3, kernel-width-5 branch
        self.convB3 = nn.Conv1d(196, 256, 5, padding = 2)
        self.normB3 = nn.BatchNorm1d(256)
        self.reluB3 = nn.ReLU(True)
        self.poolB3 = nn.MaxPool1d(3, 3)
        
        self.convC3 = nn.Conv1d(196, 256, 7, padding = 3)
        self.normC3 = nn.BatchNorm1d(256)
        self.reluC3 = nn.ReLU(True)
        self.poolC3 = nn.MaxPool1d(3, 3)
    
        # fuses the 3*256 concatenated branch channels down to 256
        self.merge = nn.Sequential(
            nn.Conv1d(3*256, 256, 3, padding = 1),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.MaxPool1d(3,3),
            nn.Dropout(0.2),
        )
        
        # 23808 = 256 channels * 93 positions: the flattened size is hard-coded, so the
        # input length must leave exactly 93 positions after the six stride-3 poolings.
        # NOTE(review): presumably sized for an earlier feature length - confirm against
        # the vectors actually fed to the network.
        self.linear = nn.Linear(23808, 20)
        # single dropout module (p=0.5) shared by every branch in forward()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the 20 raw class scores for ``x``.

        ``x`` must have one channel (the first convolutions have in_channels=1),
        i.e. shape (batch, 1, length).
        """
        
        # block 1: parallel branches -> channel-wise concat -> fuse
        A = self.dropout(self.poolA1(self.reluA1(self.normA1(self.convA1(x)))))
        B = self.dropout(self.poolB1(self.reluB1(self.normB1(self.convB1(x)))))
        C = self.dropout(self.poolC1(self.reluC1(self.normC1(self.convC1(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend1(x)
        
        # block 2
        A = self.dropout(self.poolA2(self.reluA2(self.normA2(self.convA2(x)))))
        B = self.dropout(self.poolB2(self.reluB2(self.normB2(self.convB2(x)))))
        C = self.dropout(self.poolC2(self.reluC2(self.normC2(self.convC2(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend2(x)
        
        # block 3
        A = self.dropout(self.poolA3(self.reluA3(self.normA3(self.convA3(x)))))
        B = self.dropout(self.poolB3(self.reluB3(self.normB3(self.convB3(x)))))
        C = self.dropout(self.poolC3(self.reluC3(self.normC3(self.convC3(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.merge(x)
        # flatten (channels, positions) into one feature axis per sample
        x = x.view(x.size(0), -1)
        x = self.linear(x)

        return x


In [None]:
class WhoRedditTwo(nn.Module):
    """Lighter three-stage multi-branch ("mini inception") 1-D CNN producing 20 class scores.

    Each stage runs three parallel Conv1d branches with kernel widths 3, 5 and 7
    (length-preserving padding), each followed by batch norm, ReLU and a stride-5
    max pool. The branch outputs are concatenated along the channel axis and fused
    by a conv/norm/ReLU/pool block. The flattened result goes through a
    two-layer linear head.
    """

    def __init__(self):
        super(WhoRedditTwo, self).__init__()
        
        # mini inception net block 1: three branches on the single input channel
        self.convA1 = nn.Conv1d(1, 32, 3, padding = 1)
        self.normA1 = nn.BatchNorm1d(32)
        self.reluA1 = nn.ReLU(True)
        self.poolA1 = nn.MaxPool1d(5, 5)
        
        self.convB1 = nn.Conv1d(1, 32, 5, padding = 2)
        self.normB1 = nn.BatchNorm1d(32)
        self.reluB1 = nn.ReLU(True)
        self.poolB1 = nn.MaxPool1d(5, 5)
        
        self.convC1 = nn.Conv1d(1, 32, 7, padding = 3)
        self.normC1 = nn.BatchNorm1d(32)
        self.reluC1 = nn.ReLU(True)
        self.poolC1 = nn.MaxPool1d(5, 5)
        
        # fuses the 3*32 concatenated branch channels down to 64
        self.blend1 = nn.Sequential(
            nn.Conv1d(3*32, 64, 3, padding = 1),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.MaxPool1d(5,5),
            nn.Dropout(0.5)
        )
        
        # mini inception net block 2
        self.convA2 = nn.Conv1d(64, 64, 3, padding = 1)
        self.normA2 = nn.BatchNorm1d(64)
        self.reluA2 = nn.ReLU(True)
        self.poolA2 = nn.MaxPool1d(5, 5)
        
        self.convB2 = nn.Conv1d(64, 64, 5, padding = 2)
        self.normB2 = nn.BatchNorm1d(64)
        self.reluB2 = nn.ReLU(True)
        self.poolB2 = nn.MaxPool1d(5, 5)
        
        self.convC2 = nn.Conv1d(64, 64, 7, padding = 3)
        self.normC2 = nn.BatchNorm1d(64)
        self.reluC2 = nn.ReLU(True)
        self.poolC2 = nn.MaxPool1d(5, 5)
    
        # fuses the 3*64 concatenated branch channels down to 96
        self.blend2 = nn.Sequential(
            nn.Conv1d(3*64, 96, 3, padding = 1),
            nn.BatchNorm1d(96),
            nn.ReLU(True),
            nn.MaxPool1d(8,8)
        )
        
        # mini inception net block 3
        self.convA3 = nn.Conv1d(96, 96, 3, padding = 1)
        self.normA3 = nn.BatchNorm1d(96)
        self.reluA3 = nn.ReLU(True)
        self.poolA3 = nn.MaxPool1d(5, 5)
        
        # block 3, kernel-width-5 branch
        self.convB3 = nn.Conv1d(96, 96, 5, padding = 2)
        self.normB3 = nn.BatchNorm1d(96)
        self.reluB3 = nn.ReLU(True)
        self.poolB3 = nn.MaxPool1d(5, 5)
        
        self.convC3 = nn.Conv1d(96, 96, 7, padding = 3)
        self.normC3 = nn.BatchNorm1d(96)
        self.reluC3 = nn.ReLU(True)
        self.poolC3 = nn.MaxPool1d(5, 5)
    
        # fuses the 3*96 concatenated branch channels down to 128
        self.merge = nn.Sequential(
            nn.Conv1d(3*96, 128, 3, padding = 1),
            nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.MaxPool1d(3,3),
            nn.Dropout(0.5),
        )
        
        # 79*128: the flattened size is hard-coded (128 merged channels * 79 positions),
        # so the input length must leave exactly 79 positions after all poolings.
        # NOTE(review): presumably sized for an earlier feature length - confirm against
        # the vectors actually fed to the network.
        self.linear = nn.Sequential(
            nn.Linear(79*128, 4096),
            nn.Dropout(0.3),
            nn.Linear(4096,20)
        )

        # Shared branch dropout used throughout forward(). It was missing, so the first
        # forward pass raised AttributeError. p=0.5 matches the sibling networks.
        # Dropout has no parameters, so existing checkpoints still load unchanged.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the 20 raw class scores for ``x``.

        ``x`` must have one channel (the first convolutions have in_channels=1),
        i.e. shape (batch, 1, length).
        """
        
        # block 1: parallel branches -> channel-wise concat -> fuse
        A = self.dropout(self.poolA1(self.reluA1(self.normA1(self.convA1(x)))))
        B = self.dropout(self.poolB1(self.reluB1(self.normB1(self.convB1(x)))))
        C = self.dropout(self.poolC1(self.reluC1(self.normC1(self.convC1(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend1(x)
        
        # block 2
        A = self.dropout(self.poolA2(self.reluA2(self.normA2(self.convA2(x)))))
        B = self.dropout(self.poolB2(self.reluB2(self.normB2(self.convB2(x)))))
        C = self.dropout(self.poolC2(self.reluC2(self.normC2(self.convC2(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.blend2(x)
        
        # block 3
        A = self.dropout(self.poolA3(self.reluA3(self.normA3(self.convA3(x)))))
        B = self.dropout(self.poolB3(self.reluB3(self.normB3(self.convB3(x)))))
        C = self.dropout(self.poolC3(self.reluC3(self.normC3(self.convC3(x)))))
        x = torch.cat((A,B,C), dim=1)
        x = self.merge(x)
        # flatten (channels, positions) into one feature axis per sample
        x = x.view(x.size(0), -1)
        x = self.linear(x)

        return x


In [None]:
#TODO: RECOMPUTE SIZES AND DOWNSAMPLING FOR NGRAM 1,2
class VanillaWhoReddit(nn.Module):
    """Plain sequential 1-D CNN producing 20 class scores.

    Five stages (A-E), each made of two kernel-width-3 convolutions with batch norm
    and ReLU followed by a stride-4 max pool. A final conv + ReLU follows, then
    two linear layers applied back to back.
    """

    def __init__(self):
        super(VanillaWhoReddit, self).__init__()
        
        # stage A: 1 -> 64 -> 64 channels
        self.convA1 = nn.Conv1d(1, 64, 3, padding = 1)
        self.normA1 = nn.BatchNorm1d(64)
        self.reluA1 = nn.ReLU(True)
        
        self.convA2 = nn.Conv1d(64, 64, 3, padding = 1)
        self.normA2 = nn.BatchNorm1d(64)
        self.reluA2 = nn.ReLU(True)
        
        self.poolA = nn.MaxPool1d(4, 4)
        
        # stage B: 64 -> 128 -> 128 channels
        self.convB1 = nn.Conv1d(64, 128, 3, padding = 1)
        self.normB1 = nn.BatchNorm1d(128)
        self.reluB1 = nn.ReLU(True)
        
        self.convB2 = nn.Conv1d(128, 128, 3, padding = 1)
        self.normB2 = nn.BatchNorm1d(128)
        self.reluB2 = nn.ReLU(True)
        
        self.poolB = nn.MaxPool1d(4, 4)
        
        # stage C: 128 -> 256 -> 256 channels
        self.convC1 = nn.Conv1d(128, 256, 3, padding = 1)
        self.normC1 = nn.BatchNorm1d(256)
        self.reluC1 = nn.ReLU(True)
        
        self.convC2 = nn.Conv1d(256, 256, 3, padding = 1)
        self.normC2 = nn.BatchNorm1d(256)
        self.reluC2 = nn.ReLU(True)
        
        self.poolC = nn.MaxPool1d(4, 4)
        
        # stage D: 256 -> 512 -> 512 channels
        self.convD1 = nn.Conv1d(256, 512, 3, padding = 1)
        self.normD1 = nn.BatchNorm1d(512)
        self.reluD1 = nn.ReLU(True)
        
        self.convD2 = nn.Conv1d(512, 512, 3, padding = 1)
        self.normD2 = nn.BatchNorm1d(512)
        self.reluD2 = nn.ReLU(True)
        
        self.poolD = nn.MaxPool1d(4, 4)
        
        # stage E: 512 -> 256 -> 256 channels
        self.convE1 = nn.Conv1d(512, 256, 3, padding = 1)
        self.normE1 = nn.BatchNorm1d(256)
        self.reluE1 = nn.ReLU(True)
        
        self.convE2 = nn.Conv1d(256, 256, 3, padding = 1)
        self.normE2 = nn.BatchNorm1d(256)
        self.reluE2 = nn.ReLU(True)
        
        self.poolE = nn.MaxPool1d(4, 4)
        
        # final channel reduction 256 -> 128 (no batch norm, no pooling)
        self.out = nn.Sequential(
            nn.Conv1d(256, 128, 3, padding = 1),
            nn.ReLU(True)
        )
        
        # 8448 = 128 channels * 66 positions: the flattened size is hard-coded, so the
        # input length must leave exactly 66 positions after the five stride-4 poolings.
        # NOTE(review): presumably sized for an earlier feature length - confirm against
        # the vectors actually fed to the network.
        self.fc1 = nn.Linear(8448, 2048)
        self.fc2 =  nn.Linear(2048, 20)
        # single dropout module (p=0.5) shared by every conv layer in forward()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        """Return the 20 raw class scores for ``x``.

        ``x`` must have one channel (convA1 has in_channels=1),
        i.e. shape (batch, 1, length).
        """
        
        # stage A
        A = self.dropout(self.reluA1(self.normA1(self.convA1(x))))
        A = self.dropout(self.reluA2(self.normA2(self.convA2(A))))
        x = self.poolA(A)
        
        # stage B
        B = self.dropout(self.reluB1(self.normB1(self.convB1(x))))
        B = self.dropout(self.reluB2(self.normB2(self.convB2(B))))
        x = self.poolB(B)
        
        # stage C
        C = self.dropout(self.reluC1(self.normC1(self.convC1(x))))
        C = self.dropout(self.reluC2(self.normC2(self.convC2(C))))
        x = self.poolC(C)
        
        # stage D
        D = self.dropout(self.reluD1(self.normD1(self.convD1(x))))
        D = self.dropout(self.reluD2(self.normD2(self.convD2(D))))
        x = self.poolD(D)
        
        # stage E
        E = self.dropout(self.reluE1(self.normE1(self.convE1(x))))
        E = self.dropout(self.reluE2(self.normE2(self.convE2(E))))
        x = self.poolE(E)
        
        x = self.out(x)
        
        # flatten (channels, positions) into one feature axis per sample
        x = x.view(x.size(0), -1)
        # fc1 feeds fc2 directly: no activation or dropout is applied in between
        x = self.fc2(self.fc1(x))

        return x


In [None]:
# Build the architecture selected by NEW_MODEL, then move it to the target device.
net = (WhoRedditTwo if NEW_MODEL else VanillaWhoReddit)()
net = net.to(device)

# Apply init_weights recursively to every submodule.
net.apply(init_weights)

print(net)

### Loss and Optimizer

In [None]:
# `.to(device)` is the device-placement API. `Module.type()` casts to a dtype and
# must not be given a torch.device.
loss = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(net.parameters(), lr=1e-4)

## Training

In [None]:
# Build the k-fold partitions of the labelled training data.
commentFolds = kFold(train_comments)
commentFolds.generateSplits()
splits = commentFolds.splits

# Sanity check: shapes of the first (train, validation, test) triple.
print(*(part.shape for part in splits[0]))

In [None]:
# Number of passes over the training loader per fold (bound of the `for epoch in range(epochs)` loop).
epochs = 15

In [None]:
def _accuracy(model, loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Puts the model in eval mode and disables gradient tracking, so no autograd
    graph is built while scoring.
    """
    model.eval()
    correct = 0.
    total = 0.
    with torch.no_grad():
        for comment, target in loader:
            comment = comment.to(device=device, dtype=torch.float32).unsqueeze(1)
            target = target.to(device=device, dtype=torch.int64)

            # argmax over the raw scores; softmax is monotonic so it is not needed here
            preds_cls = model(comment).argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).cpu().item()
            total += len(correct_preds)
    return correct / total


# NOTE: `net` and `optimizer` are not re-created per fold, so when more than one
# fold is run, later folds continue training the same weights.
for idx, split in enumerate(splits): # split
    
    train, val, test = split
    
    # Dataset obj
    train_set = CommentData(train)
    val_set = CommentData(val)
    test_set = CommentData(test)
    
    # Data loaders
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=1, num_workers=8, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=1, num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, num_workers=8)
    
    # train cycle here
    for epoch in range(epochs):
        
        net.train()
        running_loss = 0.0
        correct = 0.
        total = 0.
        
        for i, (comment, target) in enumerate(train_loader):
            
            # tensor to device; unsqueeze adds the single input channel expected by Conv1d
            comment = comment.to(device=device, dtype=torch.float32).unsqueeze(1)
            target = target.to(device=device, dtype=torch.int64)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            output = net(comment)
            error = loss(output, target)
            error.backward()
            optimizer.step()

            # print statistics
            running_loss += error.item()
            if i % 500 == 499:    # print every 500 mini-batches
                print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / 500))
                running_loss = 0.0

            # Get predictions (detached: only used for bookkeeping)
            preds_cls = output.detach().argmax(dim=1)

            # Count number of correct predictions
            correct_preds = torch.eq(preds_cls, target)
            correct += torch.sum(correct_preds).cpu().item()
            total += len(correct_preds)

        train_acc = correct / total
        print("Epoch:", epoch+1,"Training Acc:",train_acc)

        valid_acc = _accuracy(net, val_loader)
        print("Epoch:", epoch+1,"Validation Acc:",valid_acc)
        
        test_acc = _accuracy(net, test_loader)
        print("Epoch:", epoch+1,"Test Acc:",test_acc)
        
        # one checkpoint per epoch; the same file names are reused by every fold
        torch.save(net.state_dict(), './model'+ str(epoch) + model_type)

    print('Finished Training')

    # terminate cycle once FOLD_USE folds have been processed
    # (the previous `idx-1 >= FOLD_USE` ran two folds more than requested)
    if idx + 1 >= FOLD_USE:
        break

# Test

## Get Predictions

In [None]:
# Wrap the unlabelled test vectors and iterate over them one sample per batch.
tester = TestData(test_comments)
loader = torch.utils.data.DataLoader(
    tester,
    batch_size=1,
    num_workers=2,
)

In [None]:
# Predicted subreddit name for every test comment, in loader order.
test_labels = []

net.eval()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for comment in loader:

        # unsqueeze adds the single input channel expected by Conv1d
        comment = comment.to(device=device, dtype=torch.float32).unsqueeze(1)

        output = net(comment)
        index = output.argmax(dim=1)

        # One label per sample in the batch; works for any batch size, not only 1.
        test_labels.extend(labels[class_idx] for class_idx in index.tolist())

## Write to CSV

In [None]:
# Re-read the raw test CSV from its path. `test_data` is the cleaned DataFrame
# (and does not exist when LOAD_FEATS is True), so it cannot be passed to read_csv.
test_output = pd.read_csv(test_path)
test_output['subreddits'] = test_labels
test_output.to_csv('results.csv')

print(test_output)