# Imports

In [2]:
# generic
import os

# Data management
import csv
import pandas as pd

#nlp
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import normalize
from nltk.stem import SnowballStemmer
from sklearn.utils import shuffle

#base classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectPercentile, f_classif


# Math and plots
import numpy as np
import random
import matplotlib.pyplot as plt

# Flags

In [3]:
# preprocessing
# Each flag switches one optional text-cleaning stage in the cleaning cells below.
REGEX = True  # regex-based cleanup that fills the 'prep' column
SPELL_CHECK = False  # SpellChecker(distance=1) correction of every token
STOP_WORDS = False  # drop NLTK English stop words
LEMMATIZE = False  # WordNet lemmatisation of every token
STEM = False  # Snowball (English) stemming of every token

# flag selection
# NOTE(review): PREDICT_TEST_SET is not read anywhere in the visible cells - confirm it is still needed.
PREDICT_TEST_SET = False

# Data Loader

In [4]:
# filepaths
# Relative locations of the training CSV and the test CSV that the loader cell reads.
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'

# global labels
# Subreddit names.
# NOTE(review): presumably every class present in the 'subreddits' column - confirm.
labels = ['hockey', 'nba', 'leagueoflegends', 'soccer', \
          'funny', 'movies', 'anime', 'Overwatch', 'trees', \
          'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones', \
          'conspiracy', 'worldnews', 'wow', 'europe', 'canada', \
          'Music', 'baseball']

## Cleaning and Preprocessing

In [5]:
# Load the training comments and shuffle the rows.
# A fixed seed keeps the row order (and anything derived from it) identical
# across "Restart Kernel -> Run All" executions.
SHUFFLE_SEED = 42
comment_data = pd.read_csv(train_data)
comment_data = shuffle(comment_data, random_state=SHUFFLE_SEED)

# Load the test comments; their order is left untouched.
test_data = pd.read_csv(test_path)

In [6]:
if REGEX:
    def _clean_comments(text_series):
        """Return a cleaned copy of a pandas Series of comment strings.

        Steps, in order: lower-case, replace URLs with the token 'wasurl',
        replace punctuation runs with a space, replace digit runs with the
        token 'num', and collapse whitespace runs to a single space.

        Every pattern is passed with ``regex=True`` explicitly: pandas >= 2.0
        treats the pattern as a literal string by default, which would turn
        all of these substitutions into no-ops.
        """
        cleaned = text_series.str.lower()
        # URLs are handled BEFORE punctuation is stripped; otherwise '://' and
        # '.' are already gone and the URL can no longer be delimited (the old
        # 'http.*' pattern then swallowed the rest of the line).
        cleaned = cleaned.str.replace(r'https?://\S+', ' wasurl ', regex=True)
        cleaned = cleaned.str.replace(r'[^\w\s]+', ' ', regex=True)
        cleaned = cleaned.str.replace(r'\d+', ' num ', regex=True)
        # A single whitespace collapse also covers runs of plain spaces.
        cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)
        return cleaned

    # Same cleaning for the training and the test comments.
    comment_data['prep'] = _clean_comments(comment_data['comments'])
    test_data['prep'] = _clean_comments(test_data['comments'])
else:
    # Later optional steps (spell check, stop words, ...) read the 'prep'
    # column, so it must exist even when regex cleaning is disabled.
    comment_data['prep'] = comment_data['comments']
    test_data['prep'] = test_data['comments']

In [7]:
# Tokenizer instance shared by the spell-check, lemmatise and stem helpers.
tt = TweetTokenizer()

In [8]:
if SPELL_CHECK:
    # spellcheck
    spell = SpellChecker(distance=1)
    def spellcheck_col(row):
        """Tokenize one comment and replace every token by its spelling correction."""
        # correction() can return None when no candidate is found (newer
        # pyspellchecker releases); fall back to the original token so that
        # join() never receives None.
        return " ".join([spell.correction(word) or word for word in tt.tokenize(row)])

    comment_data['prep'] = comment_data.prep.apply(spellcheck_col)
    test_data['prep'] = test_data.prep.apply(spellcheck_col)

if STOP_WORDS:
    # stopwords
    stop = stopwords.words('english')
    # Set membership is O(1) per word; scanning the list for every word is not.
    stop_lookup = set(stop)
    def remove_stopwords_col(row):
        """Drop every whitespace-separated word of one comment that is an English stop word."""
        return ' '.join([word for word in row.split() if word not in stop_lookup])

    comment_data['prep'] = comment_data.prep.apply(remove_stopwords_col)
    test_data['prep'] = test_data.prep.apply(remove_stopwords_col)

if LEMMATIZE:
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    def lemmatize_col(row):
        """Tokenize one comment and lemmatise every token with WordNet."""
        return " ".join([lemmatizer.lemmatize(w) for w in tt.tokenize(row)])

    comment_data['prep'] = comment_data.prep.apply(lemmatize_col)
    test_data['prep'] = test_data.prep.apply(lemmatize_col)

if STEM:
    # stemmer
    stemmer = SnowballStemmer('english')
    def stem_col(row):
        """Tokenize one comment and stem every token with the English Snowball stemmer."""
        return " ".join([stemmer.stem(word) for word in tt.tokenize(row)])

    comment_data['prep'] = comment_data.prep.apply(stem_col)
    test_data['prep'] = test_data.prep.apply(stem_col)


# 85/15 Split - Basic Testing

In [8]:
## NOTE: to train on the preprocessed text, read comment_data['prep'] (and test_data['prep']) instead of the raw 'comments' column in the cells below.

In [22]:
# Build the numpy arrays consumed by the vectoriser.
# The whole training set is used for fitting and the test file is what
# gets predicted. (A hold-out variant - first 60000 rows for training, the
# remainder for testing - is currently disabled.)
clean_data = comment_data['comments'].to_numpy()
clean_labels = comment_data['subreddits'].to_numpy()

training_data, training_labels = clean_data, clean_labels
testing_data = test_data['comments'].to_numpy()

# One shape per line: training data, testing data, training labels.
print(training_data.shape, testing_data.shape, training_labels.shape, sep='\n')

(70000,)
(30000,)
(70000,)


In [23]:
# Best-performing setup so far: TF-IDF with sublinear term frequency and
# unsmoothed IDF, fitted on the training text only.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    smooth_idf=False,
)
training_vec = tfidf.fit_transform(training_data)
testing_vec = tfidf.transform(testing_data)

# Shapes of the two sparse matrices, one per line.
print(training_vec.shape, testing_vec.shape, sep='\n')

(70000, 74265)
(30000, 74265)


In [24]:
# Keep the top 90% of features ranked by the ANOVA F-score; the selector is
# fitted on the training matrix and then applied to both matrices.
selector = SelectPercentile(score_func=f_classif, percentile=90).fit(training_vec, training_labels)
training_vec, testing_vec = (selector.transform(matrix) for matrix in (training_vec, testing_vec))

# Shapes after feature selection, one per line.
print(training_vec.shape, testing_vec.shape, sep='\n')

(70000, 66838)
(30000, 66838)


## Test

In [25]:
# Fit multinomial naive Bayes on the training matrix and predict the test set.
clf = MultinomialNB(alpha=0.1, fit_prior=True).fit(training_vec, training_labels)
results = clf.predict(testing_vec)
print(results.shape)

# Persist the predictions.
predictions_frame = pd.DataFrame(results)
predictions_frame.to_csv("results.csv")

# (A per-class / overall accuracy report used to live here; it needs
# `testing_labels`, which only exist when a labelled hold-out split is used.)

(30000,)


# K-Fold Cross-Validation Pipeline

In [26]:
# leverages pandas for fast csv load but operates in numpy
class kFold():
    """Build k train/validation/test partitions from a 2-D numpy array.

    The rows are shuffled and cut into ``numFolds`` contiguous folds (the last
    fold absorbs any remainder). For every fold ``i`` one split is produced:
    fold ``i`` is cut into a validation part and a test part, and all other
    folds are stacked to form the training part. With the default of 5 folds
    this yields an 80/10/10 split.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array, one sample per row. It is NOT modified.
    numFolds : int, default 5
        Number of folds; must be at least 2 and at most the number of rows.
    seed : int or None, default None
        When given, the shuffle uses its own ``RandomState(seed)`` and is
        reproducible; when None, numpy's global random state is used.

    Attributes
    ----------
    splits : list of (train, validation, test) tuples, filled by generateSplits().
    """
    def __init__(self, data, numFolds=5, seed=None):
        self.data = data
        self.numFolds = numFolds
        self.seed = seed
        self.splits = []

    def generateSplits(self):
        """(Re)compute ``self.splits``.

        Raises
        ------
        ValueError
            If ``numFolds`` is smaller than 2 or larger than the number of rows.
        """
        if self.numFolds < 2:
            # With a single fold there is nothing left to train on.
            raise ValueError("numFolds must be at least 2")

        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        # permutation() shuffles a copy, so the caller's array keeps its order.
        shuffled = rng.permutation(self.data)

        if shuffled.shape[0] < self.numFolds:
            raise ValueError("numFolds cannot exceed the number of samples")

        splitPoint = shuffled.shape[0] // self.numFolds  # rows per regular fold

        folds = [shuffled[k * splitPoint:(k + 1) * splitPoint, :]
                 for k in range(self.numFolds - 1)]
        # the last fold also receives the rows left over by the integer division
        folds.append(shuffled[(self.numFolds - 1) * splitPoint:, :])

        # Start from a clean list so repeated calls do not accumulate splits.
        self.splits = []

        # Every held-out fold is cut at half the size of a regular fold.
        foldDivisor = len(folds[0]) // 2
        for i in range(self.numFolds):
            validation = folds[i][:foldDivisor]
            test = folds[i][foldDivisor:]
            train = np.vstack([folds[k] for k in range(self.numFolds) if k != i])
            self.splits.append((train, validation, test))

In [27]:
# helper function: separates the (comment, label) pairs that were packed together for shuffling
def unpack(subset):
    """Split an iterable of 2-item pairs into two numpy arrays.

    Parameters
    ----------
    subset : iterable of (x, y) pairs, e.g. one part of a k-fold split.

    Returns
    -------
    (data, labels) : tuple of numpy.ndarray
        First items of every pair and second items of every pair, in order.
    """
    # Materialise once; the 2-target unpacking enforces exactly two items per entry.
    pairs = [(first, second) for first, second in subset]

    features = np.array([first for first, _ in pairs])
    targets = np.array([second for _, second in pairs])

    return (features, targets)

In [28]:
# Pull the raw text and the labels out of the DataFrame as numpy arrays.
clean_data = comment_data['comments'].to_numpy()
clean_labels = comment_data['subreddits'].to_numpy()

# Pair every comment with its label so both get shuffled together later;
# the result is a 2-column array (comment, label).
comments = np.asarray(list(zip(clean_data, clean_labels)))
print(comments.shape)

(70000, 2)


In [29]:
# Generate the cross-validation splits and inspect the first one.
commentFolds = kFold(comments)
commentFolds.generateSplits()
splits = commentFolds.splits

trainex, valex, testex = splits[0]
# Shapes of the train / validation / test parts, one per line.
print(trainex.shape, valex.shape, testex.shape, sep='\n')

(56000, 2)
(7000, 2)
(7000, 2)


In [35]:
def _accuracy(predicted, actual):
    """Return the fraction of entries in `predicted` equal to the matching entry of `actual`."""
    hits = int(np.sum(np.asarray(predicted) == np.asarray(actual)))
    return hits / len(actual)


for s, split in enumerate(splits):

    # unpack segments
    train, val, test = split

    # unpack data and labels
    training_data, training_labels = unpack(train)
    validation_data, validation_labels = unpack(val)
    testing_data, testing_labels = unpack(test)

    # fit the TF-IDF vocabulary on the training part only, then reuse it
    # unchanged for the validation and test parts
    tfidf = TfidfVectorizer(sublinear_tf=True, smooth_idf=False)
    training_vec = tfidf.fit_transform(training_data)
    validation_vec = tfidf.transform(validation_data)
    testing_vec = tfidf.transform(testing_data)

    # reduce feature space by percentile (selector fitted on training part only)
    selector = SelectPercentile(f_classif, percentile=90)
    training_vec = selector.fit_transform(training_vec, training_labels)
    validation_vec = selector.transform(validation_vec)
    testing_vec = selector.transform(testing_vec)

    # fit model on training set
    nb = MultinomialNB(alpha=0.1, fit_prior=True)
    nb.fit(training_vec, training_labels)

    # score the held-out parts; the comparison is vectorised instead of a
    # per-sample Python loop with an unused label-index lookup
    results = nb.predict(validation_vec)
    print("Fold ", s+1, " Validation accuracy is: " , _accuracy(results, validation_labels))

    results = nb.predict(testing_vec)
    print("Fold ", s+1, " Testing accuracy is: " , _accuracy(results, testing_labels))

Fold  1  Validation accuracy is:  0.5731428571428572
Fold  1  Testing accuracy is:  0.5652857142857143
Fold  2  Validation accuracy is:  0.5735714285714286
Fold  2  Testing accuracy is:  0.575
Fold  3  Validation accuracy is:  0.555
Fold  3  Testing accuracy is:  0.5805714285714285
Fold  4  Validation accuracy is:  0.5724285714285714
Fold  4  Testing accuracy is:  0.5752857142857143
Fold  5  Validation accuracy is:  0.5727142857142857
Fold  5  Testing accuracy is:  0.5637142857142857
