# Imports

In [1]:
# generic
import csv
import os
import random

# Data management, math and plots
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#nlp
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from spellchecker import SpellChecker

# feature extraction / selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import normalize
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.utils import shuffle

#base classifier
from sklearn.naive_bayes import MultinomialNB

# Flags

In [2]:
# preprocessing
# Each flag switches one optional text-cleaning stage on or off; every stage
# reads and/or writes the 'prep' column of comment_data and test_data.
# Regex cleanup (punctuation, lower-casing, digits, URLs, whitespace); this is
# the stage that creates the 'prep' column.
REGEX = True
# Spell-correct every token with SpellChecker.
SPELL_CHECK = False
# Drop NLTK English stop words.
STOP_WORDS = False
# Lemmatize every token with WordNetLemmatizer.
LEMMATIZE = False
# Stem every token with the English SnowballStemmer.
STEM = False

# flag selection

# Data Loader

In [3]:
# filepaths (relative to the notebook's working directory)
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'

# global labels: 20 subreddit names
labels = [
    'hockey', 'nba', 'leagueoflegends', 'soccer',
    'funny', 'movies', 'anime', 'Overwatch', 'trees',
    'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones',
    'conspiracy', 'worldnews', 'wow', 'europe', 'canada',
    'Music', 'baseball',
]

## Cleaning and Preprocessing

In [4]:
# Load the training comments and shuffle the rows.
# A fixed random_state makes the shuffled order identical on every
# "Restart Kernel -> Run All", so downstream results are reproducible.
comment_data = pd.read_csv(train_data)
comment_data = shuffle(comment_data, random_state=42)

# Load the test comments; their row order is left untouched.
test_data = pd.read_csv(test_path)

In [5]:
def clean_comments(comments):
    """Return a regex-cleaned copy of a Series of comment strings.

    Steps, in order:
      1. lower-case the text;
      2. replace every ``http://`` / ``https://`` URL with `` wasurl ``;
      3. replace runs of characters that are neither word characters nor
         whitespace with a single space;
      4. replace every run of digits with `` num ``;
      5. collapse every run of whitespace into one space.

    URLs are handled *before* punctuation is stripped: once ``://`` and ``.``
    are gone a URL can no longer be delimited, and a greedy ``http.*`` would
    swallow all text up to the end of the line.

    ``regex=True`` is passed explicitly because ``Series.str.replace`` treats
    the pattern as a literal string by default in recent pandas releases.

    Parameters
    ----------
    comments : pandas.Series
        Series of strings (missing values are propagated by the ``.str``
        accessor).

    Returns
    -------
    pandas.Series
        The cleaned text, aligned with the input index.
    """
    return (
        comments.str.lower()
        .str.replace(r'https?://\S+', ' wasurl ', regex=True)
        .str.replace(r'[^\w\s]+', ' ', regex=True)
        .str.replace(r'\d+', ' num ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
    )


if REGEX:
    #clean
    comment_data['prep'] = clean_comments(comment_data['comments'])
    test_data['prep'] = clean_comments(test_data['comments'])
else:
    # Later optional stages read the 'prep' column, so make sure it exists
    # (holding the raw text) even when regex cleaning is switched off.
    comment_data['prep'] = comment_data['comments']
    test_data['prep'] = test_data['comments']

In [6]:
# Single NLTK TweetTokenizer instance, created once and shared by the
# spell-check, lemmatization and stemming helpers defined further down.
tt = TweetTokenizer()

In [7]:
if SPELL_CHECK:
    #spellcheck
    # distance=1 is passed as SpellChecker's edit-distance setting.
    spell = SpellChecker(distance=1)

    def spellcheck_col(row):
        """Tokenize ``row`` with ``tt`` and return the spell-corrected tokens joined by spaces.

        ``spell.correction`` can return ``None`` when it has no candidate for
        a token; in that case the original token is kept so that ``" ".join``
        never receives a non-string value.
        """
        return " ".join((spell.correction(word) or word) for word in tt.tokenize(row))

    comment_data['prep'] = comment_data.prep.apply(spellcheck_col)
    test_data['prep'] = test_data.prep.apply(spellcheck_col)

if STOP_WORDS:
    # stopwords
    stop = stopwords.words('english')
    # Membership is tested once per word of every comment, so look words up
    # in a hashed set rather than scanning the list each time.
    stop_lookup = frozenset(stop)

    def remove_stopwords(text):
        """Return ``text`` with whitespace-separated stop words removed, re-joined by single spaces."""
        return ' '.join(word for word in text.split() if word not in stop_lookup)

    comment_data['prep'] = comment_data.prep.apply(remove_stopwords)
    test_data['prep'] = test_data.prep.apply(remove_stopwords)

if LEMMATIZE:
    # lemmatization
    lemmatizer = WordNetLemmatizer()

    def lemmatize_col(row):
        """Tokenize ``row`` with ``tt`` and return the lemmatized tokens joined by spaces."""
        lemmas = map(lemmatizer.lemmatize, tt.tokenize(row))
        return " ".join(lemmas)

    # Apply to the training frame first, then to the test frame.
    comment_data['prep'] = comment_data['prep'].apply(lemmatize_col)
    test_data['prep'] = test_data['prep'].apply(lemmatize_col)

if STEM:
    # stemmer
    stemmer = SnowballStemmer('english')

    def stem_col(row):
        """Tokenize ``row`` with ``tt`` and return the stemmed tokens joined by spaces."""
        stems = []
        for token in tt.tokenize(row):
            stems.append(stemmer.stem(token))
        return " ".join(stems)

    # Apply to the training frame first, then to the test frame.
    comment_data['prep'] = comment_data['prep'].apply(stem_col)
    test_data['prep'] = test_data['prep'].apply(stem_col)


# 85/15 Split - Basic Testing

In [8]:
## NOTE: to train on the preprocessed text, read comment_data['prep'] and test_data['prep'] in the next cell instead of the raw 'comments' column.

In [9]:
#split data
clean_data = comment_data['comments'].to_numpy()
clean_labels = comment_data['subreddits'].to_numpy()
# training_data = clean_data[:60000]
# testing_data = clean_data[60000:]
# training_labels = clean_labels[:60000]
# testing_labels = clean_labels[60000:]

training_data = clean_data
training_labels = clean_labels
testing_data = test_data['comments'].to_numpy()

print(training_data.shape)
print(testing_data.shape)
print(training_labels.shape)

# print(training_data[0])

(70000,)
(30000,)
(70000,)


In [10]:
# # count vectors w/ norm
# counter = CountVectorizer(min_df=1, max_df=0.05, ngram_range=(1,1))
# training_counts = counter.fit_transform(training_data)
# testing_counts = counter.transform(testing_data)

# print(training_counts.shape)
# print(testing_counts.shape)

# # tokenize and remove min words on "training set"
# tfidf = TfidfTransformer()
# training_vec = tfidf.fit_transform(training_counts)
# testing_vec = tfidf.transform(testing_counts)

# print(training_vec.shape)
# print(testing_vec.shape)

In [11]:
# BEST
# TF-IDF features: the vectorizer is fitted on the training text only and the
# fitted vocabulary/weights are then reused to transform the test text.
tfidf = TfidfVectorizer(smooth_idf=False, sublinear_tf=True)
training_vec = tfidf.fit_transform(training_data)
testing_vec = tfidf.transform(testing_data)

# Report the shapes of the train and test feature matrices.
for matrix in (training_vec, testing_vec):
    print(matrix.shape)

(70000, 74265)
(30000, 74265)


In [21]:
# reduce feature space by percentile
# SelectPercentile / f_classif are imported in the notebook's import cell.
# The selector is fitted on the training matrix and labels only; the same
# column selection is then applied to the test matrix.
# NOTE: this cell overwrites training_vec / testing_vec, so running it more
# than once without re-running the TF-IDF cell shrinks the feature space again.
selector = SelectPercentile(f_classif, percentile=90)
training_vec = selector.fit_transform(training_vec, training_labels)
testing_vec = selector.transform(testing_vec)

print(training_vec.shape)
print(testing_vec.shape)

(70000, 66838)
(30000, 66838)


## Test

In [12]:
# Fit a multinomial naive Bayes classifier on the current training matrix
# and predict a label for every row of the test matrix.
clf = MultinomialNB(fit_prior=True, alpha=0.1)
clf.fit(training_vec, training_labels)
results = clf.predict(testing_vec)

print(results.shape)

# Write the predictions to disk as a one-column CSV (default index and header).
predictions_frame = pd.DataFrame(results)
predictions_frame.to_csv("results.csv")

# r = np.array([0 for i in range(20)])
# w = np.array([0 for i in range(20)])
# count = 0
# for idx, result in enumerate(results):
#     l = labels.index(testing_labels[idx])
#     w[l] += 1
#     if result == testing_labels[idx]:
#         count += 1
#         r[l] += 1
# print(r/w)
# print(count/len(results))        

(30000,)
