# Text Preprocessing for Reddit Comment Classification

Outstanding TODOs:
* incorporate meta-feature and POS tags into feature list
* run tfidf with more computing power so that n-grams can be increased to 2 or 3

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag_sents
from nltk.stem import WordNetLemmatizer
import string
from spellchecker import SpellChecker

comment_data = pd.read_csv('../Data/reddit_train.csv')
test_data = pd.read_csv('../Data/reddit_test.csv')
print(comment_data)
print(test_data)

          id                                           comments  \
0          0  Honestly, Buffalo is the correct answer. I rem...   
1          1  Ah yes way could have been :( remember when he...   
2          2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...   
3          3  He wouldn't have been a bad signing if we woul...   
4          4  Easy. You use the piss and dry technique. Let ...   
...      ...                                                ...   
69995  69995  Thank you, you confirm Spain does have nice pe...   
69996  69996  Imagine how many he would have killed with a r...   
69997  69997  Yes. Only. As in the guy I was replying to was...   
69998  69998  Looking for something light-hearted or has a v...   
69999  69999  I love how I never cry about casters because I...   

            subreddits  
0               hockey  
1                  nba  
2      leagueoflegends  
3               soccer  
4                funny  
...                ...  
69995           euro

### Cleanup of Input Data
Standardize the text snippets of the comments, preparing the comments for further analysis.

In [2]:
# punctuation removal
comment_data['prep'] = comment_data['comments'].str.replace(r'[^\w\s]+', '')
test_data['prep'] = test_data['comments'].str.replace(r'[^\w\s]+', '')

# lowercase
comment_data['prep'] = comment_data['prep'].str.lower()
test_data['prep'] = test_data['prep'].str.lower()

# convert numbers to 'num'
comment_data['prep'] = comment_data['prep'].str.replace('(\d+)', ' num ')
test_data['prep'] = test_data['prep'].str.replace('(\d+)', ' num ')

# replace links with 'wasurl'
comment_data['prep'] = comment_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')
test_data['prep'] = test_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')

# replace newlines and tabs with spaces
comment_data['prep'] = comment_data['prep'].str.replace(r'\s+', " ")
test_data['prep'] = test_data['prep'].str.replace(r'\s+', " ")

# fix any double spaces we created in the previous steps
comment_data['prep'] = comment_data['prep'].str.replace(" +", " ")
test_data['prep'] = test_data['prep'].str.replace(" +", " ")

# # typo correction (commented out because it takes too long)
# spell = SpellChecker()
# def spellcheck_col(row):
#     return [spell.correction(word) for word in row]
# comment_data['prep'] = comment_data.prep.apply(spellcheck_col)
# test_data['prep'] = test_data.prep.apply(spellcheck_col)

# lemmatization
lemmatizer = WordNetLemmatizer()
tt = TweetTokenizer()
def lemmatize_col(row):
    row = tt.tokenize(row)
    return ' '.join([lemmatizer.lemmatize(w) for w in row])
comment_data['prep'] = comment_data.prep.apply(lemmatize_col)
test_data['prep'] = test_data.prep.apply(lemmatize_col)

print(comment_data)
print(test_data)

          id                                           comments  \
0          0  Honestly, Buffalo is the correct answer. I rem...   
1          1  Ah yes way could have been :( remember when he...   
2          2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...   
3          3  He wouldn't have been a bad signing if we woul...   
4          4  Easy. You use the piss and dry technique. Let ...   
...      ...                                                ...   
69995  69995  Thank you, you confirm Spain does have nice pe...   
69996  69996  Imagine how many he would have killed with a r...   
69997  69997  Yes. Only. As in the guy I was replying to was...   
69998  69998  Looking for something light-hearted or has a v...   
69999  69999  I love how I never cry about casters because I...   

            subreddits                                               prep  
0               hockey  honestly buffalo is the correct answer i remem...  
1                  nba  ah yes way could ha

### Part-of-Speech Count, TF-IDF (with stopword removal), and Meta-Feature Extraction
Use POS tagging, TF-IDF, and meta-feature extraction to generate feature vectors.

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pymfe.mfe import MFE

# tag comments
# comment_data['comments_tagged'] = pos_tag_sents(comment_data['prep'].tolist())
# test_data['comments_tagged'] = pos_tag_sents(test_data['prep'].tolist())
# TODO count totals of each part of speech (noun, adjective, etc) and use the counts as features


# in the bag of words matrix, remove punctuation and stopwords
# count_vectorizer = CountVectorizer(tokenizer=tt.tokenize, stop_words="english", ngram_range=(1,3))
# counts = count_vectorizer.fit_transform(comment_data.comments)
# print("raw word count matrix size: " + str(counts.shape))
# count_t_vectorizer = CountVectorizer(tokenizer=tt.tokenize, stop_words="english", ngram_range=(1,3))
# counts_t = count_t_vectorizer.fit_transform(test_data.comments)
# print("raw word count matrix size: " + str(counts_t.shape))


# TF-IDF, also removing punctuation and stopwords
tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, stop_words="english", ngram_range=(1,1))
tfidf = tfidf_vectorizer.fit_transform(comment_data.prep)
print("TF-IDF matrix size: " + str(tfidf.shape))
tfidf_t = tfidf_vectorizer.transform(test_data.prep)
print("TF-IDF_t matrix size: " + str(tfidf_t.shape))


# meta-feature extraction
# comment_data['word_count'] = comment_data['comments'].apply(lambda x: len(str(x).split(" ")))
# test_data['word_count'] = test_data['comments'].apply(lambda x: len(str(x).split(" ")))
# comment_data['char_count'] = comment_data['comments'].str.len()
# test_data['char_count'] = test_data['comments'].str.len()

# np.sparse.hstack((tfidf,comment_data['word_count'].to_numpy())[:,None]).A
# np.sparse.hstack((tfidf,comment_data['char_count'].to_numpy())[:,None]).A
# np.sparse.hstack((tfidf_t,test_data['word_count'].to_numpy())[:,None]).A
# np.sparse.hstack((tfidf_t,test_data['char_count'].to_numpy())[:,None]).A

# mfe = MFE(groups=["general", "statistical", "info-theory"], summary=["min", "median", "max"])
# # TODO remove .head(1000) (currently included because memory limit doesn't allow computation of full list)
# mfe.fit(comment_data['comments'].head(1000).tolist(), comment_data['subreddits'].head(1000).tolist())
# ft = mfe.extract()
# print(ft)

TF-IDF matrix size: (70000, 68455)


#### File Export
Dump the vectorizer, feature matrices, and dataframes to a pickle file in order for other group members to use them.

In [4]:
import pickle

# vectorizer
with open('../Data/vectorizer.pk', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# feature matrices (load on the other side with scipy.sparse.load_npz())
scipy.sparse.save_npz('../Data/feature_matrix_train.npz', tfidf)
scipy.sparse.save_npz('../Data/feature_matrix_test.npz', tfidf_t)

# dataframes of comment data
np.save('../Data/cleaned_data_train', comment_data.to_numpy())
np.save('../Data/cleaned_data_test', test_data.to_numpy())

NameError: name 'tfidf_vectorizer' is not defined

### Feature Selection
Pick the important information out of the large matrix generated above. Only necessary if the number of features is so high that the models don't have enough computation power to train, OR when some features are providing poor information and throwing off results.

In [None]:
# # perform PCA on the TF-IDF matrix
# from sklearn.decomposition import TruncatedSVD

# tsvd = TruncatedSVD(n_components=20)
# reduced_features = tsvd.fit_transform(tfidf)
# print(tsvd.explained_variance_ratio_)

In [None]:
# from sklearn.feature_selection import chi2

# N = 2
# for Product, category_id in sorted(category_to_id.items()):
#   features_chi2 = chi2(tfidf, labels == category_id)
#   indices = np.argsort(features_chi2[0])
#   feature_names = np.array(tfidf.get_feature_names())[indices]
#   unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
#   bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
#   print("# '{}':".format(Product))
#   print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
#   print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

### Models
SciKit-Learn implementations of decision tree and Multinomial Naive Bayes.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# decision tree on test set
d_tree = DecisionTreeClassifier(random_state=0).fit(tfidf, comment_data['subreddits'])
d_tree_predict = d_tree.predict(tfidf_t)
print("Decision Tree test set predictions:")
print(d_tree_predict)

# niave bayes on test set
nb = MultinomialNB().fit(tfidf, comment_data['subreddits'])
nb_predict = nb.predict(tfidf_t)
print("Naive Bayes test set predictions:")
print(nb_predict)

# check accuracy using same training set values (overfitting, this is just a quick-and-dirty check)
sanity_score = d_tree.score(tfidf, comment_data['subreddits'])
print("Sanity check - decision tree accuracy on training set is: " + str(sanity_score))

# make a validation set to properly check accuracy
X_train, X_val, y_train, y_val = train_test_split(comment_data['comments'], comment_data['subreddits'], random_state = 0)
x_train_features = tfidf_vectorizer.transform(X_train)
x_val_features = tfidf_vectorizer.transform(X_val)
d_tree_val = DecisionTreeClassifier(random_state=0).fit(x_train_features, y_train)
d_tree_score = d_tree_val.score(x_val_features, y_val)
print("Decision tree validation score: " + str(d_tree_score))
nb_val = MultinomialNB().fit(x_train_features, y_train)
nb_score = nb_val.score(x_val_features, y_val)
print("Naive Bayes validation score: " + str(nb_score))

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_val = RandomForestClassifier(n_estimators=10).fit(x_train_features, y_train)
rf_score = rf_val.score(x_val_features, y_val)
print("Random Forest validation score: " + str(rf_score))

### One-Hot Encoding of Classifiers
To be used for the models implemented from scratch.

In [None]:
# categories
labels = ['hockey', 'nba', 'leagueoflegends', 'soccer', \
          'funny', 'movies', 'anime', 'Overwatch' 'trees', \
          'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones', \
          'conspiracy', 'worldnews', 'wow', 'europe', 'canada', \
          'Music', 'baseball']


# default setup
full_reference = np.eye(len(labels))
partial_reference = np.append(np.zeros((1,len(labels)-1), dtype=np.int8),\
                              np.eye(len(labels)-1, dtype=np.int8), axis = 0)

# encoder
def encode(label, labels=labels, ref=full_reference):
    location = labels.index(label)
    return ref[location]


# shapes
print(full_reference.shape)
print(partial_reference.shape)

# example
print(encode('hockey'))