# Text Preprocessing for Reddit Comment Classification

In [12]:
# basic
import numpy as np
import scipy
import pandas as pd
import nltk
import string
import csv
from spellchecker import SpellChecker
from collections import Counter

# natural language toolkit
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag_sents
from nltk.stem import WordNetLemmatizer

# SciKit-Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# meta-feature extraction
from pymfe.mfe import MFE

# import data
comment_data = pd.read_csv('../Data/reddit_train.csv')
test_data = pd.read_csv('../Data/reddit_test.csv')
print(comment_data.head())
print(test_data.head())

   id                                           comments       subreddits
0   0  Honestly, Buffalo is the correct answer. I rem...           hockey
1   1  Ah yes way could have been :( remember when he...              nba
2   2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...  leagueoflegends
3   3  He wouldn't have been a bad signing if we woul...           soccer
4   4  Easy. You use the piss and dry technique. Let ...            funny
   id                                           comments
0   0  Trout and Bryant have both led the league in s...
1   1  &gt; Just like Estonians have good reasons to ...
2   2  Will Sol_Primeval sotp being oblivious?\n\nfin...
3   3  Moving Ostwald borders back to the pre 1967 bo...
4   4         You have to take it out of the bag, Morty!


### Pre-Processing Functions
#### TF-IDF Vectorizer
Using only one vectorizer for all preprocessing calls means that each feature matrix has the same number of features (which is required for the inputs of the classifier algorithms to match).

In [2]:
def get_vectorizer(train_data, test_data):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, stop_words="english", ngram_range=(1,2), min_df=2)
    tfidf_vectorizer.fit(pd.concat([train_data['prep'], test_data['prep']]))
    return tfidf_vectorizer

#### Text Pre-Processing
Input: Pandas dataframe with a column called "comments" containing comments

Output: sparse matrix of features

In [3]:
# helper function for spelling correction
tt = TweetTokenizer()
spell = SpellChecker(distance=1)  # set distance to 1 instead of the default 2 to speed things up
def spellcheck_col(row):
    return " ".join([spell.correction(word) for word in tt.tokenize(row)])

# helper function for lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_col(row):
    return " ".join([lemmatizer.lemmatize(w) for w in tt.tokenize(row)])

# helper function for average word count
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

# stopwords for stopword count
stopwords = nltk.corpus.stopwords.words('english')

# object for meta-feature extraction
mfe = MFE(groups=["general", "statistical", "info-theory"], summary=["min", "median", "max"])

def text_cleanup(data):
    ##### CLEANUP OF INPUT DATA #####
    # punctuation removal
    data['prep'] = data['comments'].str.replace(r'[^\w\s]+', '')

    # lowercase
    data['prep'] = data['prep'].str.lower()

    # convert numbers to 'num'
    data['prep'] = data['prep'].str.replace('(\d+)', ' num ')

    # replace links with 'wasurl'
    data['prep'] = data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')

    # replace newlines and tabs with spaces
    data['prep'] = data['prep'].str.replace(r'\s+', " ")

    # fix any double spaces we created in the previous steps
    data['prep'] = data['prep'].str.replace(" +", " ")
    print("Superficial standardization complete")

    # typo correction
    data['prep'] = data.prep.apply(spellcheck_col)
    print("Typo correction complete")

    # lemmatization
    data['prep'] = data.prep.apply(lemmatize_col)
    print("Lemmatization complete")
    
    return data

def preprocess(inFrame):
    data = inFrame.copy(deep=True)
    
    ##### META-FEATURE EXTRACTION #####
    # word count
    data['word_count'] = data['comments'].apply(lambda x: len(str(x).split(" ")))
    wc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['word_count'].to_numpy()))

    # character count
    data['char_count'] = data['comments'].str.len()
    # cc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['char_count'].to_numpy()))
    # TODO fix issue that is including NaNs in the char_count list

    # average word length
    data['avg_word'] = data['comments'].apply(lambda x: avg_word(x))
    aw = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['avg_word'].to_numpy()))

    # stopword count (stopwords will be removed later)
    data['stop_count'] = data['comments'].apply(lambda x: len([x for x in x.split() if x in stopwords]))
    sc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['stop_count'].to_numpy()))

    # digit count
    data['digit_count'] = data['comments'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    dc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['digit_count'].to_numpy()))
    
    # mathematical meta-feature extraction
    # mfe.fit(comment_data['comments'].tolist(), comment_data['subreddits'].tolist())
    # ft = mfe.extract()
    
    
    ##### PART-OF-SPEECH TAGGING #####
    # data['pos_tag'] = pos_tag_sents(data['prep'].tolist())
    # TODO count totals of each part of speech (noun, adjective, etc) and use the counts as features
    
    
    ##### TF-IDF #####
    tfidf = tfidf_vectorizer.transform(data.prep)
    
    
    ##### FEATURE COMBINATION #####
    feature_matrix = scipy.sparse.hstack((tfidf, wc, aw, sc, dc))
    
    
    ##### FEATURE SELECTION #####
    # TODO if necessary, reduce the number of features by selecting the most informative ones
    
    return feature_matrix

### Training and Test Data Split

In [4]:
# clean up comments
comment_data = text_cleanup(comment_data)
print(comment_data.head())
test_data = text_cleanup(test_data)
print(test_data.head())

# get tfidf vectorizer
tfidf_vectorizer = get_vectorizer(comment_data, test_data)

# whole training set (for use when making predictions for competition submission)
full_matrix_train = preprocess(comment_data)
print("Full matrix shape:")
print(full_matrix_train.shape)
full_matrix_test = preprocess(test_data)
print("Test matrix shape:")
print(full_matrix_test.shape)

# split up training set (for use when evaluating model accuracies)
X_train = preprocess(comment_data.head(55000))
print("Training matrix shape:")
print(X_train.shape)
X_val = preprocess(comment_data.tail(15000))
print("Validation matrix shape:")
print(X_val.shape)
y_train = comment_data['subreddits'].head(55000)
y_val = comment_data['subreddits'].tail(15000)

Superficial standardization complete
Typo correction complete
Lemmatization complete
   id                                           comments       subreddits  \
0   0  Honestly, Buffalo is the correct answer. I rem...           hockey   
1   1  Ah yes way could have been :( remember when he...              nba   
2   2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...  leagueoflegends   
3   3  He wouldn't have been a bad signing if we woul...           soccer   
4   4  Easy. You use the piss and dry technique. Let ...            funny   

                                                prep  
0  honestly buffalo is the correct answer i remem...  
1  ah yes way could have been remember when he wa...  
2  wasurl if you dint find it already nothing out...  
3  he couldnt have been a bad signing if we could...  
4  easy you use the piss and dry technique let a ...  
Superficial standardization complete
Typo correction complete
Lemmatization complete
   id                                

### Models
SciKit-Learn implementations of decision tree and Multinomial Naive Bayes.

In [5]:
##### ACCURACY CHECK - TRAIN ON TRAINING SET, VALIDATE ON VALIDATION SET #####
d_tree_val = DecisionTreeClassifier().fit(X_train, y_train)
d_tree_score = d_tree_val.score(X_val, y_val)
print("Decision tree validation score: " + str(d_tree_score))

nb_val = MultinomialNB().fit(X_train, y_train)
nb_score = nb_val.score(X_val, y_val)
print("Naive Bayes validation score: " + str(nb_score))

rf_val = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)
rf_score = rf_val.score(X_val, y_val)
print("Random Forest validation score: " + str(rf_score))


##### PREDICTIONS - TRAINING ON FULL TRAINING SET, MAKE PREDICTIONS ON TEST SET #####
d_tree = DecisionTreeClassifier(random_state=0).fit(full_matrix_train, comment_data['subreddits'])
d_tree_predict = d_tree.predict(full_matrix_test)
print("Decision Tree test set predictions:")
print(d_tree_predict)

nb = MultinomialNB().fit(full_matrix_train, comment_data['subreddits'])
nb_predict = nb.predict(full_matrix_test)
print("Naive Bayes test set predictions:")
print(nb_predict)

rf = RandomForestClassifier().fit(full_matrix_train, comment_data['subreddits'])
rf_predict = rf.predict(full_matrix_test)
print("Random Forest test set predictions:")
print(rf_predict)


# check accuracy using same training set values (overfitting, this is just a quick-and-dirty check)
sanity_score = d_tree.score(full_matrix_train, comment_data['subreddits'])
print("Sanity check - decision tree accuracy on training set is: " + str(sanity_score))

Decision tree validation score: 0.2658
Naive Bayes validation score: 0.05753333333333333
Random Forest validation score: 0.42793333333333333
Decision Tree test set predictions:
['nfl' 'europe' 'gameofthrones' ... 'GlobalOffensive' 'gameofthrones'
 'wow']
Naive Bayes test set predictions:
['Music' 'Music' 'anime' ... 'Music' 'Music' 'Music']




Random Forest test set predictions:
['baseball' 'GlobalOffensive' 'AskReddit' ... 'Overwatch' 'gameofthrones'
 'wow']
Sanity check - decision tree accuracy on training set is: 0.9997285714285714


#### File Export
Dump the vectorizer, feature matrices, and dataframes to a pickle file in order for other group members to use them.

In [14]:
import pickle

# vectorizer
with open('../Data/vectorizer.pk', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# feature matrices (load on the other side with scipy.sparse.load_npz())
scipy.sparse.save_npz('feature_matrix_train.npz', full_matrix_train)
scipy.sparse.save_npz('feature_matrix_test.npz', full_matrix_test)

# dump predictions to file
with open('predictions.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    writer.writerow(d_tree_predict)
    writer.writerow(nb_predict)
    writer.writerow(rf_predict)

### Feature Selection
These are potential approaches to the Feature Selection section of the preprocessing section above.

In [7]:
# # perform PCA on the TF-IDF matrix
# from sklearn.decomposition import TruncatedSVD

# tsvd = TruncatedSVD(n_components=20)
# reduced_features = tsvd.fit_transform(tfidf)
# print(tsvd.explained_variance_ratio_)

In [8]:
# from sklearn.feature_selection import chi2

# N = 2
# for Product, category_id in sorted(category_to_id.items()):
#   features_chi2 = chi2(tfidf, labels == category_id)
#   indices = np.argsort(features_chi2[0])
#   feature_names = np.array(tfidf.get_feature_names())[indices]
#   unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
#   bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
#   print("# '{}':".format(Product))
#   print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
#   print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

### One-Hot Encoding of Classifiers
To be used for the models implemented from scratch.

In [9]:
# categories
labels = ['hockey', 'nba', 'leagueoflegends', 'soccer', \
          'funny', 'movies', 'anime', 'Overwatch' 'trees', \
          'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones', \
          'conspiracy', 'worldnews', 'wow', 'europe', 'canada', \
          'Music', 'baseball']


# default setup
full_reference = np.eye(len(labels))
partial_reference = np.append(np.zeros((1,len(labels)-1), dtype=np.int8),\
                              np.eye(len(labels)-1, dtype=np.int8), axis = 0)

# encoder
def encode(label, labels=labels, ref=full_reference):
    location = labels.index(label)
    return ref[location]


# shapes
print(full_reference.shape)
print(partial_reference.shape)

# example
print(encode('hockey'))

(19, 19)
(19, 18)
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
