# Text Preprocessing for Reddit Comment Classification

In [93]:
# basic
import numpy as np
import scipy
import pandas as pd
import nltk
import string
import csv
import pickle
from spellchecker import SpellChecker
from collections import Counter

# natural language toolkit
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

# SciKit-Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

# meta-feature extraction
from pymfe.mfe import MFE

# import data
comment_data = pd.read_csv('../Data/reddit_train.csv')
test_data = pd.read_csv('../Data/reddit_test.csv')
print(comment_data.head())
print(test_data.head())

   id                                           comments       subreddits
0   0  Honestly, Buffalo is the correct answer. I rem...           hockey
1   1  Ah yes way could have been :( remember when he...              nba
2   2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...  leagueoflegends
3   3  He wouldn't have been a bad signing if we woul...           soccer
4   4  Easy. You use the piss and dry technique. Let ...            funny
   id                                           comments
0   0  Trout and Bryant have both led the league in s...
1   1  &gt; Just like Estonians have good reasons to ...
2   2  Will Sol_Primeval sotp being oblivious?\n\nfin...
3   3  Moving Ostwald borders back to the pre 1967 bo...
4   4         You have to take it out of the bag, Morty!


### Pre-Processing Functions
#### TF-IDF Vectorizer and Feature Selector
Using only one vectorizer and selector for all preprocessing calls means that each feature matrix has the same number of features (which is required for the inputs of the classifier algorithms to match).

In [87]:
def get_vectorizer(train_data, test_data):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, stop_words="english", ngram_range=(1,2), min_df=2)
    return tfidf_vectorizer.fit(pd.concat([train_data['prep'], test_data['prep']]))

def get_selector(feature_map, classifiers):
    return SelectPercentile(chi2, percentile=25).fit(feature_map, classifiers)

#### Text Pre-Processing
Input: Pandas dataframe with a column called "comments" containing comments and a column called "pos_tag" with the words of those comments tagged with the corresponding part of speech

Output: sparse matrix of features

In [88]:
# helper function for spelling correction
tt = TweetTokenizer()
spell = SpellChecker(distance=1)  # set distance to 1 instead of the default 2 to speed things up
def spellcheck_col(row):
    return " ".join([spell.correction(word) for word in tt.tokenize(row)])

# helper function for lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_col(row):
    return " ".join([lemmatizer.lemmatize(w) for w in tt.tokenize(row)])

# helper function for part of speech tagging
def pos_col(row):
    return pos_tag(tt.tokenize(row))

# helper function for average word count
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

# stopwords for stopword count
stopwords = nltk.corpus.stopwords.words('english')

# object for meta-feature extraction
mfe = MFE(groups=["general", "statistical", "info-theory"], summary=["min", "median", "max"])

def text_cleanup(data):
    ##### CLEANUP OF INPUT DATA #####
    # punctuation removal
    data['prep'] = data['comments'].str.replace(r'[^\w\s]+', '')

    # lowercase
    data['prep'] = data['prep'].str.lower()

    # convert numbers to 'num'
    data['prep'] = data['prep'].str.replace('(\d+)', ' num ')

    # replace links with 'wasurl'
    data['prep'] = data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')

    # replace newlines and tabs with spaces
    data['prep'] = data['prep'].str.replace(r'\s+', " ")

    # fix any double spaces we created in the previous steps
    data['prep'] = data['prep'].str.replace(" +", " ")
    print("Superficial standardization complete")

    # typo correction
    data['prep'] = data.prep.apply(spellcheck_col)
    print("Typo correction complete")

    # lemmatization
    data['prep'] = data.prep.apply(lemmatize_col)
    print("Lemmatization complete")
    
    # part-of-speech extraction
    data['pos_tag'] = data.prep.apply(pos_col)
    print("POS extraction complete")
    
    return data

def preprocess(inFrame):
    data = inFrame.copy(deep=True)
    
    ##### META-FEATURE EXTRACTION #####
    # word count
    data['word_count'] = data['comments'].apply(lambda x: len(str(x).split(" ")))
    wc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['word_count'].to_numpy()))

    # character count
    data['char_count'] = data['comments'].str.len()
    # cc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['char_count'].to_numpy()))
    # TODO fix issue that is including NaNs in the char_count list

    # average word length
    data['avg_word'] = data['comments'].apply(lambda x: avg_word(x))
    aw = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['avg_word'].to_numpy()))

    # stopword count (stopwords will be removed later)
    data['stop_count'] = data['comments'].apply(lambda x: len([x for x in x.split() if x in stopwords]))
    sc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['stop_count'].to_numpy()))

    # digit count
    data['digit_count'] = data['comments'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    dc = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data['digit_count'].to_numpy()))

    
    ##### PART-OF-SPEECH COUNTING #####
    # all the POS tags in NLTK's system
    pos_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"]
    
    # add a column in the Pandas DataFrame for each POS tag
    for tag in pos_tags:
        data[tag] = 0.0
        
    # iterate through the column that contains tagged comments
    for idx, row in enumerate(data['pos_tag']):
        # get the number of words in the column in this row
        word_count = data['word_count'][idx]
        
        # count the number of each tag present in this comment
        counts = Counter([j for i,j in row])
        
        # iterate through all of NLTK's tags
        for tag in pos_tags:
            # find the ratio of this POS tag : total words in this comment
            pos_per_word = counts[tag] / word_count

            # place the ratio in the dataframe column for this tag, at the row of this comment
            data.at[idx, tag] = pos_per_word
    
    # create a sparse matrix out of the count data
    pos = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data[pos_tags.pop()].to_numpy()))
    for tag in pos_tags:
        new_pos = scipy.sparse.csr_matrix.transpose(scipy.sparse.csr_matrix(data[tag].to_numpy()))
        pos = scipy.sparse.hstack((pos, new_pos))
    
    
    ##### TF-IDF #####
    tfidf = tfidf_vectorizer.transform(data.prep)
    
    
    ##### FEATURE COMBINATION #####
    feature_matrix = scipy.sparse.hstack((tfidf, wc, aw, sc, dc, pos))
    
    
    return feature_matrix

### Training and Test Data Split

In [89]:
# clean up comments
comment_data = text_cleanup(comment_data)
print(comment_data.head())
test_data = text_cleanup(test_data)
print(test_data.head())

# get tfidf vectorizer
tfidf_vectorizer = get_vectorizer(comment_data, test_data)

# preprocess training set
full_matrix_train = preprocess(comment_data)
print("Full matrix shape:")
print(full_matrix_train.shape)
full_matrix_test = preprocess(test_data)
print("Test matrix shape:")
print(full_matrix_test.shape)

# reduce feature space
feature_selector = get_selector(full_matrix_train, comment_data['subreddits'])
reduced_feature_train = feature_selector.transform(full_matrix_train)
print("Reduced matrix shape:")
print(reduced_feature_train.shape)
reduced_feature_test = feature_selector.transform(full_matrix_test)
print("Reduced test matrix shape:")
print(reduced_feature_test.shape)

Superficial standardization complete
Typo correction complete
Lemmatization complete
POS extraction complete
   id                                           comments       subreddits  \
0   0  Honestly, Buffalo is the correct answer. I rem...           hockey   
1   1  Ah yes way could have been :( remember when he...              nba   
2   2  https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...  leagueoflegends   
3   3  He wouldn't have been a bad signing if we woul...           soccer   
4   4  Easy. You use the piss and dry technique. Let ...            funny   

                                                prep  \
0  honestly buffalo is the correct answer i remem...   
1  ah yes way could have been remember when he wa...   
2  wasurl if you dint find it already nothing out...   
3  he couldnt have been a bad signing if we could...   
4  easy you use the piss and dry technique let a ...   

                                             pos_tag  
0  [(honestly, RB), (buffalo, NN), (

### Models
SciKit-Learn implementations of classification models.

#### K-Fold Validation

In [91]:
X = reduced_feature_train
y = comment_data['subreddits']

clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=1, max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = MultinomialNB()
clf4 = DecisionTreeClassifier()
clf5 = KNeighborsClassifier(n_neighbors=7)
clf6 = SVC(gamma='scale', kernel='rbf', probability=True)
accuracies = [0,0,0,0,0,0]

In [94]:
scores = cross_val_score(clf2, X, y, cv=5, scoring='accuracy')
accuracies[1] = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "Random Forest"))

# dump to pickle file
with open('train_randomforest.pk', 'wb') as file:
    pickle.dump(clf2, file)

Accuracy: 0.40 (+/- 0.01) [Random Forest]


In [95]:
scores = cross_val_score(clf3, X, y, cv=5, scoring='accuracy')
accuracies[2] = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "Multinomial Naive Bayes"))

# dump to pickle file
with open('train_MNBayes.pk', 'wb') as file:
    pickle.dump(clf3, file)

Accuracy: 0.23 (+/- 0.00) [Multinomial Naive Bayes]


In [96]:
scores = cross_val_score(clf4, X, y, cv=5, scoring='accuracy')
accuracies[3] = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "Decision Tree"))

# dump to pickle file
with open('train_DecisionTree.pk', 'wb') as file:
    pickle.dump(clf4, file)

Accuracy: 0.25 (+/- 0.00) [Decision Tree]


In [97]:
scores = cross_val_score(clf5, X, y, cv=5, scoring='accuracy')
accuracies[4] = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "K-Nearest Neighbours"))

# dump to pickle file
with open('train_KNearest.pk', 'wb') as file:
    pickle.dump(clf5, file)

Accuracy: 0.07 (+/- 0.00) [K-Nearest Neighbours]


In [None]:
scores = cross_val_score(clf6, X, y, cv=5, scoring='accuracy')
accuracies[5] = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "SVC"))

# dump to pickle file
with open('train_SVC.pk', 'wb') as file:
    pickle.dump(clf6, file)

In [None]:
scores = cross_val_score(clf1, X, y, cv=5, scoring='accuracy')
accuracies[0] = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "Logistic Regression"))

# dump to pickle file
with open('train_logisticReg.pk', 'wb') as file:
    pickle.dump(clf1, file)

In [None]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('mnb', clf3), ('dtc', clf4), ('knn', clf5), ('svc', clf6)], weights=accuracies)
scores = cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "Voting Classifier"))

# dump to pickle file
with open('train_votingclass.pk', 'wb') as file:
    pickle.dump(eclf, file)

#### Predictions

In [None]:
predictions = eclf.predict(reduced_feature_test)

# dump predictions to file
with open('predictions.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    writer.writerow(predictions)

### File Export
Dump the vectorizer, selector, feature matrices, and dataframes to a files in order for other group members to use them.

In [None]:
# vectorizer
with open('vectorizer.pk', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)
    
# selector
with open('selector.pk', 'wb') as file:
    pickle.dump(feature_selector, file)

# feature matrices (load on the other side with scipy.sparse.load_npz())
scipy.sparse.save_npz('feature_matrix_train.npz', full_matrix_train)
scipy.sparse.save_npz('feature_matrix_test.npz', full_matrix_test)
scipy.sparse.save_npz('reduced_matrix_train.npz', full_matrix_train)
scipy.sparse.save_npz('reduced_matrix_test.npz', full_matrix_test)

### One-Hot Encoding of Classifiers
To be used for the models implemented from scratch.

In [9]:
# categories
labels = ['hockey', 'nba', 'leagueoflegends', 'soccer', \
          'funny', 'movies', 'anime', 'Overwatch' 'trees', \
          'GlobalOffensive', 'nfl', 'AskReddit', 'gameofthrones', \
          'conspiracy', 'worldnews', 'wow', 'europe', 'canada', \
          'Music', 'baseball']


# default setup
full_reference = np.eye(len(labels))
partial_reference = np.append(np.zeros((1,len(labels)-1), dtype=np.int8),\
                              np.eye(len(labels)-1, dtype=np.int8), axis = 0)

# encoder
def encode(label, labels=labels, ref=full_reference):
    location = labels.index(label)
    return ref[location]


# shapes
print(full_reference.shape)
print(partial_reference.shape)

# example
print(encode('hockey'))

(19, 19)
(19, 18)
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
