# Imports

In [None]:
import string
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

sns.set_style('whitegrid')
%matplotlib inline

# Read in and view data

In [None]:
# Load the training data, discard the two columns that carry no signal for
# classification, and relabel the 0/1 target so the tables below read more clearly.
reddit = (
    pd.read_csv('../../datasets/reddit_train.csv')
    .drop(columns=['num', 'X'])
    .assign(REMOVED=lambda frame: frame['REMOVED'].map({1: 'Yes', 0: 'No'}))
)

reddit.head()  # preview the first 5 rows

In [None]:
reddit.describe() # summary statistics for the whole dataset

In [None]:
reddit.groupby('REMOVED').describe() # the same summary statistics, computed separately for each REMOVED class

# Exploratory analysis

In [None]:
reddit['LENGTH'] = reddit['BODY'].apply(len) # number of characters in each comment body, stored as a new LENGTH column

In [None]:
reddit.hist(column = 'LENGTH', bins = 100, by = 'REMOVED', sharex = True, figsize = (13,5)) # one histogram of comment length per REMOVED class, on a shared x-axis

From this plot we can see that the length of a comment does not seem to be a good indicator of whether it was removed. This is as expected, but it is always worth exploring the data.

In [None]:
def extract_punc(text):
    """Return how many characters of ``text`` are in ``string.punctuation``."""
    return sum(1 for symbol in text if symbol in string.punctuation)

In [None]:
reddit['NUM_PUNC'] = reddit['BODY'].apply(extract_punc) # punctuation count of each comment, via extract_punc
reddit.hist(column = 'NUM_PUNC', bins = 100, by = 'REMOVED', sharex = True, figsize = (13,5)) # one histogram of punctuation count per REMOVED class, on a shared x-axis

Again, this shows little to no difference between the distributions of the number of punctuation marks used in comments that were removed and those that were not, meaning it is not a good feature to add to our classifier.

# Process data

It is important to preprocess your text data into a simpler form, for example by removing words that carry no weight (e.g. 'I' or 'an') so that they do not drown out the important words that do carry meaning. We will also remove punctuation from our comments; however, punctuation could be an important feature in some datasets.

In [None]:
def text_process(text):
    """Tokenise a comment for use as a ``CountVectorizer`` analyzer.

    Removes every character found in ``string.punctuation``, splits the result
    on whitespace and drops English stopwords. Stopwords are matched
    case-insensitively; the tokens that survive keep their original casing.

    Parameters
    ----------
    text : str
        Raw comment body.

    Returns
    -------
    list of str
        The non-stopword tokens, in their original order.
    """
    # Fetch the stopword list once per call instead of once per token, and hold
    # it in a set so each membership test is a hash lookup rather than a list scan.
    stop_words = set(stopwords.words('english'))
    no_punc = text.translate(str.maketrans('', '', string.punctuation))
    return [word for word in no_punc.split() if word.lower() not in stop_words]

In [None]:
# Split the data into train and test sets. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    reddit['BODY'], reddit['REMOVED'], random_state=42
)

In [None]:
# Three-stage pipeline: token counts -> TF-IDF weighting -> Naive Bayes classifier
text_pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_process)),  # bag of words built with our custom analyzer
    ('tfidf', TfidfTransformer()),                    # Term Frequency - Inverse Document Frequency
    ('NaiveBayes', MultinomialNB()),                  # multinomial Naive Bayes
])

# Train, test  and adapt models
Note: These models may take a while to train, particularly on a slow computer or laptop

In [None]:
# Fit the pipeline on the training comments, then predict the held-out ones
remove_analysis = text_pipeline.fit(X_train, y_train)
pred = remove_analysis.predict(X_test)

# Report accuracy, the per-class precision/recall table and the confusion matrix
for report in (accuracy_score, classification_report, confusion_matrix):
    print(report(y_test, pred), '\n\n')


We can see that although the accuracy score of this model seems decent at around 68%, the actual model is terrible; this could be due to the significant class imbalance. Next, we create a model where the classes are evenly balanced.

In [None]:
reddit_no = reddit[reddit['REMOVED'] == 'No'] # data from the No class
reddit_yes = reddit[reddit['REMOVED'] == 'Yes'] # data from the Yes class
to_select = min(len(reddit_no), len(reddit_yes)) # size of the smaller class

# Keep the first `to_select` rows of each class so both classes end up the same size
reddit = pd.concat([reddit_no.iloc[:to_select, :], reddit_yes.iloc[:to_select, :]])

# Pass the column as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so a bare Series is no longer read as the variable to count.
sns.countplot(x=reddit['REMOVED']);

In [None]:
# Split the balanced data into train and test sets. A fixed random_state keeps
# the split, and the metrics below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    reddit['BODY'], reddit['REMOVED'], random_state=42
)

In [None]:
# Fit a fresh, unfitted copy of the pipeline. Pipeline.fit returns the pipeline
# itself, so calling text_pipeline.fit(...) again would make this variable an
# alias of `remove_analysis` and silently overwrite the model trained earlier.
remove_analysis_balanced = clone(text_pipeline).fit(X_train, y_train)
pred_balanced = remove_analysis_balanced.predict(X_test) # predict the held-out comments

print(accuracy_score(y_test, pred_balanced), '\n\n')
print(classification_report(y_test, pred_balanced), '\n\n')
print(confusion_matrix(y_test, pred_balanced), '\n\n')

Although the accuracy of the model is approximately equal to that of the previous one, the overall performance metrics are much better, as the model predicts more evenly across the classes. This demonstrates why you need to explore your data before blindly fitting a model to it, and why looking only at the accuracy score can paint a very false picture of how good your model is. The phrase 'garbage in, garbage out' comes to mind here.

To make the predictions better, another thing that might work is stemming. This is the process of attempting to reduce a word to its base form, e.g. 'running' would become 'run'. Let's try this and see if it has an impact on our predictions (we will use the same balanced dataset as the previous model).

In [None]:
def text_process_with_stem(text):
    """Tokenise a comment and reduce each token to its Porter stem.

    Removes every character found in ``string.punctuation``, splits the result
    on whitespace, drops English stopwords (matched case-insensitively) and
    stems the lower-cased form of each remaining token.

    Parameters
    ----------
    text : str
        Raw comment body.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    ps = PorterStemmer() # stemmer object
    # Fetch the stopword list once per call instead of once per token, and hold
    # it in a set so each membership test is a hash lookup rather than a list scan.
    stop_words = set(stopwords.words('english'))
    no_punc = text.translate(str.maketrans('', '', string.punctuation))
    lowered = (word.lower() for word in no_punc.split())
    return [ps.stem(word) for word in lowered if word not in stop_words]

In [None]:
# Three-stage pipeline as before, but with tokens produced by the stemming analyzer
text_pipeline_stem = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_process_with_stem)),  # changed analyzer
    ('tfidf', TfidfTransformer()),
    ('NaiveBayes', MultinomialNB()),
])

In [None]:
# Split the data into train and test sets. A fixed random_state keeps the split
# reproducible, and using the same seed for each model variant means they are
# evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    reddit['BODY'], reddit['REMOVED'], random_state=42
)

In [None]:
# Fit the stemming pipeline, then predict the held-out comments
remove_analysis_balanced_stem = text_pipeline_stem.fit(X_train, y_train)
pred_balanced_stem = remove_analysis_balanced_stem.predict(X_test)

# Report accuracy, the per-class precision/recall table and the confusion matrix
for report in (accuracy_score, classification_report, confusion_matrix):
    print(report(y_test, pred_balanced_stem), '\n\n')

This model performs very similarly to the previous model; it is possibly marginally better, but this will likely depend on how the data is split. Instead of stemming, we could try lemmatizing the words.

In [None]:
def text_process_with_lemma(text):
    """Tokenise a comment and lemmatize each token with WordNet.

    Removes every character found in ``string.punctuation``, splits the result
    on whitespace, drops English stopwords (matched case-insensitively) and
    lemmatizes the lower-cased form of each remaining token.

    Parameters
    ----------
    text : str
        Raw comment body.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.
    """
    lemma = WordNetLemmatizer()
    # Fetch the stopword list once per call instead of once per token, and hold
    # it in a set so each membership test is a hash lookup rather than a list scan.
    stop_words = set(stopwords.words('english'))
    no_punc = text.translate(str.maketrans('', '', string.punctuation))
    lowered = (word.lower() for word in no_punc.split())
    return [lemma.lemmatize(word) for word in lowered if word not in stop_words]

In [None]:
# Three-stage pipeline as before, but with tokens produced by the lemmatizing analyzer
text_pipeline_lemma = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_process_with_lemma)),  # changed analyzer
    ('tfidf', TfidfTransformer()),
    ('NaiveBayes', MultinomialNB()),
])

In [None]:
# Split the data into train and test sets. A fixed random_state keeps the split
# reproducible, and using the same seed for each model variant means they are
# evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    reddit['BODY'], reddit['REMOVED'], random_state=42
)

In [None]:
remove_analysis_balanced_lemma = text_pipeline_lemma.fit(X_train, y_train) # fit model
pred_balanced_lemma = remove_analysis_balanced_lemma.predict(X_test) # predict unseen instances

# Score the lemmatizer's own predictions. This cell previously printed the
# metrics for `pred_balanced_stem`, i.e. predictions made by a different model
# for a different test split, so re-read the output before drawing conclusions.
print(accuracy_score(y_test, pred_balanced_lemma), '\n\n')
print(classification_report(y_test, pred_balanced_lemma), '\n\n')
print(confusion_matrix(y_test, pred_balanced_lemma), '\n\n')

This model appears to be no better than chance at predicting whether a comment should be removed, therefore we abandon this approach.