# Load Data, Import stuff

In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import os

% matplotlib inline
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.io as pio

init_notebook_mode()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [3]:
import textstat
from nltk.corpus import stopwords

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from xgboost.sklearn import XGBClassifier

#To visualize errors:
from prettytable import PrettyTable
from collections import defaultdict

#For topic modeling:
from gensim.models.wrappers.ldamallet import LdaMallet
from gensim.models import LdaModel, LsiModel
from gensim.corpora import Dictionary
mallet_path = './mallet-2.0.8/bin/mallet'
import re


invalid escape sequence \c


invalid escape sequence \s


detected Windows; aliasing chunkize to chunkize_serial


`scipy.sparse.sparsetools` is deprecated!
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.



In [4]:
train_file, test_file = './sampleData/sample_reviews_train.csv', './sampleData/sample_reviews_test.csv'
reviews_train = pd.read_csv(train_file)

## Classification

In [46]:
def run_classification(reviews_df, models, create_vectorizer, num_train=None, num_test=None, weight_fn=None):
    
    #Turn all the features into useful values:
    train_df, test_df = train_test_split(reviews_df, test_size=0.2)
    if num_train:
        train_df = train_df.sample(num_train)
    if num_test:
        test_df = test_df.sample(num_test)
    vectorizer = create_vectorizer()
    vectorizer.fit(train_df)
    print('Fit vectorizer')
    train_vectors = vectorizer.transform(train_df)
    test_vectors = vectorizer.transform(test_df)
    print('Created vectors')
    
    #Create structures to print results of classification nicely:
    train_scores, val_scores = defaultdict(list), defaultdict(list)
    model_names = [str(model) for model in models]
    score_tables = {name: PrettyTable() for name in model_names}
    for table in score_tables.values():
        table.field_names = ['Data', 'Train F1', 'Validation F1']
        
    #Here, we need to label each thing as a True or False:
    trainLabels = np.ndarray(shape=(train_df.shape[0], 1), dtype=object)
    testLabels = np.ndarray(shape=(test_df.shape[0], 1), dtype=object)
    
    train_vote_count = np.asarray(train_df['total_votes'])
    test_vote_count = np.asarray(test_df['total_votes'])
    
    #For the train data:
    falseIndices = np.where(train_vote_count == 0)
    trueIndices = np.where(train_vote_count != 0)
    
    np.put(trainLabels, falseIndices, False)
    np.put(trainLabels, trueIndices, True)
    
    #For the test data:
    falseIndices = np.where(test_vote_count == 0)
    trueIndices = np.where(test_vote_count != 0)
    
    np.put(testLabels, falseIndices, False)
    np.put(testLabels, trueIndices, True)
    
    #To prevent unknown type errors:
    trainLabels = trainLabels.astype(bool)
    testLabels = testLabels.astype(bool)
    
    #Now we do the classification:
    #Note that the flatten() was added just to avoid an annoying warning about column vectors
    for model in models:
        if (model == LinearSVC):
            createdModel = model(max_iter=10000)
        else:
            createdModel = model()
            
        if weight_fn:
            createdModel.fit(train_vectors, trainLabels.flatten(), sample_weight=weight_fn(train_df))
        else:
            createdModel.fit(train_vectors, trainLabels.flatten())
        print('Fit model')
    
        #Get error stats:
        #First the predictions:
        train_predictions = createdModel.predict(train_vectors)
        test_predictions = createdModel.predict(test_vectors)

        #Now the actual scores:
        train_f1 = f1_score(trainLabels, train_predictions, pos_label=True)
        val_f1 = f1_score(testLabels, test_predictions, pos_label=False)

        #Now build the tables:
        #name = model.__class__.__name__
        name = str(model)
        score_tables[name].add_row(['Placeholder', f'{train_f1:.3f}', f'{val_f1:.3f}'])
        train_scores[name].append(train_f1)
        val_scores[name].append(val_f1)

    #Give a more general metric:
    for name, table in score_tables.items():
        table.add_row(
            ('Average +/- std.dev',
             f'{np.mean(train_scores[name]):.3f} +/- {np.std(train_scores[name]):.3f}', 
             f'{np.mean(val_scores[name]):.3f} +/- {np.std(val_scores[name]):.3f}'
            ))
    for name, table in score_tables.items():
        print(f"\n{name}\n{'-' * len(name)}")
        print(table)

In [6]:
class PandasCountVectorizer(TransformerMixin):
    # A bag-of-words vectorizer that works with an entire DataFrame as input.
    # This lets me create a combined feature pipeline with multiple feature vectorizers
    # where I don't have to worry about passing different columns to each.

    def __init__(self, *args, **kwargs):
        self.vectorizer = CountVectorizer(*args, **kwargs)

    def fit(self, X, *args, **kwargs):
        self.vectorizer.fit(X['text'].values)
        return self

    def transform(self, X):
        return self.vectorizer.transform(X['text'].values)

In [7]:
class CustomFeatures(TransformerMixin):
    # A feature vectorizer that creates features based on length, readability and review stars.
    # Use this to add other new custom features (like sentiment and user history maybe)
    # Btw, I think sentiment might not help much - intuitively it seems like it'd be highly correlated with 
    # review stars - 5 star rating - likely positive; 1 star rating - likely negative.

    def fit(self, *args, **kwargs):
        return self
    
    def _review_features(self, review):
        return [
            len(review.text),
            review.stars,
            review.coleman_liau_index,
            review.automated_readability_index,
            review.dale_chall_readability_score,
            review.linsear_write_formula,
            review.gunning_fog,
            review.flesch_reading_ease,
        ]

    def transform(self, reviews):
        return np.array([self._review_features(r) for r in reviews.itertuples()])

In [33]:
models = [RandomForestClassifier, LogisticRegression, XGBClassifier, LinearSVC]

In [85]:
# Ordinary Classification with bag of words with basic filtering
bow_vectorizer_fn = lambda: PandasCountVectorizer(min_df=10, max_df=0.75, max_features=1000)

#Now we classify with each model:
run_classification(reviews_train, models, bow_vectorizer_fn)

Fit vectorizer
Created vectors
Fit model
Fit model
Fit model
Fit model

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
--------------------------------------------------------
+---------------------+-----------------+-----------------+
|         Data        |     Train F1    |  Validation F1  |
+---------------------+-----------------+-----------------+
|     Placeholder     |      0.983      |      0.652      |
| Average +/- std.dev | 0.983 +/- 0.000 | 0.652 +/- 0.000 |
+---------------------+-----------------+-----------------+

<class 'sklearn.linear_model.logistic.LogisticRegression'>
----------------------------------------------------------
+---------------------+-----------------+-----------------+
|         Data        |     Train F1    |  Validation F1  |
+---------------------+-----------------+-----------------+
|     Placeholder     |      0.607      |      0.674      |
| Average +/- std.dev | 0.607 +/- 0.000 | 0.674 +/- 0.000 |
+---------------------+------------


Liblinear failed to converge, increase the number of iterations.



### Comments

See here for how I'm calculating baseline F1: https://stats.stackexchange.com/questions/217376/intuition-about-f1-score

So first let's get some metrics/methodology descriptions out of the way: I went with Munmun's suggestion of just predict class based on whether any votes were given to a review. This avoids the whole threshold issue and actually gives about a 40/60 split on the data (40% is classified as "endorsed" or "True" in the case of the above code). For baseline metrics, we can achieve a 0.65 F1 score on the train set and 0.651 F1 score on the validation set, should we predict all reviews to have social endorsement.

Taking those values into consideration, the models we are trying do not do very well. The Random Forest seems to overfit to the training data, as it gets a 0.985 F1 score and a 0.65 F1 score on the validation set, so it did very badly. LogisticRegression gets a validation F1 score of 0.683, which sounds better than the forest, until you realize that its training F1 score is 0.61, so that's actually worse than the F1 baseline. It's a similar story with the XGB Classifier, and SVC is having problems, so...

In any case, the bag of words feature alone does not help and can actually make things worse. It does not bode well for classification unfortunately, but let's keep going and see what we get.

### Add readability scores

In [9]:
readability_fns = [
    textstat.coleman_liau_index,
    textstat.automated_readability_index,
    textstat.dale_chall_readability_score,
    textstat.linsear_write_formula,
    textstat.gunning_fog,
    textstat.flesch_reading_ease,
]
for fn in readability_fns:
    print(fn.__name__)
    reviews_train[fn.__name__] = reviews_train['text'].apply(fn)

coleman_liau_index
automated_readability_index
dale_chall_readability_score
linsear_write_formula
gunning_fog
flesch_reading_ease


In [51]:
reviews_train.to_csv(train_file)

In [10]:
# Classification with bag-of-words + custom features (length, review stars, different readability scores)
bow_readability_vectorizer_fn = lambda: FeatureUnion([
    ('bow', PandasCountVectorizer(min_df=10, max_df=0.75, max_features=1000)),
    ('custom', CustomFeatures()),
])
run_classification(reviews_train, models, bow_readability_vectorizer_fn)

Fit vectorizer
Created vectors
Fit model
Fit model
Fit model
Fit model

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
--------------------------------------------------------
+---------------------+-----------------+-----------------+
|         Data        |     Train F1    |  Validation F1  |
+---------------------+-----------------+-----------------+
|     Placeholder     |      0.983      |      0.649      |
| Average +/- std.dev | 0.983 +/- 0.000 | 0.649 +/- 0.000 |
+---------------------+-----------------+-----------------+

<class 'sklearn.linear_model.logistic.LogisticRegression'>
----------------------------------------------------------
+---------------------+-----------------+-----------------+
|         Data        |     Train F1    |  Validation F1  |
+---------------------+-----------------+-----------------+
|     Placeholder     |      0.615      |      0.684      |
| Average +/- std.dev | 0.615 +/- 0.000 | 0.684 +/- 0.000 |
+---------------------+------------


Liblinear failed to converge, increase the number of iterations.



### Comments

So the logistic regression seemed to get some benefit out of adding these features, but that still didn't really give us much.

### Classification with weights

Next I tried weighted classification - since we have lots of data with near-zero votes and very few with high votes, I decided to add a weight to each training example proportional to its vote count. I experimented with different formulae - nothing helps much.

In [11]:
weight_by_votes = lambda df: 1 + df.total_votes.values/10
run_classification(reviews_train, models, bow_readability_vectorizer_fn, weight_fn=weight_by_votes)

Fit vectorizer
Created vectors
Fit model



Liblinear failed to converge, increase the number of iterations.



Fit model
Fit model
Fit model

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
--------------------------------------------------------
+---------------------+-----------------+-----------------+
|         Data        |     Train F1    |  Validation F1  |
+---------------------+-----------------+-----------------+
|     Placeholder     |      0.982      |      0.658      |
| Average +/- std.dev | 0.982 +/- 0.000 | 0.658 +/- 0.000 |
+---------------------+-----------------+-----------------+

<class 'sklearn.linear_model.logistic.LogisticRegression'>
----------------------------------------------------------
+---------------------+-----------------+-----------------+
|         Data        |     Train F1    |  Validation F1  |
+---------------------+-----------------+-----------------+
|     Placeholder     |      0.649      |      0.648      |
| Average +/- std.dev | 0.649 +/- 0.000 | 0.648 +/- 0.000 |
+---------------------+-----------------+-----------------+

<class 'xgboost

### Comments

Weighting certain samples somehow made things worse, and I'm not sure what that really means.

### Topic Modeling

In [26]:
class Corpus():
    def __init__(self, texts):
        dictionary = Dictionary([self.text_to_tokens(text) for text in texts])
        dictionary.filter_extremes(no_below=5, no_above=0.6)
        self._dictionary = dictionary
        self._texts = texts

    def text_to_tokens(self, text):
        return [w for w in re.split('\W', text) if w]
    
    def doc2bow(self, text):
        return self._dictionary.doc2bow(self.text_to_tokens(text))

    def __iter__(self):
        for text in self._texts:
            yield self.doc2bow(text)

In [41]:
corpus = Corpus(reviews_train.text)

In [42]:
topic_model = LdaModel(corpus, id2word=corpus._dictionary)


divide by zero encountered in log



In [29]:
lsi_model = LsiModel(corpus, id2word=corpus._dictionary)

In [43]:
class TopicModelVectorizer(TransformerMixin):
    def __init__(self, model, corpus):
        self._model = model
        self._corpus = corpus

    def fit(self, *args, **kwargs):
        return self
    
    def doc_topic_vector(self, doc):
        topic_scores = self._model[self._corpus.doc2bow(doc)]
        topic_vector = [0 for _ in range(self._model.num_topics)]
        for topic_id, score in topic_scores:
            topic_vector[topic_id] = score
        return topic_vector

    def transform(self, docs):
        return np.array([self.doc_topic_vector(doc) for doc in docs])

In [44]:
def create_topic_vectorizer_creator(model, corpus):
    return lambda: TopicModelVectorizer(model, corpus)

In [None]:
#Now we classify with each model using topic modeling:

run_classification(reviews_train, models, create_topic_vectorizer_creator(topic_model, corpus))

### Feature Correlation Analysis

Not sure why this isn't working, but I think it has to do with the fact that the matrix is sparse.
Only 8 features for nearly 40k reviews.

In [65]:
#Referenced https://stackoverflow.com/questions/36108377/how-to-use-the-split-function-on-every-row-in-a-dataframe-in-python
#Referenced https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/
#Referenced https://seaborn.pydata.org/generated/seaborn.heatmap.html

#We know what vectorizer function has all the features we want, so create those features:
vectorizer = bow_readability_vectorizer_fn()
vectorizer.fit(reviews_train)
print('Fit vectorizer')
review_vectors = vectorizer.transform(reviews_train)
print('Created vectors')

#Now we want to find correlations between these features:
#I am ignoring bag of words, as I am not sure how correlation will work there
reviewFeat = review_vectors[:, 1000:1008]
reviewFeat = pd.DataFrame({"Length": list(reviewFeat[:,0]), "Star Count": list(reviewFeat[:,1]),
                          "Coleman-Liau Index": list(reviewFeat[:,2]),
                           "Automated Readability Index": list(reviewFeat[:,3]),
                           "Dale Chall Readability Score": list(reviewFeat[:,4]),
                           "Linesear Write Formula": list(reviewFeat[:,5]),
                           "Gunning-Fog Score": list(reviewFeat[:,6]),
                           "Flesch Reading Ease": list(reviewFeat[:,7])})

reviewFeat = reviewFeat.astype(str)

for col in list(reviewFeat.columns.values):
    splitter = lambda x: x[col].split("\t")[1]
    reviewFeat[col] = reviewFeat.apply(splitter, axis=0)
    
# reviewFeat = reviewFeat.applymap(split("\t")[1])
print(reviewFeat.head())
# reviewFeat = pd.DataFrame(data=reviewFeat.ravel(), index=np.arange(reviewFeat.shape[0]),
#                                                            columns=["Length", "Star Count", "Coleman-Liau Index",
#                                                                     "Automated Readability Index",
#                                                                     "Dale Chall Readability Score",
#                                                                     "Linesear Write Formula",
#                                                                     "Gunning-Fog Score",
#                                                                     "Flesch Reading Ease"])
sns.heatmap(data=reviewFeat.corr())

Fit vectorizer
Created vectors


KeyError: ('Automated Readability Index', 'occurred at index Automated Readability Index')