# Analyzing Sentiment Causation

In [25]:
import pandas as pd
import numpy as np
import nltk
import handmade.text_normalizer as tn
import handmade.model_evaluation_utils as meu
import handmade.pickle_jar as pj

# load the labelled movie reviews and pull the text / label columns out as arrays
dataset = pd.read_csv('data/movie_reviews.csv')
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# first 35000 rows are used for training, the remainder is held out for testing
train_reviews, test_reviews = reviews[:35000], reviews[35000:]
train_sentiments, test_sentiments = sentiments[:35000], sentiments[35000:]

# take 'no', 'but' and 'not' out of the stopword list so they survive normalization
# (presumably because they signal negation/contrast)
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in ('no', 'but', 'not'):
    stop_words.remove(kept_word)

# fetch the normalized training reviews through the project's pickle helper
# NOTE(review): pickleload=True presumably loads a previously pickled result
# instead of re-normalizing - confirm in handmade.pickle_jar
norm_train_reviews = pj.norm_train_reviews(pickleload=True)

In [23]:
## Interpreting Predictive Models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# bag-of-words vectorizer over unigrams and bigrams, keeping raw term counts
cv = CountVectorizer(ngram_range=(1, 2), min_df=0.0, max_df=1.0, binary=False)
cv_train_features = cv.fit_transform(norm_train_reviews)

# logistic regression classifier trained on the count features
# (fit returns the estimator itself, so construction and training are chained)
lr = LogisticRegression().fit(cv_train_features, train_sentiments)

# chain the fitted vectorizer and classifier so normalized text can be scored directly
lr_pipeline = make_pipeline(cv, lr)

# class labels exposed by the pipeline, kept as a plain list
classes = [label for label in lr_pipeline.classes_]

In [26]:
# two hand-written sample reviews to try the model on
new_corpus = [
    "The Lord of the Rings is an Excellent movie!",
    "I didn't like the recent movie on TV. It was NOT good and a waste of time!",
]
# normalize them using the customised stopword list
norm_new_corpus = tn.normalize_corpus(new_corpus, stopwords=stop_words)
norm_new_corpus

['lord rings excellent movie', 'not like recent movie tv not good waste time']

In [27]:
# predict movie review sentiment for the normalized sample reviews
# (lr_pipeline chains the `cv` vectorizer and the `lr` classifier built earlier)
lr_pipeline.predict(norm_new_corpus)

array(['positive', 'negative'], dtype=object)

In [28]:
# tabulate the per-class prediction probabilities (one row per sample review)
# so they can be read as a confidence measure
pd.DataFrame(data=lr_pipeline.predict_proba(norm_new_corpus), columns=classes)

Unnamed: 0,negative,positive
0,0.195404,0.804596
1,0.863934,0.136066


In [1]:
# use skater framework to easily interpret model decisions
# define helper function

from skater.core.local_interpretation.lime.lime_text import LimeTextExplainer

# text explainer labelled with the model's class names
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'skater'" - the skater package must be
# installed in the kernel's environment before this line can run.
explainer = LimeTextExplainer(class_names=classes)
# helper function for model interpretation
def interpret_classification_model_prediction(doc_index, norm_corpus, corpus, prediction_labels, explainer_obj):
    """Print the model's verdict for one document and render its explanation.

    Relies on the module-level ``lr_pipeline`` and ``classes`` defined in
    earlier cells for predictions and class names.

    Parameters
    ----------
    doc_index : int
        Position of the document in ``norm_corpus``, ``corpus`` and
        ``prediction_labels``.
    norm_corpus : sequence of str
        Normalized documents; the entry at ``doc_index`` is what the model
        and the explainer receive.
    corpus : sequence of str
        Original documents; only used for display.
    prediction_labels : sequence
        Actual labels; the entry at ``doc_index`` is printed as the actual
        sentiment.
    explainer_obj : object
        Explainer providing ``explain_instance(text, predict_fn,
        num_features=..., labels=...)`` whose result offers
        ``show_in_notebook()`` (e.g. a ``LimeTextExplainer``).

    Returns
    -------
    None
        Everything is printed / displayed as a side effect.
    """
    norm_doc = norm_corpus[doc_index]
    # display model prediction and actual sentiments
    print("Test document index: {index}\nActual sentiment: {actual} \nPredicted sentiment: {predicted}".format(
        index=doc_index,
        actual=prediction_labels[doc_index],
        predicted=lr_pipeline.predict([norm_doc])))
    # display actual review content
    print("\nReview:", corpus[doc_index])
    # display prediction probabilities
    print("\nModel Prediction Probabilities:")
    for probs in zip(classes, lr_pipeline.predict_proba([norm_doc])[0]):
        print(probs)
    # display model prediction interpretation, using the explainer handed in by
    # the caller (previously the global `explainer` was used and this argument
    # was silently ignored); labels=[1] requests the explanation for label index 1
    exp = explainer_obj.explain_instance(norm_doc, lr_pipeline.predict_proba, num_features=10, labels=[1])
    exp.show_in_notebook()

ModuleNotFoundError: No module named 'skater'