In [1]:
from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
#from Extensions import SVMDoc2Vec

In [2]:
# pip install scipy
# pip install gensim <--issue and doesn't work on ipython?

In [3]:
# retrieve corpus
corpus=MovieReviewCorpus(stemming=False,pos=False)

# use sign test for all significance testing
signTest=SignTest()

# location of svmlight binaries 
# TODO: change this to your local installation
svmlight_dir="/path/to/svmlight/binaries/"

print("--- classifying reviews using sentiment lexicon  ---")

# read in lexicon
lexicon=SentimentLexicon()

# on average there are more positive than negative words per review (~7.13 more positive than negative per review)
# to take this bias into account will use threshold (roughly the bias itself) to make it harder to classify as positive
threshold=8

# question 0.1
lexicon.classify(corpus.reviews,threshold,magnitude=False)
token_preds=lexicon.predictions
print(f"token-only results: {lexicon.getAccuracy():.2f}")

lexicon.classify(corpus.reviews,threshold,magnitude=True)
magnitude_preds=lexicon.predictions
print(f"magnitude results:{lexicon.getAccuracy():.2f}")

# question 0.2
p_value=signTest.getSignificance(token_preds,magnitude_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"magnitude lexicon results are {significance} with respect to token-only")


--- classifying reviews using sentiment lexicon  ---
token-only results: 0.68
magnitude results:0.68
magnitude lexicon results are not significant with respect to token-only


In [4]:
# question 1.0
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus.train)
NB.test(corpus.test)
# store predictions from classifier
non_smoothed_preds=NB.predictions
print(f"Accuracy without smoothing: {NB.getAccuracy():.2f}")

# question 2.0
# use smoothing
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus.train)
NB.test(corpus.test)
smoothed_preds=NB.predictions
# saving this for use later
num_non_stemmed_features=len(NB.vocabulary)
print(f"Accuracy using smoothing: {NB.getAccuracy():.2f}")

# question 2.1
# see if smoothing significantly improves results
p_value=signTest.getSignificance(non_smoothed_preds,smoothed_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing are {significance} with respect to no smoothing")

--- classifying reviews using Naive Bayes on held-out test set ---


  neg_score += np.log(self.condProb['NEG'][word_id])
  pos_score += np.log(self.condProb['POS'][word_id])


Accuracy without smoothing: 0.51
Accuracy using smoothing: 0.82
results using smoothing are significant with respect to no smoothing


In [5]:
# question 3.0
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False)
print("--- classifying reviews using 10-fold cross-evaluation ---")
# using previous instantiated object
NB.crossValidate(corpus)
# using cross-eval for smoothed predictions from now on
smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
print(f"Std. Dev: {NB.getStdDeviation()}")


--- classifying reviews using 10-fold cross-evaluation ---
Accuracy: 0.813
Std. Dev: 0.02590366769397721


In [6]:
# question 4.0
print("--- stemming corpus ---")
# retrieve corpus with tokenized text and stemming (using porter)
stemmed_corpus=MovieReviewCorpus(stemming=True,pos=False)
print("--- cross-validating NB using stemming ---")
NB.crossValidate(stemmed_corpus)
stemmed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
print(f"Std. Dev: {NB.getStdDeviation():.3f}")

# TODO Q4.1
# see if smoothing significantly improves results
p_value=signTest.getSignificance(smoothed_preds,stemmed_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using stemming are {significance} with respect to no stemming")

# TODO Q4.2
print("--- determining the number of features before/after stemming ---")
print(f"vocab size baseline: {num_non_stemmed_features}")
print(f"vocab size stemmed: {len(NB.vocabulary)}")



--- stemming corpus ---
--- cross-validating NB using stemming ---
Accuracy: 0.811
Std. Dev: 0.025
results using stemming are not significant with respect to no stemming
--- determining the number of features before/after stemming ---
vocab size baseline: 52550
vocab size stemmed: 32556


In [7]:
# question Q5.0
# cross-validate model using smoothing and bigrams
print("--- cross-validating naive bayes using smoothing and bigrams ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=False,discard_closed_class=False)
NB.crossValidate(corpus)
smoothed_and_bigram_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.2f}") 
print(f"Std. Dev: {NB.getStdDeviation():.2f}")

# see if bigrams significantly improves results on smoothed NB only
p_value=signTest.getSignificance(smoothed_preds,smoothed_and_bigram_preds)
signifance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams are {signifance} with respect to smoothing only")

# TODO Q5.1
print()
print(f"vocab size baseline: {num_non_stemmed_features}")
print(f"vocab size bigram: {len(NB.vocabulary)}")


--- cross-validating naive bayes using smoothing and bigrams ---
Accuracy: 0.76
Std. Dev: 0.02
results using smoothing and bigrams are significant with respect to smoothing only

vocab size baseline: 52550
vocab size bigram: 500073


In [8]:
# TODO Q6 and 6.1
print("--- classifying reviews using SVM 10-fold cross-eval ---")
corpus=MovieReviewCorpus(stemming=False,pos=False)
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False)

SVM.train(corpus.train)
SVM.test(corpus.test)
#SVM.crossValidate(corpus)
print(f"Accuracy: {NB.getAccuracy():.3f}")
#print(f"Std. Dev: {NB.getStdDeviation():.3f}")


--- classifying reviews using SVM 10-fold cross-eval ---


100%|██████████| 1800/1800 [00:11<00:00, 154.61it/s]


Accuracy: 0.763


In [9]:
# TODO Q7
print("--- adding in POS information to corpus ---")
print("--- training svm on word+pos features ----")
corpus=MovieReviewCorpus(stemming=False,pos=True)
#SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False)

#SVM.crossValidate(corpus)
#print(f"Accuracy: {NB.getAccuracy():.3f}")
#print(f"Std. Dev: {NB.getStdDeviation():.3f}")


print("--- training svm discarding closed-class words ---")
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=True)

SVM.train(corpus.train)
SVM.test(corpus.test)
#SVM.crossValidate(corpus)
print(f"Accuracy: {SVM.getAccuracy():.3f}")
#print(f"Std. Dev: {NB.getStdDeviation():.3f}")



--- adding in POS information to corpus ---
--- training svm on word+pos features ----
--- training svm discarding closed-class words ---


100%|██████████| 1800/1800 [00:12<00:00, 146.50it/s]


Accuracy: 0.763


In [11]:
print(f"Accuracy: {SVM.getAccuracy():.3f}")



Accuracy: 0.830


In [10]:
# question 8.0
print "--- using document embeddings ---"



SyntaxError: Missing parentheses in call to 'print'. Did you mean print("--- using document embeddings ---")? (<ipython-input-10-a53cf6863dad>, line 2)