In [1]:
import aux_functions as aux
import bayes
import rule_based
from rule_based_new import RuleBasedSentimentAnalyser


In [2]:
N = 1  # make n-grams (1 for single word, 2 for bigrams, etc.)
PRINT_ERRORS = False
conf = dict(n=N, print_errors=PRINT_ERRORS)

In [3]:
# initialise datasets and dictionaries
sentimentDictionary, sentencesTrain, sentencesTest, sentencesNokia = aux.read_files()
sentencesFilms = {**sentencesTrain, **sentencesTest}  # merge the two dictionaries

# build conditional probabilities using training data
pWords = bayes.trainBayes(sentencesTrain, n=N)



## Run Naive Bayes classifier on datasets

In [4]:
bayes.testBayes(sentencesTrain,  "Films (Train Data, Naive Bayes)\t", *pWords, 0.5, **conf)


Films (Train Data, Naive Bayes)	 Accuracy (All)=0.89 (8559/9605)

Films (Train Data, Naive Bayes)	 Precision (Pos)=0.90 (4236/4730)
Films (Train Data, Naive Bayes)	 Recall (Pos)=0.88 (4236/4788)
Films (Train Data, Naive Bayes)	 F-measure (Pos)=0.89
Films (Train Data, Naive Bayes)	 Precision (Neg)=0.89 (4323/4875)
Films (Train Data, Naive Bayes)	 Recall (Neg)=0.90 (4323/4817)
Films (Train Data, Naive Bayes)	 F-measure (Neg)=0.89


In [5]:
bayes.testBayes(sentencesTest,  "Films  (Test Data, Naive Bayes)\t", *pWords, 0.5, **conf)


Films  (Test Data, Naive Bayes)	 Accuracy (All)=0.78 (827/1059)

Films  (Test Data, Naive Bayes)	 Precision (Pos)=0.81 (408/504)
Films  (Test Data, Naive Bayes)	 Recall (Pos)=0.75 (408/544)
Films  (Test Data, Naive Bayes)	 F-measure (Pos)=0.78
Films  (Test Data, Naive Bayes)	 Precision (Neg)=0.76 (419/555)
Films  (Test Data, Naive Bayes)	 Recall (Neg)=0.81 (419/515)
Films  (Test Data, Naive Bayes)	 F-measure (Neg)=0.78


In [6]:
bayes.testBayes(sentencesNokia, "Nokia   (All Data,  Naive Bayes)\t", *pWords, 0.7, **conf)


Nokia   (All Data,  Naive Bayes)	 Accuracy (All)=0.58 (154/266)

Nokia   (All Data,  Naive Bayes)	 Precision (Pos)=0.77 (105/136)
Nokia   (All Data,  Naive Bayes)	 Recall (Pos)=0.57 (105/186)
Nokia   (All Data,  Naive Bayes)	 F-measure (Pos)=0.65
Nokia   (All Data,  Naive Bayes)	 Precision (Neg)=0.38 (49/130)
Nokia   (All Data,  Naive Bayes)	 Recall (Neg)=0.62 (49/80)
Nokia   (All Data,  Naive Bayes)	 F-measure (Neg)=0.47


## Run sentiment dictionary based classifier on datasets

In [7]:
rule_based.testDictionary(sentencesTrain,  "Films (Train Data, Rule-Based)\t", sentimentDictionary, 0, PRINT_ERRORS)


Films (Train Data, Rule-Based)	 Accuracy (All)=0.62 (5980/9605)

Films (Train Data, Rule-Based)	 Precision (Pos)=0.59 (3855/6547)
Films (Train Data, Rule-Based)	 Recall (Pos)=0.81 (3855/4788)
Films (Train Data, Rule-Based)	 F-measure (Pos)=0.68
Films (Train Data, Rule-Based)	 Precision (Neg)=0.69 (2125/3058)
Films (Train Data, Rule-Based)	 Recall (Neg)=0.44 (2125/4817)
Films (Train Data, Rule-Based)	 F-measure (Neg)=0.54


In [8]:
rule_based.testDictionary(sentencesTest,  "Films  (Test Data, Rule-Based)\t",  sentimentDictionary, 0, PRINT_ERRORS)


Films  (Test Data, Rule-Based)	 Accuracy (All)=0.66 (698/1059)

Films  (Test Data, Rule-Based)	 Precision (Pos)=0.63 (447/711)
Films  (Test Data, Rule-Based)	 Recall (Pos)=0.82 (447/544)
Films  (Test Data, Rule-Based)	 F-measure (Pos)=0.71
Films  (Test Data, Rule-Based)	 Precision (Neg)=0.72 (251/348)
Films  (Test Data, Rule-Based)	 Recall (Neg)=0.49 (251/515)
Films  (Test Data, Rule-Based)	 F-measure (Neg)=0.58


In [9]:
rule_based.testDictionary(sentencesNokia, "Nokia   (All Data, Rule-Based)\t",  sentimentDictionary, 0, PRINT_ERRORS)


Nokia   (All Data, Rule-Based)	 Accuracy (All)=0.80 (213/266)

Nokia   (All Data, Rule-Based)	 Precision (Pos)=0.80 (178/223)
Nokia   (All Data, Rule-Based)	 Recall (Pos)=0.96 (178/186)
Nokia   (All Data, Rule-Based)	 F-measure (Pos)=0.87
Nokia   (All Data, Rule-Based)	 Precision (Neg)=0.82 (35/43)
Nokia   (All Data, Rule-Based)	 Recall (Neg)=0.44 (35/80)
Nokia   (All Data, Rule-Based)	 F-measure (Neg)=0.58


## Useful words
print most useful words

In [10]:
useful = aux.mostUseful(*pWords, 50)


In [11]:
print(useful['NEGATIVE'])

['mediocre', 'generic', 'badly', 'unfunny', 'routine', 'lame', 'poorly', 'mindless', 'boring', 'bore', 'disguise', 'stale', 'tiresome', 'pointless', 'offensive', 'superficial', 'shoot', 'meandering', 'annoying', 'thinks', 'product', 'stupid', 'unless', 'animal', 'horrible', 'chan', 'wasted', 'pinocchio', 'junk', 'banal', 'harvard', 'fatal', 'sadly', 'incoherent', 'lifeless', 'seagal', 'supposed', 'waste', 'dull', 'cliched', 'inept', 'collection', 'sentiment', 'amateurish', 'meant', 'kung', 'pathetic', 'trite', 'missed', 'pile']


In [12]:
print(useful['POSITIVE'])

['understands', 'timely', 'poem', 'unflinching', 'breathtaking', 'visceral', 'ingenious', 'captivating', 'hopeful', 'poignant', 'startling', 'iranian', 'powerful', 'format', 'heartbreaking', 'grown', 'jealousy', 'transcends', 'literary', 'subversive', 'spare', 'unexpected', 'provides', 'resonant', 'tour', 'polished', 'wry', 'vividly', 'chilling', 'captures', 'tender', 'playful', 'respect', 'heartwarming', 'wonderfully', 'detailed', 'pulls', 'lively', 'warm', 'gem', 'mesmerizing', 'realistic', 'refreshing', 'refreshingly', 'haunting', 'riveting', 'intimate', 'inventive', 'wonderful', 'engrossing']


See how many of the words are in the respective parts of the dictionary

In [13]:
dict_positive, dict_negative = (dict(filter(lambda i: i[1]==v, sentimentDictionary.items())) for v in [1, -1])

In [14]:
sum(w in dict_positive.keys() for w in useful['POSITIVE'])  # how many of the positive useful words appear in the dictionary

23

In [15]:
sum(w in dict_negative.keys() for w in useful['NEGATIVE'])  # how many of the negative useful words appear in the dictionary

28

In [16]:
sum(w in sentimentDictionary.keys() for w in useful['POSITIVE'])  # search the entire dictionary

29

In [17]:
sum(w in sentimentDictionary.keys() for w in useful['NEGATIVE'])

28

## Rule-based approach - new implementation

In [18]:
rbsa = RuleBasedSentimentAnalyser(sentimentDictionary, print_errors=PRINT_ERRORS)  # initialise the analyser

### Films dataset
Run the analysis for the entire films set (avoid result variation due to random split of the set)

In [19]:
rbsa.evaluate(sentencesFilms,  "Films (All Data, New Rule-Based)\t")

Films (All Data, New Rule-Based)	 Accuracy (All)=0.63 (6765/10663)

Films (All Data, New Rule-Based)	 Precision (Pos)=0.60 (4216/6998)
Films (All Data, New Rule-Based)	 Recall (Pos)=0.79 (4216/5332)
Films (All Data, New Rule-Based)	 F-measure (Pos)=0.68
Films (All Data, New Rule-Based)	 Precision (Neg)=0.70 (2549/3665)
Films (All Data, New Rule-Based)	 Recall (Neg)=0.48 (2549/5331)
Films (All Data, New Rule-Based)	 F-measure (Neg)=0.57


Compare to the original approach on the entire dataset

In [20]:
rule_based.testDictionary(sentencesTrain,  "Films (All Data, Rule-Based)\t", sentimentDictionary, 0, PRINT_ERRORS)

Films (All Data, Rule-Based)	 Accuracy (All)=0.62 (5980/9605)

Films (All Data, Rule-Based)	 Precision (Pos)=0.59 (3855/6547)
Films (All Data, Rule-Based)	 Recall (Pos)=0.81 (3855/4788)
Films (All Data, Rule-Based)	 F-measure (Pos)=0.68
Films (All Data, Rule-Based)	 Precision (Neg)=0.69 (2125/3058)
Films (All Data, Rule-Based)	 Recall (Neg)=0.44 (2125/4817)
Films (All Data, Rule-Based)	 F-measure (Neg)=0.54


### Nokia dataset

In [21]:
rbsa.evaluate(sentencesNokia, "Nokia   (All Data, New Rule-Based)\t")


Nokia   (All Data, New Rule-Based)	 Accuracy (All)=0.82 (217/266)

Nokia   (All Data, New Rule-Based)	 Precision (Pos)=0.82 (175/213)
Nokia   (All Data, New Rule-Based)	 Recall (Pos)=0.94 (175/186)
Nokia   (All Data, New Rule-Based)	 F-measure (Pos)=0.88
Nokia   (All Data, New Rule-Based)	 Precision (Neg)=0.80 (42/53)
Nokia   (All Data, New Rule-Based)	 Recall (Neg)=0.53 (42/80)
Nokia   (All Data, New Rule-Based)	 F-measure (Neg)=0.64


Compare to the original approach again

In [22]:
rule_based.testDictionary(sentencesNokia, "Nokia   (All Data, Rule-Based)\t",  sentimentDictionary, 0, PRINT_ERRORS)

Nokia   (All Data, Rule-Based)	 Accuracy (All)=0.80 (213/266)

Nokia   (All Data, Rule-Based)	 Precision (Pos)=0.80 (178/223)
Nokia   (All Data, Rule-Based)	 Recall (Pos)=0.96 (178/186)
Nokia   (All Data, Rule-Based)	 F-measure (Pos)=0.87
Nokia   (All Data, Rule-Based)	 Precision (Neg)=0.82 (35/43)
Nokia   (All Data, Rule-Based)	 Recall (Neg)=0.44 (35/80)
Nokia   (All Data, Rule-Based)	 F-measure (Neg)=0.58
