## Sentiment ID

In [23]:
#Inter-rater reliability: It is the degree of agreement among raters. It gives a score of how much homogeneity, 
#or consensus, there is in the ratings given by judges
from sklearn.metrics import cohen_kappa_score
op1 = "Audio from this phone is merely OK and this seems to be a side-effect of having basically no room for speakers"
op2 = "The S Pen is the true headline feature for the Note series and it is pretty much the only flagship around that lets you draw and take notes with a pen on the display."
op3 = "Samsung has finally ditched the headphone jack from the Note’s design, meaning you’ll have to rely on wireless headphones or a pair with a USB-C connection"
r1 = [-1,0,-1]
r2 = [1,0,-1]
r3 = [-1,1,0]
print("Inter-rater reliability bwtween rater 1 and rater 2 is:",cohen_kappa_score(r1, r2))
print("Inter-rater reliability bwtween rater 1 and rater 3 is:",cohen_kappa_score(r1, r3))
print("Inter-rater reliability bwtween rater 2 and rater 3 is:",cohen_kappa_score(r2, r3))

Inter-rater reliability bwtween rater 1 and rater 2 is: 0.5
Inter-rater reliability bwtween rater 1 and rater 3 is: 0.0
Inter-rater reliability bwtween rater 2 and rater 3 is: -0.5


In [24]:
from scipy.stats import pearsonr
r1 = [-1,0,-1]
r2 = [1,0,-1]
r3 = [-1,1,0]
print("Pearsons correlation between rater 1 and rater 2 is:",pearsonr(r1, r2)[0])
print("Pearsons correlation between rater 1 and rater 3 is:",pearsonr(r1, r3)[0])
print("Pearsons correlation between rater 2 and rater 3 is:",pearsonr(r2, r3)[0])

Pearsons correlation between rater 1 and rater 2 is: 0.0
Pearsons correlation between rater 1 and rater 3 is: 0.8660254037844386
Pearsons correlation between rater 2 and rater 3 is: -0.4999999999999999


## Sentiment Analysis

In [26]:
import re, math, collections, itertools, sys, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures, scores
from nltk.probability import FreqDist, ConditionalFreqDist

In [103]:
def evaluate_features(feature_select):
    #reading pre-labeled input and splitting into lines
    negSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-neg.txt", 'r', encoding='utf8')
    posSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-pos.txt", 'r', encoding='utf8')
    negSentences = re.split(r'\n', negSentences.read())
    posSentences = re.split(r'\n', posSentences.read())
    
    posFeatures = []
    negFeatures = []

    # breaks up the sentences into lists of individual words
    # creates instance structures for classifier
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)
    print(posFeatures)
    
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)
    #print(negFeatures)

    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    #print("posFeatures:",len(posFeatures))
    #print("negFeatures:",len(negFeatures))
    #print('posCutoff:',posCutoff)
    #print('negCutoff:',negCutoff)
    #print(trainFeatures)
    
    #Runs the classifier on the testFeatures
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    #print(testFeatures[1]) #Output: [{'directed': True, 'with': True, 'purpose': True, 'and': True, 'finesse': True, 'by': True, "england's": True, 'roger': True, 'mitchell': True, ',': True, 'who': True, 'handily': True, 'makes': True, 'the': True, 'move': True, 'from': True, 'pleasing': True, 'relatively': True, 'lightweight': True, 'commercial': True, 'fare': True, 'such': True, 'as': True, 'notting': True, 'hill': True, 'to': True, 'real': True, 'thematic': True, 'heft': True, '.': True}, 'pos']
    
    for i, (features, label) in enumerate(testFeatures): # enumerate adds number-count to each item
        #print("Labels:",label)
        #print("Features",features)
        referenceSets[label].add(i) #create two dictionary of original data called as, referenceSet['pos']={0,2,3,5.....n} and referenceSet['neg']={1,4,6....n}, the dict values are nothing but a unique identifier for each featutre(tokenized sentence) that are being assigned to that dict.
        #print('referenceSet=',referenceSets[label])
        predicted = classifier.classify(features) # classifiers' proposed polarity for tests
        #print(predicted)
        testSets[predicted].add(i)
        #print(testSets[predicted])

    #Outputs
    print('train on %s instances, test on %s instances'% (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)


def make_full_dict(words):
    return dict([(word, True) for word in words])

print('using all words as features')
evaluate_features(make_full_dict)


using all words as features


train on 7998 instances, test on 2666 instances
accuracy: 0.77344336084021
pos precision: 0.7881422924901186
pos recall: 0.7479369842460615
neg precision: 0.7601713062098501
neg recall: 0.7989497374343586
Most Informative Features
              engrossing = True              pos : neg    =     17.0 : 1.0
                   quiet = True              pos : neg    =     15.7 : 1.0
                mediocre = True              neg : pos    =     13.7 : 1.0
               absorbing = True              pos : neg    =     13.0 : 1.0
                portrait = True              pos : neg    =     12.4 : 1.0
               inventive = True              pos : neg    =     12.3 : 1.0
              refreshing = True              pos : neg    =     12.3 : 1.0
                   flaws = True              pos : neg    =     12.3 : 1.0
            refreshingly = True              pos : neg    =     11.7 : 1.0
                 triumph = True              pos : neg    =     11.7 : 1.0


## Removing stop words and Normalizing(lowercase)

In [89]:
from nltk.corpus import stopwords
def evaluate_features(feature_select):
    #reading pre-labeled input and splitting into lines
    negSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-neg.txt", 'r', encoding='utf8')
    posSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-pos.txt", 'r', encoding='utf8')
    negSentences = re.split(r'\n', negSentences.read())
    posSentences = re.split(r'\n', posSentences.read())
    posFeatures = []
    negFeatures = []
    stop_words = stopwords.words('english')
    # breaks up the sentences into lists of individual words
    # creates instance structures for classifier
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        for w in posWords: #removing Stop Words
            if w in stop_words:
                posWords.remove(w)
        posWords = [w.lower() for w in posWords]
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)   
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        for w in negWords: #removing Stop Words
            if w in stop_words:
                negWords.remove(w)
        negWords = [w.lower() for w in negWords]
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    
    #Runs the classifier on the testFeatures
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    
    for i, (features, label) in enumerate(testFeatures): # enumerate adds number-count to each item
        referenceSets[label].add(i) 
        predicted = classifier.classify(features) # classifiers' proposed polarity for tests
        testSets[predicted].add(i)

    #Outputs
    print('train on %s instances, test on %s instances'% (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)


def make_full_dict(words):
    return dict([(word, True) for word in words])

print('using all words as features')
evaluate_features(make_full_dict)


using all words as features
train on 7998 instances, test on 2666 instances
accuracy: 0.7678169542385597
pos precision: 0.7721036585365854
pos recall: 0.759939984996249
neg precision: 0.7636632200886263
neg recall: 0.7756939234808702
Most Informative Features
              engrossing = True              pos : neg    =     17.0 : 1.0
                   quiet = True              pos : neg    =     15.7 : 1.0
                mediocre = True              neg : pos    =     13.7 : 1.0
               absorbing = True              pos : neg    =     13.0 : 1.0
                portrait = True              pos : neg    =     12.4 : 1.0
               inventive = True              pos : neg    =     12.3 : 1.0
              refreshing = True              pos : neg    =     12.3 : 1.0
                   flaws = True              pos : neg    =     12.3 : 1.0
            refreshingly = True              pos : neg    =     11.7 : 1.0
                 triumph = True              pos : neg    =     1

## Changing classifier to SVM

In [94]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
def evaluate_features(feature_select):
    #reading pre-labeled input and splitting into lines
    negSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-neg.txt", 'r', encoding='utf8')
    posSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-pos.txt", 'r', encoding='utf8')
    negSentences = re.split(r'\n', negSentences.read())
    posSentences = re.split(r'\n', posSentences.read())
    
    posFeatures = []
    negFeatures = []
    stop_words = stopwords.words('english')
    
    # breaks up the sentences into lists of individual words
    # creates instance structures for classifier
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        '''
        for w in posWords:
            if w in stop_words:
                posWords.remove(w)
        posWords = [w.lower() for w in posWords]
        '''
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)
    
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        '''
        for w in negWords:
            if w in stop_words:
                negWords.remove(w)
        negWords = [w.lower() for w in negWords]
        '''
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)

    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    
    #Runs the classifier on the testFeatures
    classifier = SklearnClassifier(LinearSVC()).train(trainFeatures)

    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    
    
    for i, (features, label) in enumerate(testFeatures): # enumerate adds number-count to each item
        referenceSets[label].add(i) 
        predicted = classifier.classify(features) # classifiers' proposed polarity for tests
        testSets[predicted].add(i)

    #Outputs
    print('train on %s instances, test on %s instances'% (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    #classifier.show_most_informative_features(10)


def make_full_dict(words):
    return dict([(word, True) for word in words])

print('using all words as features')
evaluate_features(make_full_dict)


using all words as features
train on 7998 instances, test on 2666 instances
accuracy: 0.7475618904726181
pos precision: 0.7511415525114156
pos recall: 0.7404351087771943
neg precision: 0.7440828402366864
neg recall: 0.754688672168042


## Increasing the training set

In [102]:
from sklearn.linear_model import LogisticRegression
def evaluate_features(feature_select):
    #reading pre-labeled input and splitting into lines
    negSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-neg.txt", 'r', encoding='utf8')
    posSentences = open(r"D:\Study\TextAnalytics\Lecture 9\Content\XLect9.Progs\XLect10.Progs\rt-polarity-pos.txt", 'r', encoding='utf8')
    negSentences = re.split(r'\n', negSentences.read())
    posSentences = re.split(r'\n', posSentences.read())
    
    posFeatures = []
    negFeatures = []
    stop_words = stopwords.words('english')
    
    # breaks up the sentences into lists of individual words
    # creates instance structures for classifier
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        '''
        for w in posWords:
            if w in stop_words:
                posWords.remove(w)
        posWords = [w.lower() for w in posWords]
        '''
        posWords = [feature_select(posWords), 'pos']
        posFeatures.append(posWords)
    
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        '''
        for w in negWords:
            if w in stop_words:
                negWords.remove(w)
        negWords = [w.lower() for w in negWords]
        '''
        negWords = [feature_select(negWords), 'neg']
        negFeatures.append(negWords)

    posCutoff = int(math.floor(len(posFeatures)*0.90))
    negCutoff = int(math.floor(len(negFeatures)*0.90))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    
    #Runs the classifier on the testFeatures
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    
    
    for i, (features, label) in enumerate(testFeatures): # enumerate adds number-count to each item
        referenceSets[label].add(i) 
        predicted = classifier.classify(features) # classifiers' proposed polarity for tests
        testSets[predicted].add(i)

    #Outputs
    print('train on %s instances, test on %s instances'% (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)


def make_full_dict(words):
    return dict([(word, True) for word in words])

print('using all words as features')
evaluate_features(make_full_dict)


using all words as features
train on 9596 instances, test on 1068 instances
accuracy: 0.7902621722846442
pos precision: 0.7969348659003831
pos recall: 0.7790262172284644
neg precision: 0.7838827838827839
neg recall: 0.8014981273408239
Most Informative Features
              engrossing = True              pos : neg    =     20.3 : 1.0
                mediocre = True              neg : pos    =     15.7 : 1.0
                 generic = True              neg : pos    =     15.0 : 1.0
              refreshing = True              pos : neg    =     13.7 : 1.0
                 routine = True              neg : pos    =     13.7 : 1.0
                  boring = True              neg : pos    =     13.3 : 1.0
               inventive = True              pos : neg    =     13.0 : 1.0
              disturbing = True              pos : neg    =     13.0 : 1.0
            refreshingly = True              pos : neg    =     12.3 : 1.0
                    dull = True              neg : pos    =     