In [1]:
from __future__ import print_function
import pandas as pd
import nltk
import sys, os
import  re
import random
import time
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import ssl
# Work around hosts whose Python cannot verify the NLTK download server's
# certificate: when the private unverified-context factory is available,
# install it as ssl's default HTTPS context factory.
# NOTE: this disables certificate verification for HTTPS requests that rely
# on that default context, so only use it in a trusted environment.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Corpora fetched up front; the stop-word list is read by the feature
# extractor defined later in this notebook.
nltk.download('rte')
nltk.download('stopwords')

[nltk_data] Downloading package rte to /Users/bhogirala/nltk_data...
[nltk_data]   Package rte is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhogirala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
class DataUtil :
    """Turns a tweet DataFrame into lazily-evaluated NLTK feature sets."""

    def getTrainingAndTestData(self, tweets, K, k):
        """Build the train/test feature sets for fold ``k`` of a ``K``-fold split.

        Each row of ``tweets`` is read positionally via ``itertuples()``:
        field 1 (the first column) is used as the label and field 2 (the
        second column) as the text to tokenize.
        NOTE(review): confirm the second column really holds the tweet text
        for the CSV layout in use.

        Tokens shorter than 3 characters are dropped, tokens not starting with
        ``__`` are lower-cased, and every token is Porter-stemmed.  Rows whose
        position ``i`` satisfies ``i % K == k`` form the test fold; all others
        form the training fold.

        Args:
            tweets: pandas DataFrame of labelled tweets.
            K: total number of folds (must be non-zero).
            k: index of the fold held out for testing.

        Returns:
            ``(v_train, v_test)``: two ``nltk.classify.apply_features`` lazy
            sequences of ``(feature_dict, label)`` pairs.
        """
        from functools import wraps

        stemmer = nltk.stem.PorterStemmer()
        # Built once per call instead of once per tweet: loading the corpus
        # list for every feature extraction is loop-invariant work.
        stop_words = set(stopwords.words('english'))

        all_tweets = []  # DATADICT: all_tweets =   [ (words, sentiment), ... ]
        for row in tweets.itertuples():
            tokens = [word if (word[0:2] == '__') else word.lower()
                      for word in row[2].split()
                      if len(word) >= 3]
            words = [stemmer.stem(w) for w in tokens]  # DATADICT: words = [ 'word1', 'word2', ... ]
            all_tweets.append((words, row[1]))

        train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
        test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

        def get_word_features(words):
            """Return ``{'has(<word>)': 1}`` for every non-stop-word token."""
            return dict(('has(%s)' % w, 1) for w in words if w not in stop_words)

        # NOTE(review): both regexes are applied to *stemmed* tokens, so list
        # entries that the stemmer rewrites may never match -- confirm.
        negtn_regex = re.compile(r"""(?:
            ^(?:never|no|nothing|nowhere|noone|none|not|
                havent|hasnt|hadnt|cant|couldnt|shouldnt|
                wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
            )$
        )
        |
        n't
        """, re.X)

        # Positive-sentiment cue words.  The negation tokens (doesnt, arent
        # and the n't suffix) that had been copied over from negtn_regex are
        # removed so that negations no longer produce pos(...) features.
        pos_regex = re.compile(r"""(?:
                    ^(?:excellent|wow|awesome|happy|cool|good|love|
                        wonderful|amazing|amaze|bliss|enjoy|fantastic|
                        beautiful|beauty|better|fun|funny|luck|lucky|
                        nice|super
                    )$
                )
                """, re.X)

        def decayed_scope(flags):
            """Carry each True flag forward as a weight decaying by 0.1 per step."""
            weights = []
            strength = 0.0
            for flagged in flags:
                if flagged:
                    strength = 1.0
                weights.append(strength)
                strength = max(0.0, strength - 0.1)
            return weights

        def get_negation_features(words):
            """Return ``neg_l(w)`` / ``neg_r(w)`` weights for every token.

            ``neg_l`` decays from the closest negation at or before the token,
            ``neg_r`` from the closest negation at or after it.  A repeated
            token keeps the weight of its last occurrence.
            """
            negtn = [bool(negtn_regex.search(w)) for w in words]
            left = decayed_scope(negtn)
            right = decayed_scope(negtn[::-1])[::-1]
            return dict(zip(
                ['neg_l(' + w + ')' for w in words] + ['neg_r(' + w + ')' for w in words],
                left + right))

        def get_positive_features(words):
            """Return ``{'pos(<word>)': 1}`` for every token matching pos_regex."""
            return dict(('pos(' + w + ')', 1) for w in words if pos_regex.search(w))

        def counter(func):
            """Decorate ``func`` so the wrapper's ``count`` attribute tracks calls."""
            @wraps(func)
            def tmp(*args, **kwargs):
                tmp.count += 1
                return func(*args, **kwargs)

            tmp.count = 0
            return tmp

        @counter
        def extract_features(words):
            """Merge negation, positive-cue and bag-of-words features for one tweet."""
            features = {}
            features.update(get_negation_features(words))
            features.update(get_positive_features(words))
            features.update(get_word_features(words))
            # One wrapper serves both folds, so the count covers train and test.
            sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
            return features

        # Apply NLTK's Lazy Map
        print("length of train tweets "+str(len(train_tweets)))
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        print("length of test tweets " + str(len(test_tweets)))
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

In [3]:
class Classifier:
    """Cross-validates several NLTK classifiers on a labelled tweet corpus."""

    NUM_SHOW_FEATURES = 100
    SPLIT_RATIO = 0.9
    FOLDS = 5
    LIST_CLASSIFIERS = ['NaiveBayesClassifier', 'MaxentClassifier','SvmClassifier','DecisiontreeClassifier','RTEClassifier']

    # Names understood by trainAndClassify, checked before any file is opened.
    _SUPPORTED_CLASSIFIERS = ('NaiveBayesClassifier', 'MaxentClassifier',
                              'SvmClassifier', 'DecisiontreeClassifier',
                              'RTEClassifier')

    def __init__(self):
        print ("Sentiment Classifier Started")

    def get_time_stamp(self):
        """Return the current local time formatted as ``%y%m%d-%H%M%S-%Z``."""
        return time.strftime("%y%m%d-%H%M%S-%Z")

    def grid(self, alist, blist):
        """Yield every ``(a, b)`` pair, iterating ``blist`` fastest."""
        for a in alist:
            for b in blist:
                yield (a, b)

    def _make_train_function(self, classifier):
        """Return a one-argument callable that trains the named classifier.

        Raises:
            ValueError: if ``classifier`` is not a supported name.
        """
        if 'NaiveBayesClassifier' == classifier:
            return nltk.classify.NaiveBayesClassifier.train
        if 'MaxentClassifier' == classifier:
            def train_maxent(v_train):
                return nltk.classify.MaxentClassifier.train(
                    v_train, algorithm='GIS', max_iter=10)
            return train_maxent
        if 'SvmClassifier' == classifier:
            # A single wrapper instance is reused for every fold.
            return nltk.classify.scikitlearn.SklearnClassifier(SVC()).train
        if 'DecisiontreeClassifier' == classifier:
            return nltk.classify.DecisionTreeClassifier.train
        if 'RTEClassifier' == classifier:
            # NOTE(review): rte_classifier('IIS') builds a classifier first and
            # its train() is then called on the tweet features -- confirm this
            # is the intended use.
            return nltk.classify.rte_classifier('IIS').train
        raise ValueError('unknown classifier: %r' % (classifier,))

    @staticmethod
    def _show_informative_features(model, n):
        """Print the model's most informative features, if it can report them.

        Models without ``show_most_informative_features`` print
        'unimplemented' instead; no third-party class is patched to do so.
        """
        show = getattr(model, 'show_most_informative_features', None)
        if show is None:
            print ('unimplemented')
        else:
            show(n)

    def trainAndClassify(self, tweets, classifier, fileprefix):
        """Run ``FOLDS``-fold cross-validation for one classifier.

        For every fold the model is trained, its accuracy, informative
        features and confusion matrix are printed, and the accuracy is
        collected.  A summary is written to stdout and stderr at the end.

        Args:
            tweets: pandas DataFrame handed to DataUtil.getTrainingAndTestData.
            classifier: one of the names in LIST_CLASSIFIERS.
            fileprefix: when non-empty and not starting with '_', stdout is
                redirected to ``<fileprefix>_<classifier>.txt`` for the
                duration of the call (parent directory created if needed).

        Returns:
            The model trained on the last fold (None when FOLDS is 0).

        Raises:
            ValueError: if ``classifier`` is not a supported name.
        """
        INFO = str(classifier)
        # Fail fast, before a log file is created or stdout is touched.
        if classifier not in self._SUPPORTED_CLASSIFIERS:
            raise ValueError('unknown classifier: %r' % (classifier,))

        dataUtil = DataUtil()
        realstdout = sys.stdout
        logfile = None
        if (len(fileprefix) > 0 and '_' != fileprefix[0]):
            directory = os.path.dirname(fileprefix)
            # dirname is '' for a bare file name; makedirs('') would raise.
            if directory and not os.path.exists(directory):
                os.makedirs(directory)
            logfile = open(fileprefix + '_' + INFO + '.txt', 'w')
            sys.stdout = logfile

        try:
            print (INFO)
            sys.stderr.write('\n' + '#' * 80 + '\n' + INFO)
            train_function = self._make_train_function(classifier)

            accuracies = []
            classifier_tot = None

            for k in range(self.FOLDS):
                (v_train, v_test) = dataUtil.getTrainingAndTestData(tweets, self.FOLDS, k)

                sys.stderr.write('\n[training start]')
                classifier_tot = train_function(v_train)
                sys.stderr.write(' [training complete]')

                print ('######################')
                print ('1 Step Classifier : ', classifier)
                accuracy_tot = nltk.classify.accuracy(classifier_tot, v_test)
                print ('Accuracy : ', accuracy_tot)
                print ('######################')
                self._show_informative_features(classifier_tot, self.NUM_SHOW_FEATURES)
                print ('######################')

                # build confusion matrix over test set
                test_truth = [s for (t, s) in v_test]
                test_predict = [classifier_tot.classify(t) for (t, s) in v_test]

                print ('Accuracy :', accuracy_tot)
                print ('Confusion Matrix ')
                print (nltk.ConfusionMatrix(test_truth, test_predict))

                accuracies.append(accuracy_tot)

            # Guard the mean so a zero-fold configuration does not divide by zero.
            average = sum(accuracies) / len(accuracies) if accuracies else 0.0
            print ("Accuracies:", accuracies)
            print ("Average Accuracy:", average)

            sys.stderr.write('\nAccuracies :')
            for accuracy in accuracies:
                sys.stderr.write(' %0.5f' % accuracy)
            sys.stderr.write('\nAverage Accuracy: %0.5f\n' % average)
            sys.stderr.flush()

            return classifier_tot
        finally:
            # Always undo the redirect and release the log file, even when
            # training raises, so later cells keep a working stdout.
            if logfile is not None:
                sys.stdout = realstdout
                logfile.close()
            else:
                sys.stdout.flush()

    def main(self):
        """Load the tweet CSV and evaluate every classifier in LIST_CLASSIFIERS.

        Each classifier's report goes to ``logs/run_<timestamp>_<name>.txt``.
        """
        fileprefix = 'logs/run'
        tweets = pd.read_csv("./data/preprocessed_tweets_shuffled.csv",encoding='ISO-8859-1',names=["label", "id", "date", "query", "user", "tweet"])
        sys.stderr.write('\nlen( tweets ) = ' + str(len(tweets)))
        TIME_STAMP = self.get_time_stamp()
        for cname in self.LIST_CLASSIFIERS:
            self.trainAndClassify(
                tweets, classifier=cname, fileprefix=fileprefix + '_' + TIME_STAMP)

In [4]:
# Entry point of the notebook: build the experiment driver and run every
# classifier listed in Classifier.LIST_CLASSIFIERS over the tweet CSV.
# Progress is written to stderr.
classifier = Classifier()
classifier.main()

Sentiment Classifier Started


  if (yield from self.run_code(code, result)):

len( tweets ) = 1600001
################################################################################
NaiveBayesClassifier
features extracted for 16 tweets[training complete]
features extracted for 16 tweets[training complete]
features extracted for 16 tweets[training complete]
features extracted for 16 tweets[training complete]
features extracted for 16 tweets[training complete]
Accuracies : 0.50000 0.00000 1.00000 0.50000 0.50000
Average Accuracy: 0.50000

################################################################################
MaxentClassifier
features extracted for 544 tweets [training complete]
features extracted for 544 tweets [training complete]
features extracted for 544 tweets [training complete]
features extracted for 544 tweets [training complete]
features extracted for 544 tweets [training complete]
Accuracies : 0.50000 0.00000 0.50000 0.50000 0.00000
Average Accuracy: 0.30000

######################################