In [9]:
import re

#start process_tweet
def processTweet(tweet):
    """Normalise a raw tweet into a lowercase, whitespace-collapsed string.

    The tweet is lowercased, links are replaced by the token ``URL``,
    ``@mentions`` by the token ``AT_USER``, runs of whitespace are collapsed
    to a single space, the leading ``#`` is dropped from hashtags, and
    surrounding whitespace and quote characters are trimmed.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalised tweet text.
    """
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim: whitespace first, otherwise a leading/trailing space shields the
    #surrounding quotes from being removed
    tweet = tweet.strip().strip('\'"')
    return tweet
#end

In [10]:
#initialize stopWords
stopWords = []

#start replaceTwoOrMore
def replaceTwoOrMore(s):
    """Squeeze every run of a repeated character down to exactly two copies.

    A run is two or more consecutive occurrences of the same character
    (newlines included, because of re.DOTALL); e.g. 'hellooooo' -> 'helloo'.
    """
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)
#end

#start getStopWordList
def getStopWordList(stopWordListFileName):
    """Read a stop-word file (one word per line) and return the words as a list.

    The placeholder tokens 'AT_USER' and 'URL' are always placed at the front
    of the list. Surrounding whitespace is stripped from every line and blank
    lines are skipped, so the result never contains empty strings.

    Args:
        stopWordListFileName: Path of the stop-word text file.

    Returns:
        List of stop words, starting with 'AT_USER' and 'URL'.

    Raises:
        OSError: If the file cannot be opened.
    """
    stopWords = ['AT_USER', 'URL']

    # The context manager closes the file even if reading fails part-way.
    with open(stopWordListFileName, 'r') as fp:
        for line in fp:
            word = line.strip()
            if word:
                stopWords.append(word)
    return stopWords
#end

#start getfeatureVector
def getFeatureVector(tweet,stopWords):
    """Turn a processed tweet into a list of lowercase feature words.

    Each whitespace-separated token has repeated characters squeezed via
    replaceTwoOrMore and the characters ' " ? , . stripped from both ends.
    A token is kept only if it is not in stopWords and consists of a letter
    followed by letters/digits only.
    """
    word_shape = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
    featureVector = []
    for token in tweet.split():
        cleaned = replaceTwoOrMore(token).strip('\'"?,.')
        # keep only non-stop-words that start with a letter and are alphanumeric
        if cleaned not in stopWords and word_shape.search(cleaned) is not None:
            featureVector.append(cleaned.lower())
    return featureVector
#end

In [32]:
import csv
#Read the tweets one by one and process them
stopWords = getStopWordList('/home/hadoop/Desktop/stopwords.txt')
featureList = []
tweets = []
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly; the with-block closes the file afterwards.
with open('/home/hadoop/Desktop/sampleTweets.csv', 'r', newline='') as trainingFile:
    inpTweets = csv.reader(trainingFile, delimiter=',', quotechar='"')
    for row in inpTweets:
        # Skip blank or malformed rows that lack the sentiment/text columns
        if len(row) < 2:
            continue
        sentiment = row[0]
        tweet = row[1]
        processedTweet = processTweet(tweet)
        featureVector = getFeatureVector(processedTweet, stopWords)
        # featureList accumulates every feature word seen (duplicates included)
        featureList.extend(featureVector)
        tweets.append((featureVector, sentiment))

In [33]:
def extract_features(tweet):
    """Build the 'contains(word)' -> bool feature dict for one tweet.

    tweet is an iterable of feature words; every word of the module-level
    featureList becomes one key whose value says whether the tweet has it.
    """
    present = set(tweet)
    return {'contains(%s)' % word: (word in present) for word in featureList}

In [34]:
# Sanity check: show how many tweets were loaded and a small sample only,
# instead of dumping every (featureVector, sentiment) pair into the output.
print(len(tweets), tweets[:5])



In [35]:
import re
import nltk
from nltk.classify import *

class ClassifierHelper:
    """Helpers for turning tweets into classifier-ready feature structures.

    The feature vocabulary (self.wordFeatures) is read from a text file with
    one feature word per line.
    """

    # Any character (newlines included) followed by one or more copies of itself.
    _REPEATED_CHAR = re.compile(r"(.)\1{1,}", re.DOTALL)

    # Numeric label codes used by getSVMFeatureVectorAndLabels.
    _LABEL_CODES = {'positive': 0, 'negative': 1, 'neutral': 2}

    #start __init__
    def __init__(self, featureListFile):
        """Load the feature vocabulary from featureListFile.

        Every line of the file, stripped of surrounding whitespace, becomes
        one entry of self.wordFeatures.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.wordFeatures = []
        # Read feature list; the with-block guarantees the file is closed
        with open(featureListFile, 'r') as inpfile:
            for line in inpfile:
                self.wordFeatures.append(line.strip())
    #end

    #start extract_features
    def extract_features(self, document):
        """Return a 'contains(word)' -> bool dict for one document.

        Args:
            document: Iterable of words making up the document.

        Returns:
            Dict with one key per vocabulary word. Each vocabulary word is
            normalised (repeated characters squeezed, the characters
            ' " ? , . stripped from both ends) before it is looked up.
        """
        document_words = set(document)
        features = {}
        for word in self.wordFeatures:
            word = self.replaceTwoOrMore(word).strip('\'"?,.')
            features['contains(%s)' % word] = (word in document_words)
        return features
    #end

    #start replaceTwoOrMore
    def replaceTwoOrMore(self, s):
        """Squeeze every run of two or more identical characters to exactly two."""
        return self._REPEATED_CHAR.sub(r"\1\1", s)
    #end

    def getSVMFeatureVectorAndLabels(self, tweets):
        """Build binary feature vectors and integer labels for labelled tweets.

        Args:
            tweets: Iterable of items where item[0] is the iterable of tweet
                words and item[1] is the opinion string.

        Returns:
            Dict with 'feature_vector' (one list of 0/1 ints per tweet,
            ordered by the sorted unique vocabulary) and 'labels' (0 for
            'positive', 1 for 'negative', 2 for 'neutral'; any other opinion
            string also yields 0).
        """
        sortedFeatures = sorted(self.wordFeatures)
        feature_vector = []
        labels = []
        for t in tweets:
            tweet_words = t[0]
            tweet_opinion = t[1]
            # Fresh presence table per tweet, keyed in sorted feature order
            presence = dict.fromkeys(sortedFeatures, 0)
            for word in tweet_words:
                word = self.replaceTwoOrMore(word).strip('\'"?,.')
                if word in presence:
                    presence[word] = 1
            # Store a real list: a dict view cannot be indexed or compared to a list
            feature_vector.append(list(presence.values()))
            labels.append(self._LABEL_CODES.get(tweet_opinion, 0))
        return {'feature_vector' : feature_vector, 'labels': labels}
    #end

    #start getSVMFeatureVector
    def getSVMFeatureVector(self, tweets):
        """Build binary feature vectors for unlabelled tweets.

        Args:
            tweets: Iterable of tweets, each an iterable of words. Words are
                matched against the vocabulary as given (no normalisation).

        Returns:
            List with one list of 0/1 ints per tweet, ordered by the sorted
            unique vocabulary.
        """
        sortedFeatures = sorted(self.wordFeatures)
        feature_vector = []
        for t in tweets:
            presence = dict.fromkeys(sortedFeatures, 0)
            for word in t:
                if word in presence:
                    presence[word] = 1
            # Store a real list: a dict view cannot be indexed or compared to a list
            feature_vector.append(list(presence.values()))
        return feature_vector
    #end

    #start process_tweet
    def process_tweet(self, tweet):
        """Normalise a raw tweet.

        Lowercases the text, replaces links with 'URL' and @mentions with
        'AT_USER', collapses whitespace, drops the '#' of hashtags and trims
        surrounding whitespace and quote characters.
        """
        #Convert to lower case
        tweet = tweet.lower()
        #Convert www.* or https?://* to URL
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
        #Convert @username to AT_USER
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
        #Remove additional white spaces
        tweet = re.sub(r'[\s]+', ' ', tweet)
        #Replace #word with word
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
        #trim
        tweet = tweet.strip()
        #remove leading/trailing " or ' characters
        tweet = tweet.rstrip('\'"')
        tweet = tweet.lstrip('\'"')
        return tweet
    #end

    #start is_ascii
    def is_ascii(self, word):
        """Return True when every character of word has a code point below 128."""
        return all(ord(c) < 128 for c in word)
    #end
#end class

In [36]:
# Remove duplicate feature words. Sorting gives the vocabulary a stable order;
# the iteration order of a plain set of strings can differ between kernel runs.
featureList = sorted(set(featureList))

# Extract feature vector for all tweets in one shot
training_set = nltk.classify.util.apply_features(extract_features, tweets)

In [38]:
# Fit a Naive Bayes model on the (feature dict, sentiment) training pairs
NBClassifier = nltk.classify.NaiveBayesClassifier.train(training_set)

# Test the classifier


In [50]:
# Classify a single tweet typed in by the user
testTweet = input()
processedTestTweet = processTweet(testTweet)
testFeatureVector = getFeatureVector(processedTestTweet, stopWords)
print(NBClassifier.classify(extract_features(testFeatureVector)))


  The excitement begins. Look who is coming to @BiggBoss for the launch. 16th October 9 PM - Book your seats Now ! @xxxMovie @deepikapadukone
positive


In [62]:
actual = []
tested = []
# Read all rows up front: the file is closed as soon as it is read, and
# testTweets becomes a list that can be iterated more than once (a bare
# csv.reader is exhausted after a single pass).
with open('/home/hadoop/Desktop/test2.csv', 'r', newline='') as testFile:
    testTweets = list(csv.reader(testFile, delimiter=',', quotechar='"'))
for row in testTweets:
    # Skip blank or malformed rows that lack the sentiment/text columns
    if len(row) < 2:
        continue
    actual.append(row[0])
    testtweet = row[1]
    processedTestTweet = processTweet(testtweet)
    senti = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet, stopWords)))
    tested.append(senti)

In [63]:
import numpy as np
from sklearn.metrics import accuracy_score
# Fraction of test tweets whose Naive Bayes prediction matches the gold label
accuracy_score(y_true=actual, y_pred=tested)

0.89090909090909087

In [None]:
# Train a maximum-entropy classifier with the GIS algorithm (10 iterations)
MaxEntClassifier = nltk.classify.maxent.MaxentClassifier.train(
    training_set,
    'GIS',
    trace=3,
    encoding=None,
    labels=None,
    gaussian_prior_sigma=0,
    max_iter=10,
)

In [None]:
test_ent = []
actual_ent = []
# Re-read the test file here: a csv.reader can only be consumed once, so this
# cell must not depend on an iterator that an earlier loop already drained.
with open('/home/hadoop/Desktop/test2.csv', 'r', newline='') as testFileEnt:
    for row in csv.reader(testFileEnt, delimiter=',', quotechar='"'):
        # Skip blank or malformed rows that lack the sentiment/text columns
        if len(row) < 2:
            continue
        actual_ent.append(row[0])
        testtweet = row[1]
        processedTestTweet = processTweet(testtweet)
        # getFeatureVector requires the stop-word list as its second argument
        senti_ent = MaxEntClassifier.classify(extract_features(getFeatureVector(processedTestTweet, stopWords)))
        test_ent.append(senti_ent)
# Labels and predictions come from the same pass, so their lengths always match
accuracy_score(actual_ent, test_ent)