In [1]:
#Build Your Own Sentiment Analyzer

import pandas as pd     # To handle data
import numpy as np      # For number computing
import csv
import re

In [2]:
#In order to build a our own sentiment analyzer, first we need to equip ourselves with the right tools and methods. Machine learning is one such tool where
#people have developed various methods to classify. Classifiers may or may not need training data. In particular, we will deal with the Naive Bayes Classifier.

###Preprocess tweets
#start replaceTwoOrMore

def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL) 
    return pattern.sub(r"\1\1", s)

def processTweet2(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet   

In [3]:
#start getStopWordList
def getStopWordList(stopWordListFileName):
    #read the stopwords
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')

    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

In [4]:
"""One important step in building a classifier is deciding what features of the input are relevant, and how
to encode those features. For example, we can use the ending letter of the names as a feature and build
a classifier to identify gender with these distinctive features. Specifically, names ending in a, e and i are
likely to be female, while names ending in k, o, r, s and t are likely to be male."""

def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word stats with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector


In [6]:
stopWords = []
stopWords = getStopWordList('/home/ashish/Desktop/FDP_CIT/Python code SA/stopwords.txt')

In [7]:
#Read the twitter-sanders-apple3 dataset using pandas
import re
df = pd.read_csv("/home/ashish/Desktop/FDP_CIT/Python code SA/twitter-sanders-apple3/twitter-sanders-apple3.csv", encoding ="ISO-8859-1")
df

Unnamed: 0,class,text
0,Pos,Now all @Apple has to do is get swype on the i...
1,Pos,@Apple will be adding more carrier support to ...
2,Pos,Hilarious @youtube video - guy does a duet wit...
3,Pos,@RIM you made it too easy for me to switch to ...
4,Pos,I just realized that the reason I got into twi...
5,Pos,I'm a current @Blackberry user little bit disa...
6,Pos,The 16 strangest things Siri has said so far. ...
7,Pos,Great up close & personal event @Apple tonight...
8,Pos,From which companies do you experience the bes...
9,Pos,Just apply for a job at @Apple hope they call ...


In [9]:
# Get tweet words
tweets = []
featureList = []

for i in range(len(df)):
    sentiment = df['class'][i]
    tweet = df['text'][i]
    processedTweet = processTweet2(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
tweets

[(['swype', 'iphone', 'crack', 'iphone'], 'Pos'),
 (['adding', 'carrier', 'support', 'iphone'], 'Pos'),
 (['hilarious', 'video', 'guy', 'duet', 'siri', 'pretty', 'sums', 'love'],
  'Pos'),
 (['easy', 'switch', 'iphone'], 'Pos'),
 (['realized', 'reason', 'twitter', 'ios5', 'thanks'], 'Pos'),
 (['current', 'user', 'little', 'bit', 'disappointed', 'move'], 'Pos'),
 (['strangest', 'siri', 'am', 'soo', 'glad', 'siri', 'sense', 'via'], 'Pos'),
 (['close', 'personal', 'event', 'tonight', 'regent', 'st'], 'Pos'),
 (['companies', 'experience', 'customer', 'service', 'aside'], 'Pos'),
 (['apply', 'job', 'hope', 'call', 'lol'], 'Pos'),
 (['rt', 'lmao', 'onto', 'am', 'haha', 'siri', 'suggested', 'whores'], 'Pos'),
 (['lmao',
   'onto',
   'am',
   'haha',
   'siri',
   'suggested',
   'whores',
   'hide',
   'body',
   'lolol'],
  'Pos'),
 (['rt', 'registered', 'developer', 'hoping', 'actually', 'help', 'greatly'],
  'Pos'),
 (['wow',
   'deals',
   'refurbed',
   'ipad',
   'models',
   'apple',


In [10]:
#start extract_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

In [11]:
### Remove featureList duplicates
featureList = list(set(featureList))

In [12]:
import nltk
training_set = nltk.classify.util.apply_features(extract_features, tweets)
# Train the classifier Naive Bayes Classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [13]:
# dftest is a dataframe containing all the test tweets
dftest = pd.read_csv("/home/ashish/Desktop/FDP_CIT/Python code SA/testdata.csv", encoding ="ISO-8859-1")


In [14]:
# Use the Naive Bayes Classifier and find the sentiment of test tweets
pred= dftest['text'].apply(lambda tweet: NBClassifier.classify(extract_features(getFeatureVector(processTweet2(tweet)))))


In [15]:
def getAccuracy(testSet, predictions):
	correct = 0
	for i in range(len(testSet)):
		if testSet[i]== predictions[i]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

In [16]:
org=dftest['class']
accuracy = getAccuracy(org, pred)

In [17]:
print('Accuracy: {0}%'.format(accuracy))

Accuracy: 93.18181818181817%
