In [4]:
import json
import pandas
import nltk.classify.util
from nltk.corpus import twitter_samples
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import TweetTokenizer

## TRAIN CLASSIFIER

In [8]:
def word_feats(words):
    '''
    Build a bag-of-words featstruct for an NLTK classifier.

    Adapted from:
    http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/

    NLTK classifiers accept plain dictionaries that map a feature name to a
    feature value. Here every token becomes a feature name whose value is True,
    so repeated tokens collapse into a single entry.

    words: iterable of hashable tokens (typically strings).
    returns: dict mapping each distinct token to True.
    '''
    return {token: True for token in words}

In [9]:
# pull out tokenized text from the classified tweets provided by NLTK
# more info: http://www.nltk.org/howto/twitter.html#Using-a-Tweet-Corpus
# each result is iterated below as one sequence of token strings per tweet;
# the file name selects the negative or positive labelled sample
tokenized_negative = twitter_samples.tokenized('negative_tweets.json')
tokenized_positive = twitter_samples.tokenized('positive_tweets.json')

In [10]:
# normalize text: lowercase every token of every tweet (emoticons included)
negatives_normalized = [list(map(str.lower, tweet)) for tweet in tokenized_negative]
positives_normalized = [list(map(str.lower, tweet)) for tweet in tokenized_positive]

In [11]:
# run each normalized tweet through word_feats() and pair the resulting
# featstruct with its class label, as (featstruct, label) tuples for NLTK
negatives = [(word_feats(tokens), 'neg') for tokens in negatives_normalized]
positives = [(word_feats(tokens), 'pos') for tokens in positives_normalized]

In [12]:
# split each class 75% train / 25% test: the first three quarters of each
# labelled list go to training, the remainder is held out for testing
neg_split, pos_split = (int(len(labelled) * 0.75) for labelled in (negatives, positives))
train_feats = [*negatives[:neg_split], *positives[:pos_split]]
test_feats = [*negatives[neg_split:], *positives[pos_split:]]
print('train on {} instances, test on {} instances'.format(len(train_feats), len(test_feats)))

train on 7500 instances, test on 2500 instances


In [13]:
# fit a Naive Bayes classifier on the training featstructs, then report the
# fraction of held-out test instances it labels correctly
# NOTE(review): the most informative features listed below are the ':(' and ':)'
# emoticons themselves, so this accuracy may mostly reflect emoticon presence
# rather than wording - confirm before relying on it for emoticon-free text
classifier = NaiveBayesClassifier.train(train_feats)
print('accuracy: {}'.format(nltk.classify.util.accuracy(classifier, test_feats)))

accuracy: 0.9936


In [15]:
# print up to 40 features with their label likelihood ratios
# (e.g. "neg : pos = 2214.3 : 1.0")
classifier.show_most_informative_features(40)

Most Informative Features
                      :( = True              neg : pos    =   2214.3 : 1.0
                      :) = True              pos : neg    =   1073.8 : 1.0
                    glad = True              pos : neg    =     25.7 : 1.0
                     x15 = True              neg : pos    =     23.7 : 1.0
                 arrived = True              pos : neg    =     21.8 : 1.0
                     sad = True              neg : pos    =     21.2 : 1.0
                    sick = True              neg : pos    =     19.7 : 1.0
               community = True              pos : neg    =     15.7 : 1.0
                   loves = True              pos : neg    =     14.1 : 1.0
                     ugh = True              neg : pos    =     13.7 : 1.0
                    miss = True              neg : pos    =     13.3 : 1.0
                      aw = True              neg : pos    =     13.0 : 1.0
              definitely = True              pos : neg    =     13.0 : 1.0

## CLASSIFY DATASET

In [16]:
# Load the deleted-tweets export. Rows with the wrong number of fields are
# skipped with a warning instead of aborting the whole load.
# on_bad_lines='warn' replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0; it keeps the same skip-and-warn behavior.
deletweet = pandas.read_csv('../data/deleted_tweets.csv', on_bad_lines='warn')

b'Skipping line 1157: expected 11 fields, saw 141\nSkipping line 2263: expected 11 fields, saw 77\nSkipping line 2319: expected 11 fields, saw 92\nSkipping line 4631: expected 11 fields, saw 129\nSkipping line 8260: expected 11 fields, saw 89\nSkipping line 8823: expected 11 fields, saw 84\nSkipping line 8824: expected 11 fields, saw 129\nSkipping line 10197: expected 11 fields, saw 131\nSkipping line 10278: expected 11 fields, saw 123\nSkipping line 10297: expected 11 fields, saw 123\nSkipping line 10311: expected 11 fields, saw 123\nSkipping line 10401: expected 11 fields, saw 79\nSkipping line 10430: expected 11 fields, saw 154\nSkipping line 10495: expected 11 fields, saw 92\nSkipping line 12989: expected 11 fields, saw 77\nSkipping line 14473: expected 11 fields, saw 73\nSkipping line 16741: expected 11 fields, saw 79\nSkipping line 22015: expected 11 fields, saw 81\nSkipping line 22322: expected 11 fields, saw 123\nSkipping line 22957: expected 11 fields, saw 74\nSkipping line 24

In [6]:
# construct a list of strings to hold the tweet text
tweet_text_raw = []

# Walk the column values directly. The previous deletweet['tweet'][i] lookup is
# label-based, so it only matched row positions while the index was 0..n-1.
for raw_tweet in deletweet['tweet']:
    # cells that are not strings carry no JSON payload to decode
    # (presumably NaN for empty cells - verify against the CSV)
    if not isinstance(raw_tweet, str):
        continue
    tweet = json.loads(raw_tweet)
    # only decoded JSON objects are read for their 'text' field
    if isinstance(tweet, dict):
        tweet_text_raw.append(tweet['text'])

In [7]:
# number of individual tweets to classify
# (rows whose 'tweet' cell decoded to a JSON object)
len(tweet_text_raw)

67756

In [28]:
# use the tweet tokenizer (casual.py) provided by NLTK
# preserve_case=False lowercases every token EXCEPT emoticons (':D' stays ':D')
tknzr = TweetTokenizer(preserve_case=False)

# The training tweets were lowercased with .lower() on every token, emoticons
# included (':D' -> ':d'). Apply the same lowercasing here so emoticon tokens
# match the feature names the classifier was trained on.
tokenized = [[token.lower() for token in tknzr.tokenize(text)] for text in tweet_text_raw]

In [29]:
# turn each token list into a featstruct via word_feats() for the NLTK classifier
features = list(map(word_feats, tokenized))

In [40]:
# predict a label for every featstruct; the results are zipped positionally
# with tweet_text_raw in the next cell
classed = classifier.classify_many(features)

In [41]:
# pair each predicted label with the raw text of the tweet it belongs to
classified_tweets = [(label, text) for label, text in zip(classed, tweet_text_raw)]

In [42]:
# preview the first 30 (label, text) pairs; slicing instead of indexing with
# range(30) avoids an IndexError when fewer than 30 tweets were classified
for label, text in classified_tweets[:30]:
    print('{} : {}'.format(label, text))

neg : This is so cool. This same sort of adaptive protocol is being used with shipping drones as well. https://t.co/pLuU2ljAmg
pos : https://t.co/V7Rc07GrJU
pos : #TBT @MikePenceVP https://t.co/tSZUjMjaaI
pos : I had a cordial and candidate discussion today with the new DHS Secretary, John Kelly. https://t.co/4neFHS3Mji
pos : Grt to host @USProgressives Specl Order w/@RepRaskin on #MuslimBan.Thx @RepMarkTakano @RepLawrence @RepGaramendi @RepCicilline @RepBarragan
pos : I'm an original co-sponsor of @RepDonBeyer’s Freedom of Religon Act, protecting our values in response to @POTUS’ #MuslimBan. #FORAct
neg : ⚡️ “Rieckhoff vs. Tester”

https://t.co/JBr4v7j5PM
pos : @IAVA CEO @PaulRieckhoff &amp; I are going #Head2Head to determine who dons the better ‘do. Post #Jon or #Paul to cast… https://t.co/GemjAHTZf7
pos : @IAVA CEO @PaulRieckhoff &amp; I are going #Head2Head to determine who dons the better ‘do. Post #Jon or #Paul to cast your vote below!
pos : @IAVA CEO @PaulReickhoff &amp; I are 