### Imports

In [1]:
%matplotlib inline

import numpy as np
import NLPlib as nlp

import csv
import itertools

import re
import HTMLParser

import StringIO
import string

from matplotlib import pyplot as plt

### Part 1: Pre-process, tokenize and tag

CSV Format:

0. the polarity of the tweet (0 = negative emotion, 4 = positive emotion)
1. the id of the tweet (e.g., 2087)
2. the date of the tweet (e.g., Sat May 16 23:58:44 UTC 2009)
3. the query (e.g., lyx). If there is no query, then this value is NO QUERY. 
4. the user that tweeted (e.g., robotickilldozr)
5. the text of the tweet (e.g., Lyx is cool)

In [2]:
GID = 4
class_zero_data = [GID * 5500, GID * 5500 + 5] # (GID + 1) * 5500 - 1]
class_four_data = [GID * 5500 + 800000, GID * 5500 + 800000 + 5] # (GID + 1) * 5500 - 1 + 800000]

In [3]:
string_buf = StringIO.StringIO()

In [5]:
with open('datasets/training.1600000.processed.noemoticon.csv', 'rb') as train_file:
    reader = csv.reader(train_file)
    
    for row in itertools.islice(reader, *class_zero_data):
        print row[5]
    for row in itertools.islice(reader, *class_four_data):
        print row[5]

@Natalia_Bella not much to buy now Woolworth closed down 
Kill me please -.- ...Oh crap school tommorow 
@chriskeating re the labour general secretary meeting with Labour PM's aide - I posted the very same on facebook. BBC gone downhill 
Whole day of homework ahead of name 
hamlet...romeo n juliet...radio:ACTIVE live at Wembley...McFly tour DVD's too money to me 
staying at home like the good girls do 
Morning world. It's a beautiful day  Here's hoping for some pathetic fallacy
Oh, how I wish @johncmayer  would say hello to me on a tweet.  That man is a God in my eyes...and ugh, the body....okay I'm done 
Just woke up. Eating sandwiches and drinking coffee  Oh yeaa..
@JimLundy  we have made it very easy for them to catch up  http://bit.ly/5TUpg


In [6]:
tweet_buf = StringIO.StringIO

In [7]:
for line in string_buf.getvalue().split('\n'):
    print line




##### 1. All html tags and attributes (i.e., /<[^>]+>/) are removed.

In [8]:
def strip_html_tags(tweet):
    return re.sub(r'<[^>]+>', '', tweet)

In [9]:
strip_html_tags('<a href="foo.com" class="bar">I Want This <b>text!</b></a>')

'I Want This text!'

##### 2. Html character codes (i.e., &...;) are replaced with an ASCII equivalent.
- Remove the ascii encoding to support extended in unicode

In [10]:
def replace_html_codes(tweet):
    parser = HTMLParser.HTMLParser()
    tweet = filter(lambda x: x in string.printable, tweet)
    return parser.unescape(tweet).encode('ascii', 'ignore')

In [11]:
print replace_html_codes('&quot;You win &pound;100 &iexcl; &quot; &#36;')

"You win 100  " $


In [12]:
print replace_html_codes("We�re back at work")

Were back at work


##### 3. All URLs (i.e., tokens beginning with http or www) are removed.

In [13]:
def remove_urls(tweet):
    # Note that this will modify the whitespace when words are separated by
    # more than one space, but that shouldn't matter as we are tokenizing
    # the tweets anyways
    
    return ' '.join(filter(lambda x : not x.lower().startswith(('www', 'http')), tweet.split(' ')))

In [14]:
remove_urls("brad is the best www.youtube.com dancer but not the worst http://www.google.ca singer")

'brad is the best dancer but not the worst singer'

##### 4. The first character in Twitter user names (@) and hash tags (#) are removed.

In [15]:
def remove_hashtags(tweet):
    return ' '.join([ x[1:] if  x.startswith(('@', '#')) else x for x in tweet.split(' ')])

In [16]:
remove_hashtags('brad #donkey @kick face #@other @#test')

'brad donkey kick face @other #test'

##### 5. Each sentence within a tweet is on its own line.

In [50]:
def create_abbrev_set(file_path='Wordlists/abbrev.english'):
    abbrev_set = set()
    
    with open(file_path, 'rb') as abbrevs:
        for line in abbrevs:
            abbrev_set.add(line.strip())
            #print line.strip()
    
    return abbrev_set

In [18]:
abbrev_set = create_abbrev_set()

In [19]:
print 'Mr.' in abbrev_set
print 'Pa.' in abbrev_set
print 'Pr.' in abbrev_set

True
True
False


In [109]:
def split_by_sentence(tweet):
    '''
        # 1. Anything ending in .?! declared a sentence
        # 2. Sentence boundary moved after quotation mark, if any ex. He said, "I am coming."
        # 3. Period boundary is disqualified if it preceded by an element in abbrev_set
        #    <We could look for capitals after an EOS, but nobody uses capitals on twitter>
        #    <Both sides of :;- could also be thought of as sentence>
    '''
    
    tweet = re.sub(r' +', ' ', tweet).strip()
    if 0 == len(tweet): return [tweet]
    
    abbrev_set = create_abbrev_set()
    split_by_space = tweet.split(' ')

    quote_eos = lambda x: len(x) > 1 and (x[-2:] in {'."', '?"', '!"'} or x[-2:] in {".'", "?'", "!'"})
    eos = lambda x: (x[-1] in {'.', '?', '!'} and x not in abbrev_set) or (quote_eos(x) and x[:-1] not in abbrev_set)
    eos_indices = [i + 1 for i, x in enumerate(split_by_space) if eos(x)]
    
    if 0 == len(eos_indices):
        return [tweet]
    
    sents = [' '.join(x) for x in [split_by_space[i:j] for i, j in zip([0] + eos_indices[:-1], eos_indices)]]

    if eos_indices[-1] < len(split_by_space):
        sents = sents + [' '.join(split_by_space[eos_indices[-1]:])]

    return sents

In [21]:
sample_tweet = '4km technique swim set done Mr." Meeting with a creative director at 10am." Photographs to the printers. A million phone calls to make.'

In [22]:
split_by_sentence(sample_tweet)

['4km technique swim set done Mr." Meeting with a creative director at 10am."',
 'Photographs to the printers.',
 'A million phone calls to make.']

In [23]:
test = "Meech13 damn it!!!!, ¡ half. Mr. Mr.' way text! through the interview @the sound cut's out, just' as u talk abt books!"
test

"Meech13 damn it!!!!, \xc2\xa1 half. Mr. Mr.' way text! through the interview @the sound cut's out, just' as u talk abt books!"

In [24]:
test1 = split_by_sentence(test)
test1

['Meech13 damn it!!!!, \xc2\xa1 half.',
 "Mr. Mr.' way text!",
 "through the interview @the sound cut's out, just' as u talk abt books!"]

In [25]:
test2 = split_by_sentence("brad is the best")
test2

['brad is the best']

In [26]:
test3 = "SpecialEmily aw he says thank you! Yea its lush here got dress&flipflops on but i broke my sunnies gettin new ones  the beach tho"
test3

'SpecialEmily aw he says thank you! Yea its lush here got dress&flipflops on but i broke my sunnies gettin new ones  the beach tho'

In [27]:
split_by_sentence(test3)

['SpecialEmily aw he says thank you!',
 'Yea its lush here got dress&flipflops on but i broke my sunnies gettin new ones the beach tho']

In [110]:
split_by_sentence("")

['']

##### 6/7. Each token, including punctuation and clitics, is separated by spaces.
- Clitics: contracted forms of words, such as n’t
- 's on possessive (ie. Brad's) different from 's on clitics (ie. What's), but both separated
- Must also separate possessive on plurals (ie. dogs ')
- Ellipsis (i.e., ‘...’), and other kinds of multiple punctuation (e.g., ‘!!!’) are not split.
- Don't split e.g. into tokens

In [28]:
def split_tokens(sentence):
    # 1. Split on all punctuation symbols, where a given symbol is repeated once or more
    sentence_1 = re.sub(r"((["+ string.punctuation + "])\\2*)", r" \1 ", sentence).strip()
    sentence_1 = ' '.join(sentence_1.split('  '))
    
    # 2. Join clitics and contractions where ' occurs mid-word
    sentence_2 = re.sub(r"(') ([A-Za-z] )", r"\1\2", sentence_1)
    
    # 3. Join e.g.
    sentence_3 = re.sub(r" e . g . ", r" e.g. ", sentence_2)
    
    return sentence_3

In [29]:
sample_text = "... Brad's dog hasn't said that the cereal is the dogs' or anyone elses..., but, we know  e.g. better????"

In [30]:
split_tokens(sample_text)

"... Brad 's dog hasn 't said that the cereal is the dogs ' or anyone elses ... , but , we know e.g. better ????"

##### 8. Each token is tagged with its part-of-speech.

In [31]:
def tag_sentence(sentence, pos_tagger):
    '''
    Assume sentence is already separated into tokens
    '''
    split = sentence.split(' ')
    return ' '.join([x[0] + "/" + x[1] for x in zip(split, pos_tagger.tag(split))])

In [32]:
tagger = nlp.NLPlib()

unpickle the dictionary
Initialized lexHash from pickled data.


In [33]:
sentence1 = "Meet me today at the FEC in DC at 4 ."
sentence2 = "Wear a carnation so I know it 's you ."

In [34]:
# Expected:
# Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.
# Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP ’s/POS you/PRP ./.
print tag_sentence(sentence1, tagger)
print tag_sentence(sentence2, tagger)

Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.
Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./.


##### 9. Before each tweet is demarcation A=# in <> which occurs on its own line, where # is the numeric class of the tweet (0, 2, or 4).

In [35]:
def add_class(sentences, class_):
    prepend = "<A={}>".format(class_)
    return [prepend] + sentences

In [36]:
add_class([sentence1, sentence2], 4)

['<A=4>',
 'Meet me today at the FEC in DC at 4 .',
 "Wear a carnation so I know it 's you ."]

##### Putting it all together

In [37]:
test_tweet = "@Meech13 damn it!!!!, half. Mr. Mr.' way <b>text!</b> through http://www.google.ca #the      interview #@the sound cut's out, just' as&#36; u talk abt books!   "
test_class = 0

In [171]:
def preprocess(tweet, t_class, tagger):
    tweet = re.sub(r' +', ' ', tweet).strip()
    tweet = strip_html_tags(tweet)
    tweet = replace_html_codes(tweet)
    tweet = remove_urls(tweet)
    tweet = remove_hashtags(tweet)
    
    sentences = split_by_sentence(tweet)
    
    if len(tweet) > 0:
        sentences = [split_tokens(sentence) for sentence in sentences]
        sentences = [tag_sentence(sentence, tagger) for sentence in sentences]
        
    sentences = add_class(sentences, t_class)
    
    return sentences

In [39]:
tagger = nlp.NLPlib()


unpickle the dictionary
Initialized lexHash from pickled data.


In [40]:
preprocess(test_tweet, test_class, tagger)

['<A=0>',
 'Meech13/NN damn/JJ it/PRP !!!!/NN ,/, half/NN ./.',
 "Mr/NNP ./. Mr/NNP ./. '/POS way/NN text/NN !/.",
 "through/IN the/DT interview/NN @/IN the/DT sound/NN cut/VB 's/POS out/IN ,/, just/RB '/POS as/IN $/$ u/PRP talk/VB abt/NN books/NNS !/."]

In [170]:
preprocess("www.google.com", 0, tagger)

['<A=0>', '']

In [172]:
def twtt(output_file, input_file='datasets/training.1600000.processed.noemoticon.csv', GID=-1):
    
    class_slices = []
    if (GID > 0):
        class_slices.append([GID * 5500,          (GID + 1) * 5500])
        class_slices.append([GID * 5500 + 800000, (GID + 1) * 5500 + 800000])
    else:
        class_slices.append([0, None])
    
    with open(input_file, 'r') as train_file:
        reader = csv.reader(train_file)
        tagger = nlp.NLPlib()
        
        for class_slice in class_slices:
            
            for row in itertools.islice(reader, *class_slice):
                
                tweet = row[5]
                t_class = int(row[0])
                
                try:
                    sentences = preprocess(tweet, t_class, tagger)
                except Exception:
                    print "Couldn't pre-process <<<" + tweet + ">>>, skipping..."
                else:
                    for sentence in sentences:
                        output_file.write(sentence + '\n')

        return output_file

In [58]:
with open('datasets/testdata.manualSUBSET.2009.06.14.csv') as test_file:
    reader = csv.reader(test_file)
    indices = [0, None]
    for row in itertools.islice(reader, *indices):
        print row

['4', '3', 'Mon May 11 03:17:40 UTC 2009', 'kindle2', 'tpryan', '@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.']
['4', '4', 'Mon May 11 03:18:03 UTC 2009', 'kindle2', 'vcu451', 'Reading my kindle2...  Love it... Lee childs is good read.']
['4', '5', 'Mon May 11 03:18:54 UTC 2009', 'kindle2', 'chadfu', 'Ok, first assesment of the #kindle2 ...it fucking rocks!!!']
['4', '6', 'Mon May 11 03:19:04 UTC 2009', 'kindle2', 'SIX15', "@kenburbary You'll love your Kindle2. I've had mine for a few months and never looked back. The new big one is huge! No need for remorse! :)"]
['4', '7', 'Mon May 11 03:21:41 UTC 2009', 'kindle2', 'yamarama', "@mikefish  Fair enough. But i have the Kindle2 and I think it's perfect  :)"]
['4', '8', 'Mon May 11 03:22:00 UTC 2009', 'kindle2', 'GeorgeVHulme', "@richardebaker no. it is too big. I'm quite happy with the Kindle2."]
['0', '9', 'Mon May 11 03:22:30 UTC 2009', 'aig', 'Seth937', 'Fuck this econ

In [247]:
output_buf = StringIO.StringIO()
twtt(output_buf, 'datasets/testdata.manualSUBSET.2009.06.14.csv')

unpickle the dictionary
Initialized lexHash from pickled data.


<StringIO.StringIO instance at 0x1065b5b48>

In [248]:
print output_buf.getvalue()

<A=4>
stellargirl/NN I/PRP loooooooovvvvvveee/NN my/PRP$ Kindle2/NN ./.
Not/RB that/IN the/DT DX/NNP is/VBZ cool/JJ ,/, but/CC the/DT 2/NN is/VBZ fantastic/JJ in/IN its/PRP$ own/JJ right/NN ./.
<A=4>
Reading/VBG my/PRP$ kindle2/NN .../:
Love/NNP it/PRP .../:
Lee/NNP childs/NNS is/VBZ good/JJ read/VB ./.
<A=4>
Ok/NNP ,/, first/JJ assesment/NN of/IN the/DT kindle2/NN .../: it/PRP fucking/VBG rocks/NNS !!!/NN
<A=4>
kenburbary/NN You/PRP '/POS ll/NN love/NN your/PRP$ Kindle2/NN ./.
I/PRP '/POS ve/NN had/VBD mine/NN for/IN a/DT few/JJ months/NNS and/CC never/RB looked/VBD back/RB ./.
The/DT new/JJ big/JJ one/CD is/VBZ huge/JJ !/.
No/DT need/VBN for/IN remorse/NN !/.
:/: )/)
<A=4>
mikefish/NN Fair/NNP enough/RB ./.
But/CC i/NN have/VBP the/DT Kindle2/NN and/CC I/PRP think/VBP it/PRP 's/POS perfect/JJ :/: )/)
<A=4>
richardebaker/NN no/DT ./.
it/PRP is/VBZ too/RB big/JJ ./.
I/PRP 'm/VBP quite/RB happy/JJ with/IN the/DT Kindle2/NN ./.
<A=0>
Fuck/VB this/DT economy/NN ./.
I/PRP hate/VBP aig/NN a

In [251]:
split = output_buf.getvalue().split('<A=')
split[:2]

['',
 '4>\nstellargirl/NN I/PRP loooooooovvvvvveee/NN my/PRP$ Kindle2/NN ./.\nNot/RB that/IN the/DT DX/NNP is/VBZ cool/JJ ,/, but/CC the/DT 2/NN is/VBZ fantastic/JJ in/IN its/PRP$ own/JJ right/NN ./.\n']

In [252]:
len(split)

360

##### Create tweet files

In [98]:
with open("datafiles/train_group.twt", "w") as train_file:
    twtt(train_file, GID=4)

unpickle the dictionary
Initialized lexHash from pickled data.


In [95]:
with open("datafiles/test.twt", "w") as test_file:
    twtt(test_file, "datasets/testdata.manualSUBSET.2009.06.14.csv")

unpickle the dictionary
Initialized lexHash from pickled data.


In [173]:
with open("datafiles/train_all.twt", "w") as train_file:
    twtt(train_file)

unpickle the dictionary
Initialized lexHash from pickled data.


### Part 2: Feature Extraction


@relation weather

@attribute outlook {sunny, overcast, rainy}

@attribute temperature numeric

@attribute humidity numeric

@attribute windy {TRUE, FALSE}

@attribute play {yes, no}

@data
sunny,85,85,FALSE,no
sunny,80,90,TRUE,no

In [550]:
sample_parser_output = """<A=4>
Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.
Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."""

In [547]:
sample_parser_output

"<A=4>\nMeet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.\nWear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."

In [299]:
def prep_arff(output_file):
    output_file.write("@relation sentiment\n\n")
    
    feature_set = [
                "first_person_pronouns", 
                "second_person_pronouns", 
                "third_person_pronouns",
                "coordinating_conjunctions",
                "past_tense_verbs",
                "future_tense_verbs",
                "commas",
                "colons",
                "dashes",
                "parantheses",
                "ellipses",
                "common_nouns",
                "proper_nouns",
                "adverbs",
                "wh_words",
                "slang_acronyms",
                "upper_case_words",
                "sentence_length",
                "token_length",
                "number_sentences"
               ]
    
    for feature in feature_set:
        output_file.write("@attribute " + feature + " numeric\n")
        
    output_file.write("@attribute class {0, 4}\n\n")
    
    output_file.write("@data\n")

In [302]:
def compute_feature_vector(sentences, label):
    feature_string = ""
    if 0 == len(sentences):
        return feature_string
    
    if "\n" == sentences[0]:
        #print "Empty sentences"
        return "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0," + str(label) + "\n"
    
    function_set = [
                    first_person_pronouns, 
                    second_person_pronouns, 
                    third_person_pronouns,
                    coordinating_conjunctions,
                    past_tense_verbs,
                    future_tense_verbs,
                    commas,
                    colons,
                    dashes,
                    parantheses,
                    ellipses,
                    common_nouns,
                    proper_nouns,
                    adverbs,
                    wh_words,
                    slang_acronyms,
                    upper_case_words,
                    sentence_length,
                    token_length,
                    number_sentences
                   ]
    
    sentence_split = split_sentences(sentences)
    for function in function_set:
        feature_string += str(function(sentences, sentence_split)) + ','
        
    feature_string += str(label)
    
    return feature_string + "\n"

In [221]:
compute_feature_vector([""], 4)

'@data 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4'

In [304]:
def buildarff(input_file, output_file, max_per_class=50):

    prep_arff(output_file)
    
    sentence_container = []

    class_label = -1
    for line in input_file: #.getvalue().split('\n')[:-1]:

        if line.startswith('<A='):

            class_label = int(line[3])
            try:
                feature_vector = compute_feature_vector(sentence_container, class_label)
            except Exception:
                print sentence_container
            else:
                output_file.write(feature_vector)
            sentence_container = []
        else:
            sentence_container.append(line)

    feature_vector = compute_feature_vector(sentence_container, class_label)
    output_file.write(feature_vector)

In [265]:
arff_buf = StringIO.StringIO()

In [266]:
buildarff(output_buf, arff_buf)

Brad
359


In [290]:
print arff_buf.getvalue().count("@data")

359


##### Using actual files

In [305]:
with open("datafiles/test.twt", "r") as _input:
    with open("datafiles/test.arff", "w") as _output:
        buildarff(_input, _output)

In [306]:
with open("datafiles/train_group.twt", "r") as _input:
    with open("datafiles/train_group.arff", "w") as _output:
        buildarff(_input, _output)

In [307]:
with open("datafiles/train_all.twt", "r") as _input:
    with open("datafiles/train_all.arff", "w") as _output:
        buildarff(_input, _output)

##### Per-feature functions

In [181]:
test_sentences = ["Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.", "Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."]
test_sentences

['Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.',
 "Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."]

In [182]:
def split_sentences(sentences):
    tokens = [x.strip().split(' ') for x in sentences]
    return [y.split('/') for x in tokens for y in x]

In [183]:
split_sentences([""])

[['']]

In [184]:
split_sentences(test_sentences)

[['Meet', 'VB'],
 ['me', 'PRP'],
 ['today', 'NN'],
 ['at', 'IN'],
 ['the', 'DT'],
 ['FEC', 'NN'],
 ['in', 'IN'],
 ['DC', 'NN'],
 ['at', 'IN'],
 ['4', 'NN'],
 ['.', '.'],
 ['Wear', 'VB'],
 ['a', 'DT'],
 ['carnation', 'NN'],
 ['so', 'RB'],
 ['I', 'PRP'],
 ['know', 'VB'],
 ['it', 'PRP'],
 ["'s", 'POS'],
 ['you', 'PRP'],
 ['.', '.']]

In [292]:
def first_person_pronouns(sentences, token_split):
    candidate_words = ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    return [x[0].lower() in candidate_words for x in token_split].count(True)

In [186]:
first_person_pronouns(test_sentences, split_sentences(test_sentences))

1

In [187]:
def second_person_pronouns(sentences, token_split):
    candidate_words = ['you', 'your', 'yours', 'u', 'ur', 'urs']
    return [x[0].lower() in candidate_words for x in token_split].count(True)

In [188]:
second_person_pronouns(test_sentences, split_sentences(test_sentences))

1

In [189]:
def third_person_pronouns(sentences, token_split):
    candidate_words = ['he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'they', 'them', 'their', 'theirs']
    return [x[0].lower() in candidate_words for x in token_split].count(True)

In [190]:
third_person_pronouns(test_sentences, split_sentences(test_sentences))

1

In [191]:
def coordinating_conjunctions(sentences, token_split):
    candidate_words = ['CC']
    return [x[1] in candidate_words for x in token_split].count(True)

In [192]:
coordinating_conjunctions(test_sentences, split_sentences(test_sentences))

0

In [193]:
def past_tense_verbs(sentences, token_split):
    candidate_words = ['VBD']
    return [x[1] in candidate_words for x in token_split].count(True)

In [194]:
past_tense_verbs(test_sentences, split_sentences(test_sentences))

0

In [195]:
def future_tense_verbs(sentences, token_split):
    candidate_words = ["'ll", 'will', 'gonna']
    count = [x[0].lower() in candidate_words for x in token_split].count(True)
    
    # We also want to count sequences of going+to+VB
    count += [token_split[i][0].lower() == 'going' and token_split[i + 1][0].lower() == 'to' and token_split[i + 2][1] == 'VB' for i in range(len(token_split) - 2)].count(True)
    return count

In [196]:
future_tense_verbs(test_sentences, split_sentences(test_sentences))

0

In [197]:
def commas(sentences, token_split):
    candidate_words = [',']
    return [x[1] in candidate_words for x in token_split].count(True)

In [198]:
commas(test_sentences, split_sentences(test_sentences))

0

In [199]:
def colons(sentences, token_split):
    candidate_words = [':', ';']
    return [x[0] in candidate_words for x in token_split].count(True)

In [200]:
colons(test_sentences, split_sentences(test_sentences))

0

In [201]:
def dashes(sentences, token_split):
    candidate_words = ['-']
    return [x[0] in candidate_words for x in token_split].count(True)

In [202]:
def parantheses(sentences, token_split):
    candidate_words = ['(', ')']
    return [x[0] in candidate_words for x in token_split].count(True)

In [203]:
def ellipses(sentences, token_split):
    candidate_words = ['...']
    return [x[0] in candidate_words for x in token_split].count(True)

In [204]:
def common_nouns(sentences, token_split):
    candidate_words = ['NN', 'NNS']
    return [x[1] in candidate_words for x in token_split].count(True)

In [205]:
def proper_nouns(sentences, token_split):
    candidate_words = ['NNP', 'NNPS']
    return [x[1] in candidate_words for x in token_split].count(True)

In [206]:
def adverbs(sentences, token_split):
    candidate_words = ['RB', 'RBR', 'RBS']
    return [x[1] in candidate_words for x in token_split].count(True)

In [207]:
def wh_words(sentences, token_split):
    candidate_words = ['WDT', 'WP', 'WP$', 'WRB']
    return [x[1] in candidate_words for x in token_split].count(True)

In [208]:
def slang_acronyms(sentences, token_split):
    candidate_words = ['smh', 'fwb',  'lmfao', 'lmao', 'lms', 'tbh',  'rofl', 'wtf',
                       'bff', 'wyd',  'lylc',  'brb',  'atm', 'imao', 'sml',  'btw',
                       'bw',  'imho', 'fyi',   'ppl',  'sob', 'ttyl', 'imo',  'ltr',
                       'thx', 'kk',   'omg',   'ttys', 'afn', 'bbs',  'cya',  'ez',
                       'f2f', 'gtr',  'ic',    'jk',   'k',   'ly',   'ya',   'nm',  'np',
                       'plz', 'ru',   'so',    'tc',   'tmi', 'ym',   'ur',   'u',   'sol']
    return [x[0].lower() in candidate_words for x in token_split].count(True)

In [209]:
slang_acronyms(test_sentences, split_sentences(test_sentences))

1

In [210]:
def upper_case_words(sentences, token_split):
    return [x[0].isupper() and len(x[0]) > 1 for x in token_split].count(True)

In [211]:
upper_case_words(test_sentences, split_sentences(test_sentences))

2

In [212]:
def sentence_length(sentences, token_split):
    return len(token_split) / float(len(sentences))

In [213]:
sentence_length(test_sentences, split_sentences(test_sentences))

10.5

In [214]:
def token_length(sentences, token_split):
    candidate_words = ['#', '$', '.', ',', ':', '(', ')', '"', 'POS']
    token_lengths = [len(x[0]) for x in token_split if x[1] not in candidate_words]
    return sum(token_lengths) / float(len(token_lengths))

In [215]:
token_length(test_sentences, split_sentences(test_sentences))

2.888888888888889

In [216]:
def number_sentences(sentences, token_split):
    return len(sentences)

In [217]:
number_sentences(test_sentences, split_sentences(test_sentences))

2

### Classifying using WEKA

### IBM Watson NLP Classifier

{
  "credentials": {
    "url": "https://gateway.watsonplatform.net/natural-language-classifier/api",
    "username": "2bd0e6c7-5784-4967-860c-a9778754fdee",
    "password": "rFs4Solusscl"
  }
}