In [268]:
from nltk import ngrams
from nltk.corpus import stopwords 
import string
import pandas

In [269]:
stopwords_english = stopwords.words('english')

In [270]:
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
#     print(type(words))
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)    
    return words_clean 

In [271]:
# feature extractor function for unigram
def bag_of_words(words):    
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [272]:
# feature extractor function for ngrams (bigram)
def bag_of_ngrams(words, n=2):
    words_ng = []
    for item in iter(ngrams(words, n)):
        words_ng.append(item)
    words_dictionary = dict([word, True] for word in words_ng)    
    return words_dictionary

In [273]:
from nltk.tokenize import word_tokenize
text = "It was a very good movie."
words = word_tokenize(text.lower())

In [274]:
words

['it', 'was', 'a', 'very', 'good', 'movie', '.']

In [275]:
bag_of_ngrams(words)

{('it', 'was'): True,
 ('was', 'a'): True,
 ('a', 'very'): True,
 ('very', 'good'): True,
 ('good', 'movie'): True,
 ('movie', '.'): True}

In [276]:
words_clean = clean_words(words, stopwords_english)
print (words_clean)

['good', 'movie']


In [277]:
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

In [278]:
stopwords_english_for_bigrams = set(stopwords_english) - set(important_words)
 
words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)
print (words_clean_for_bigrams)

['very', 'good', 'movie']


In [279]:
unigram_features = bag_of_words(words_clean)
unigram_features

{'good': True, 'movie': True}

In [280]:
bigram_features = bag_of_ngrams(words_clean_for_bigrams)
bigram_features

{('very', 'good'): True, ('good', 'movie'): True}

In [281]:
# combine both unigram and bigram features
all_features = unigram_features.copy()
all_features.update(bigram_features)
all_features

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [282]:
# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)
 
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_ngrams(words_clean_for_bigrams)
 
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
 
    return all_features

In [283]:
bag_of_all_words(words)

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [284]:
from nltk.corpus import movie_reviews 
 
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)
 
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)

In [285]:
pos_reviews[0]

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [286]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
 
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))

In [288]:
tweet_data = pandas.read_csv('clean_tweet2.csv')

In [340]:
pos_reviews = tweet_data[tweet_data.sentiment == 1].text.str.lower().str.split()
pos_reviews = [value for (index, value) in pos_reviews.items()]

omg
handed
hmmmm
thanks
feeling
you
goodbye
uploading
so
do
health
go
bathroom
boom
go
going
always
bend
get
hate
really
jin
just
just
oh
pleased
rose
thanks
that
there
trae
true
wide
yeah
poemsunder
that
goooood
know
the
the
hummin
sigh
aiqht
triiiiiii
hot
beach
what
canaveral
progressing
taylorrhicks
thestreetforce
makes
my
video
am
call
just
and
see
facebook
you
lol
is
tell
told
the
yes
are
hope
awesome
am
pick
think
watch
thanks
followers
ll
yall
congrats


TypeError: 'float' object is not subscriptable

In [325]:
neg_reviews = list(tweet_data[tweet_data.sentiment == 0].text.str.lower().str.split())
# len(neg_reviews)+len(pos_reviews)

In [337]:
# positive reviews feature set
# import numpy as np
# n_features = np.arange(1000,100001,10000)

pos_reviews_set = []
for (index, value) in pos_reviews.items():
    pos_reviews_set.append((bag_of_all_words(value), 'pos'))
 
# # negative reviews feature set
# neg_reviews_set = []
# for words in neg_reviews[:(len(pos_reviews_set)/2)]:
#     neg_reviews_set.append((bag_of_all_words(words), 'neg'))

TypeError: 'float' object is not iterable

In [294]:
pos_reviews_set

[({'omg': True, 'already': True, ('omg', 'already'): True}, 'pos'),
 ({'handed': True,
   'uniform': True,
   'today': True,
   'miss': True,
   'already': True,
   ('handed', 'uniform'): True,
   ('uniform', 'today'): True,
   ('today', 'miss'): True,
   ('miss', 'already'): True},
  'pos'),
 ({'hmmmm': True,
   'wonder': True,
   'number': True,
   ('hmmmm', 'wonder'): True,
   ('wonder', 'number'): True},
  'pos'),
 ({'thanks': True,
   'haters': True,
   'face': True,
   'day': True,
   ('thanks', 'haters'): True,
   ('haters', 'face'): True,
   ('face', 'day'): True},
  'pos'),
 ({'feeling': True,
   'strangely': True,
   'fine': True,
   'gonna': True,
   'go': True,
   'listen': True,
   'semisonic': True,
   'celebrate': True,
   ('feeling', 'strangely'): True,
   ('strangely', 'fine'): True,
   ('fine', 'gonna'): True,
   ('gonna', 'go'): True,
   ('go', 'listen'): True,
   ('listen', 'semisonic'): True,
   ('semisonic', 'celebrate'): True},
  'pos'),
 ({'one': True,
   'see':