In [1]:
from nltk import ngrams
from nltk.corpus import stopwords 
import string
import pandas
import numpy as np

In [2]:
stopwords_english = stopwords.words('english')

In [3]:
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
#     print(type(words))
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)    
    return words_clean 

In [4]:
# feature extractor function for unigram
def bag_of_words(words):    
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [5]:
# feature extractor function for ngrams (bigram)
def bag_of_ngrams(words, n=2):
    words_ng = []
    for item in iter(ngrams(words, n)):
        words_ng.append(item)
    words_dictionary = dict([word, True] for word in words_ng)    
    return words_dictionary

In [6]:
from nltk.tokenize import word_tokenize
text = "It was a very good movie."
words = word_tokenize(text.lower())

In [7]:
words

['it', 'was', 'a', 'very', 'good', 'movie', '.']

In [8]:
bag_of_ngrams(words)

{('it', 'was'): True,
 ('was', 'a'): True,
 ('a', 'very'): True,
 ('very', 'good'): True,
 ('good', 'movie'): True,
 ('movie', '.'): True}

In [9]:
words_clean = clean_words(words, stopwords_english)
print (words_clean)

['good', 'movie']


In [10]:
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

In [11]:
stopwords_english_for_bigrams = set(stopwords_english) - set(important_words)

In [12]:
words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)
print (words_clean_for_bigrams)

['very', 'good', 'movie']


In [13]:
unigram_features = bag_of_words(words_clean)
unigram_features

{'good': True, 'movie': True}

In [14]:
bigram_features = bag_of_ngrams(words_clean_for_bigrams)
bigram_features

{('very', 'good'): True, ('good', 'movie'): True}

In [15]:
# combine both unigram and bigram features
all_features = unigram_features.copy()
all_features.update(bigram_features)
all_features

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [16]:
# let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)
 
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_ngrams(words_clean_for_bigrams)
 
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
 
    return all_features

In [17]:
bag_of_all_words(words)

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

In [18]:
# from nltk.corpus import movie_reviews 
 
# pos_reviews = []
# for fileid in movie_reviews.fileids('pos'):
# #     print(movie_reviews.words(fileid))
#     words = movie_reviews.words(fileid)
#     pos_reviews.append(words)
 
# neg_reviews = []
# for fileid in movie_reviews.fileids('neg'):
#     words = movie_reviews.words(fileid)
#     neg_reviews.append(words)

In [19]:
# pos_reviews[:5]

In [20]:
# # positive reviews feature set
# pos_reviews_set = []
# for words in pos_reviews:
# #     print(words)
#     pos_reviews_set.append((bag_of_all_words(words), 'pos'))
 
# # negative reviews feature set
# neg_reviews_set = []
# for words in neg_reviews:
#     neg_reviews_set.append((bag_of_all_words(words), 'neg'))

In [21]:
tweet_data = pandas.read_csv('clean_tweet2.csv',index_col=0)
tweet_data.head()

Unnamed: 0,text,sentiment
0,is so sad for my apl friend,0
1,missed the new moon trailer,0
2,omg its already,1
3,omgaga im sooo im gunna cry ve been at this de...,0
4,think mi bf is cheating on me,0


In [22]:
tweet_data.shape

(99988, 2)

In [23]:
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99988 entries, 0 to 99987
Data columns (total 2 columns):
text         99523 non-null object
sentiment    99988 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [24]:
tweet_data[tweet_data.isnull().any(axis=1)].head()

Unnamed: 0,text,sentiment
225,,1
228,,1
233,,0
252,,0
694,,1


In [25]:
np.sum(tweet_data.isnull().any(axis=1))

NameError: name 'np' is not defined

In [None]:
tweet_data.isnull().any(axis=0)

In [None]:
tweet_data.dropna(inplace=True)
tweet_data.reset_index(drop=True,inplace=True)
tweet_data.info()

In [None]:
tweet_data.shape

In [None]:
# import numpy as np
# pos_reviews = np.array([row for row in tweet_data[tweet_data.sentiment == 1].text.str.split()])
pos_reviews = [row for row in tweet_data[tweet_data.sentiment == 1].text.str.split()]
# [value for (index, value) in pos_reviews.items()]
pos_reviews

In [26]:
# import numpy as np
# pos_reviews = np.array([row for row in tweet_data[tweet_data.sentiment == 1].text.str.split()])
neg_reviews = [row for row in tweet_data[tweet_data.sentiment == 0].text.str.split()]
# [value for (index, value) in pos_reviews.items()]
neg_reviews

[['is', 'so', 'sad', 'for', 'my', 'apl', 'friend'],
 ['missed', 'the', 'new', 'moon', 'trailer'],
 ['omgaga',
  'im',
  'sooo',
  'im',
  'gunna',
  'cry',
  've',
  'been',
  'at',
  'this',
  'dentist',
  'since',
  'was',
  'suposed',
  'just',
  'get',
  'crown',
  'put',
  'on',
  'mins'],
 ['think', 'mi', 'bf', 'is', 'cheating', 'on', 'me'],
 ['or', 'just', 'worry', 'too', 'much'],
 ['sunny', 'again', 'work', 'tomorrow', 'tv', 'tonight'],
 ['must', 'think', 'about', 'positive'],
 ['this', 'weekend', 'has', 'sucked', 'so', 'far'],
 ['jb', 'isnt', 'showing', 'in', 'australia', 'any', 'more'],
 ['ok', 'thats', 'it', 'you', 'win'],
 ['this', 'is', 'the', 'way', 'feel', 'right', 'now'],
 ['awhhe',
  'man',
  'completely',
  'useless',
  'rt',
  'now',
  'funny',
  'all',
  'can',
  'do',
  'is',
  'twitter'],
 ['huge', 'roll', 'of', 'thunder', 'just', 'now', 'so', 'scary'],
 ['just',
  'cut',
  'my',
  'beard',
  'off',
  'it',
  'only',
  'been',
  'growing',
  'for',
  'well',
  'ov

In [27]:
pos_reviews_set = []
for words in pos_reviews:
#     print(words)
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
pos_reviews_set

NameError: name 'pos_reviews' is not defined

In [None]:
neg_reviews_set = []
for words in neg_reviews:
#     print(words)
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))
neg_reviews_set

In [28]:
print (len(pos_reviews_set), len(neg_reviews_set)) # Output: (1000, 1000)
 
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle 
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

test_set = pos_reviews_set[:int((0.2*len(pos_reviews_set)))] + neg_reviews_set[:int((0.2*len(neg_reviews_set)))]
train_set = pos_reviews_set[int((0.2*len(pos_reviews_set))):] + neg_reviews_set[int((0.2*len(neg_reviews_set))):]
 
print(len(test_set),  len(train_set))

NameError: name 'neg_reviews_set' is not defined

In [29]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)
 
print (classifier.show_most_informative_features(10)) 

NameError: name 'train_set' is not defined

In [30]:
from nltk.tokenize import word_tokenize
 
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)
print (classifier.classify(custom_review_set)) # Output: neg
# Negative review correctly classified as negative
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: neg
print (prob_result.prob("neg")) # Output: 0.770612685688
print (prob_result.prob("pos")) # Output: 0.229387314312
 
 
custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)
 
print (classifier.classify(custom_review_set)) # Output: pos
# Positive review correctly classified as positive
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: pos
print (prob_result.prob("neg")) # Output: 0.00677736186354
print (prob_result.prob("pos")) # Output: 0.993222638136

NameError: name 'classifier' is not defined

In [31]:
custom_review = 'handed in my uniform today miss you already'
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)
 
print (classifier.classify(custom_review_set)) # Output: pos
# Positive review correctly classified as positive
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: pos
print (prob_result.prob("neg")) # Output: 0.00677736186354
print (prob_result.prob("pos")) # Output: 0.993222638136

NameError: name 'classifier' is not defined

In [193]:
# pos_reviews_list = tweet_data[tweet_data['sentiment']==1]

In [194]:
# pos_list = list(pos_reviews_list['text'])

In [195]:
# # pos_reviews = []
# pos_reviews_set = []
# # for words in pos_reviews:
# #     print(words)
# #     pos_reviews_set.append((bag_of_all_words(words), 'pos'))
# for i in pos_list:
#     if type(i) is not float:
#         pos_reviews_set.append((bag_of_all_words(i.split()), 'pos'))
# #         print(i.split())
# #         pos_reviews.append(i.split())
# # pos_reviews
# pos_reviews_set

In [196]:
# [i for i in pos_reviews_set]

In [197]:
# len(pos_reviews_set)

In [198]:
# neg_reviews_list = tweet_data[tweet_data['sentiment']==0]

In [199]:
# neg_list = list(neg_reviews_list['text'])

In [200]:
# # pos_reviews = []
# neg_reviews_set = []
# # for words in pos_reviews:
# #     print(words)
# #     pos_reviews_set.append((bag_of_all_words(words), 'pos'))
# for i in neg_list:
#     if type(i) is not float:
#         neg_reviews_set.append((bag_of_all_words(i.split()), 'neg'))
# #         print(i.split())
# #         pos_reviews.append(i.split())
# # pos_reviews
# neg_reviews_set

In [201]:
# len(neg_reviews_set)

In [202]:
# print (len(pos_reviews_set), len(neg_reviews_set)) # Output: (1000, 1000)
 
# # radomize pos_reviews_set and neg_reviews_set
# # doing so will output different accuracy result everytime we run the program
# from random import shuffle 
# shuffle(pos_reviews_set)
# shuffle(neg_reviews_set)

# test_set = pos_reviews_set[:int((0.2*len(pos_reviews_set)))] + neg_reviews_set[:int((0.2*len(neg_reviews_set)))]
# train_set = pos_reviews_set[int((0.2*len(pos_reviews_set))):] + neg_reviews_set[int((0.2*len(neg_reviews_set))):]
 
# print(len(test_set),  len(train_set))

In [203]:
# neg_reviews_set[:int((0.2*len(neg_reviews_set)))]

In [204]:
# for row in train_set:
#     if row[1] == 'neg':
#         print(row)

In [205]:
# from nltk import classify
# from nltk import NaiveBayesClassifier
 
# classifier = NaiveBayesClassifier.train(train_set)
 
# accuracy = classify.accuracy(classifier, test_set)
# print(accuracy)
 
# print (classifier.show_most_informative_features(10)) 

In [206]:
# from nltk.tokenize import word_tokenize
 
# custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
# custom_review_tokens = word_tokenize(custom_review)
# custom_review_set = bag_of_all_words(custom_review_tokens)
# print (classifier.classify(custom_review_set)) # Output: neg
# # Negative review correctly classified as negative
 
# # probability result
# prob_result = classifier.prob_classify(custom_review_set)
# print (prob_result) # Output: <ProbDist with 2 samples>
# print (prob_result.max()) # Output: neg
# print (prob_result.prob("neg")) # Output: 0.770612685688
# print (prob_result.prob("pos")) # Output: 0.229387314312
 
 
# custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
# custom_review_tokens = word_tokenize(custom_review)
# custom_review_set = bag_of_all_words(custom_review_tokens)
 
# print (classifier.classify(custom_review_set)) # Output: pos
# # Positive review correctly classified as positive
 
# # probability result
# prob_result = classifier.prob_classify(custom_review_set)
# print (prob_result) # Output: <ProbDist with 2 samples>
# print (prob_result.max()) # Output: pos
# print (prob_result.prob("neg")) # Output: 0.00677736186354
# print (prob_result.prob("pos")) # Output: 0.993222638136

In [207]:
# custom_review = 'handed in my uniform today miss you already'
# custom_review_tokens = word_tokenize(custom_review)
# custom_review_set = bag_of_all_words(custom_review_tokens)
 
# print (classifier.classify(custom_review_set)) # Output: pos
# # Positive review correctly classified as positive
 
# # probability result
# prob_result = classifier.prob_classify(custom_review_set)
# print (prob_result) # Output: <ProbDist with 2 samples>
# print (prob_result.max()) # Output: pos
# print (prob_result.prob("neg")) # Output: 0.00677736186354
# print (prob_result.prob("pos")) # Output: 0.993222638136