# Building ML Model Naive Bayes
## get the data loaded

In [4]:
import nltk

In [5]:
import pandas as pd

In [6]:
positive_tweets = pd.read_feather('/content/drive/MyDrive/Colab Notebooks/Datasets/Sentiment 140/data/sentiment 140 positive.feather')

In [7]:
negative_tweets = pd.read_feather('/content/drive/MyDrive/Colab Notebooks/Datasets/Sentiment 140/data/sentiment 140 negative.feather')

## 10% to produce the output in notebook, similar percentage for whole set

In [8]:
positive_tweets = positive_tweets.sample(frac=.1)
negative_tweets = negative_tweets.sample(frac=.1)

In [9]:
positive_tweets.describe()

Unnamed: 0,sentiment
count,80000.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [10]:
negative_tweets.describe()

Unnamed: 0,sentiment
count,80000.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


## Tokenize the data

In [11]:
from nltk.tokenize import TweetTokenizer

In [12]:
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

In [13]:
tokens_positive = [tweet_tokenizer.tokenize(p) for p in positive_tweets['text']]

In [14]:
tokens_negative = [tweet_tokenizer.tokenize(p) for p in negative_tweets['text']]

In [15]:
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [16]:
tags_positive = [pos_tag(p) for p in tokens_positive]
tags_negative = [pos_tag(n) for n in tokens_negative]

In [17]:
tags_positive[0]

[('that', 'DT'),
 ('must', 'MD'),
 ('be', 'VB'),
 ('pretty', 'JJ'),
 ('cool', 'NN')]

## Lemmatize the words

In [18]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [19]:
def tag_generalizer(tag): #a function to combine all nouns, verbs, etc. to a single tag
    '''
    Take a tag and return a type.
    return 'n' for noun, 'v' for verb, and 'a' for any
    '''
    if tag.startswith('NN'):
        return 'n'
    elif tag.startswith('VB'):
        return 'v'
    else:
        return 'a'

lemmatizer = WordNetLemmatizer()
lemmatized_positive = [[lemmatizer.lemmatize(word, tag_generalizer(tag)) for (word, tag) in tags] for tags in tags_positive]
lemmatized_negative = [[lemmatizer.lemmatize(word, tag_generalizer(tag)) for (word, tag) in tags] for tags in tags_negative]
print(f"Sample positive: {positive_tweets.head(1)['text']}")
print(f"Lemmatized sample: {lemmatized_positive[10]}")
print(f"Tags: {tags_positive[10]}")

Sample positive: 449016    @kristenstewart9 that must be pretty cool 
Name: text, dtype: object
Lemmatized sample: ['work', 'work', 'work', 'work', '.', 'finally', 'off', 'tomorrow', '.', 'need', 'to', 'do', 'to', 'much', 'stuff', '.']
Tags: [('working', 'VBG'), ('working', 'VBG'), ('working', 'VBG'), ('working', 'VBG'), ('.', '.'), ('finally', 'RB'), ('off', 'IN'), ('tomorrow', 'NN'), ('.', '.'), ('need', 'NN'), ('to', 'TO'), ('do', 'VB'), ('to', 'TO'), ('much', 'VB'), ('stuff', 'NN'), ('.', '.')]


##Prune punctuation and stopwords

In [20]:
import re
from string import punctuation
from nltk.corpus import stopwords
nltk.download('stopwords')
stops = set(stopwords.words('english'))
extra_stops = (["...",".."])
stops.update(extra_stops)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
def is_noise(word):
    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(@[A-Za-z0-9_]+)' # punctuation removal
    return word in punctuation \
        or word.lower() in stops \
        or re.search(pattern, word, re.IGNORECASE) != None

In [22]:
denoised_positive = [[p.lower() for p in word_list if not is_noise(p)] for word_list in lemmatized_positive]
denoised_negative = [[p.lower() for p in word_list if not is_noise(p)] for word_list in lemmatized_negative]

In [23]:
print(f"Positive tweet: {positive_tweets.head(1)['text']}\n")
print(f"Denoised: {denoised_positive[0]}")

Positive tweet: 449016    @kristenstewart9 that must be pretty cool 
Name: text, dtype: object

Denoised: ['must', 'pretty', 'cool']


##Get final tokenized lists, add True for ML

In [24]:
def tweets_for_model(tokens_list):

    for tokens in tokens_list:
        yield dict([token, True] for token in tokens)

positive_tokens_for_model = tweets_for_model(denoised_positive)
negative_tokens_for_model = tweets_for_model(denoised_negative)

##Peep at most common words

In [25]:
from nltk import FreqDist

In [26]:
def get_all_words(tokens_list):
      for tokens in tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(denoised_positive)
all_neg_words = get_all_words(denoised_negative)

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

print(f"The 100 most common words in a set of positive tweets: {freq_dist_pos.most_common(100)}")
print(f"The 100 most common words in a set of negative tweets: {freq_dist_neg.most_common(100)}")

The 100 most common words in a set of positive tweets: [('get', 7706), ('good', 7404), ('go', 6802), ("i'm", 5545), ('love', 5521), ('day', 5424), ('like', 3986), ('lol', 3738), ('time', 3445), ('thanks', 3412), ('u', 3101), ('see', 3060), ('today', 2953), ('know', 2948), ('make', 2854), ('work', 2801), ('new', 2758), ('one', 2733), ('think', 2715), ('great', 2565), ('watch', 2524), ('back', 2441), ('night', 2364), ('well', 2320), ('look', 2231), ('come', 2213), ('haha', 2208), ('happy', 2120), ('morning', 2091), ('twitter', 2017), ('im', 2003), ('hope', 1976), ('wait', 1901), ('really', 1871), ('fun', 1862), ('say', 1835), ('oh', 1703), ('much', 1694), ('2', 1679), ('home', 1675), ('need', 1669), ('want', 1659), ('nice', 1658), ('thank', 1584), ("i'll", 1547), ('follow', 1463), ('still', 1450), ('tomorrow', 1440), ('take', 1423), ('hey', 1390), ("can't", 1389), ('awesome', 1372), ('way', 1365), ('right', 1333), ('tonight', 1325), ('would', 1323), ('yes', 1315), ('yeah', 1297), ('last'

##Create test and train sets

In [27]:
from sklearn.model_selection import train_test_split
import random

In [28]:
def add_true_for_training(tokens_list):
  for tweet_tokens in tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = add_true_for_training(denoised_positive)
negative_tokens_for_model = add_true_for_training(denoised_negative)

In [29]:
TEST_SIZE = .10
positive_dataset = [(tweet_dict, "pos") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "neg") for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
# print(dataset)
train_dataset, test_dataset = train_test_split(dataset, test_size=TEST_SIZE)

##Train the Classifier (Naive Bayes)

In [30]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [31]:
classifier = NaiveBayesClassifier.train(train_dataset)

## Evaluate Results

In [32]:


print(f"Training accuracy: {classify.accuracy(classifier, train_dataset)}")
print(f"Testing accuracy: {classify.accuracy(classifier, test_dataset)}")
print(classifier.show_most_informative_features(50))

Training accuracy: 0.8285625
Testing accuracy: 0.7509375
Most Informative Features
                     3gs = True              neg : pos    =     32.4 : 1.0
                   boooo = True              neg : pos    =     30.4 : 1.0
           disappointing = True              neg : pos    =     29.0 : 1.0
               miserable = True              neg : pos    =     26.6 : 1.0
           #followfriday = True              pos : neg    =     23.1 : 1.0
                   wreck = True              neg : pos    =     21.7 : 1.0
                  unable = True              neg : pos    =     20.4 : 1.0
                 relieve = True              pos : neg    =     20.3 : 1.0
                     sad = True              neg : pos    =     19.4 : 1.0
              withdrawal = True              neg : pos    =     17.7 : 1.0
               infection = True              neg : pos    =     16.5 : 1.0
               emergency = True              neg : pos    =     16.4 : 1.0
                 