# Twitter Sentiment Analysis #

## Set-up ##

In [89]:
## Install and import the Natural Language Toolkit.

#pip install nltk 
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk import FreqDist
stop_words = stopwords.words('english') ## We want to use English stop words.

In [90]:
## Dpwnload the Punkt package for tokenising.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [91]:
## Download wordnet and averaged perceptron trigger for normalising/lemmatising the text. Import pos_tag to determine word context.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [92]:
## Download the 'twitter_samples' dataset and stop words.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [93]:
## Import the packages we need to remove noise.
import re, string

In [94]:
## Import random to shuffle.
import random

In [95]:
## Import classify and Naive Bayes for the model.
from nltk import classify
from nltk import NaiveBayesClassifier

In [96]:
## Import Beautiful Soup to scrape Facbook messages

import pandas as pd
from bs4 import BeautifulSoup 
import collections

## Tokenising ##

In [97]:
## Create variables for the positive tweets, negative tweets and text
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [98]:
## Tokenise the tweets
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

## Normalising and Removing Noise ##

In [99]:
## Define a function to lemmatize the tweets and remove noise. This reduces words to their root form and gets rid of links and useless words. 
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = [] # Initialise the output vecotr

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'^https?:\/\/.*[\r\n]*','', token) # Removes links.
        token = re.sub("(@[A-Za-z0-9_]+)","", token) # Removes @ mentions.

        if tag.startswith('NN'):
            pos = 'n' # Assign nouns as nouns.
        elif tag.startswith('VB'):
            pos = 'v' ## Assign verbs as verbs.
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer() # Define lemmatizer as the wordnet lemmatizer.
        token = lemmatizer.lemmatize(token, pos) # Reduce the words to their root words.

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words: # Gets rid of the empty tweets and any punctuation.
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [100]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words)) ## Clean the positive tweets.

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words)) ## Clean the negative tweets.

## Determining Word Density ##

In [101]:
## Define a function to return a list of all the words from tweets.

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list: # For every tweet in the defined list.
        for token in tokens: # AND for every word in every element of the list.
            yield token # Return the word.

In [102]:
## Use the above function to return all positive and negative tweets words.

all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

In [103]:
## Make a frequency distribution to find most common words.

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

## Prepare data for modelling ##

In [104]:
## Convert the values to a dictionary for us in naive bayes classification.

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [105]:
## Make the lists of model ready data.

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [106]:
## Split the data in to train and test data.

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model] # Label the positive data.

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model] # Label the negative data.

dataset = positive_dataset + negative_dataset # Combine the data sets.

random.shuffle(dataset) # Shuffle the data so theres no natural ordering.

train_data = dataset[:7000] # First 7000 entries for train.
test_data = dataset[7000:] # Final 3000 for testing.

## Build the test model ##

In [107]:
## Train the model!
classifier = NaiveBayesClassifier.train(train_data)

In [108]:
## Return the accuracy and the words that are most useful in determing the sentiment.
print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9933333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2035.8 : 1.0
                      :) = True           Positi : Negati =   1669.1 : 1.0
                  arrive = True           Positi : Negati =     29.1 : 1.0
                followed = True           Negati : Positi =     26.3 : 1.0
                     sad = True           Negati : Positi =     24.9 : 1.0
                follower = True           Positi : Negati =     19.1 : 1.0
               community = True           Positi : Negati =     18.1 : 1.0
                     via = True           Positi : Negati =     16.3 : 1.0
                 welcome = True           Positi : Negati =     15.4 : 1.0
                    luck = True           Positi : Negati =     14.0 : 1.0
None


In [134]:
## Test out the model with custom tweets!
custom_tweet = "Test message"
custom_tokens = remove_noise(nltk.word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative


# Using the Model on Facebook messages #

## Scrape the Facebook messages ##

In [110]:
## Initialise the file to scrape.
url = 'D:\User\*File_Location*' + str(1) + '.html'
soup = BeautifulSoup(open(url, encoding = 'utf8').read(), 'html.parser')

In [111]:
## Initialise all pages of the scrape.
for i in range(1,5):
    url = 'D:\User\*File_Location*' + str(i) + '.html' # Location to scrape from.
    page = open(url, encoding = 'utf8') # Open the file with utf8 encoding.
    soup[i] = BeautifulSoup(page.read(), 'html.parser') # Create a BS object to scrape from.

In [135]:
## Scrape the messages.
Texts = [] # Initialise the Texts vector.
j = 0
for i in range(1,5): # Iterates over every page.
    for div in soup[i].find_all('div', class_ = '_3-96 _2let'): ## Returns the message.
        Texts.append(div.text) ## Saves output to Texts
        j += 1

In [113]:
Texter = []
for i in range(1,5):
    for div in soup[i].find_all('div', class_ = '_3-96 _2pio _2lek _2lel'): ## Returns the person that sent the text
        Texter.append(div.text) ## Saves output to Texter

In [114]:
mess_date = []
for i in range(1,5):
    for div in soup[i].find_all('div', class_ = '_3-94 _2lem'): ## Returns the person that sent the text
        mess_date.append(div.text) ## Saves output to Texter

## Run the messages through the model ##

In [136]:
text_tokens = []
for msg in Texts:
    text_tokens.append(nltk.word_tokenize(msg)) # Tokenize the messages.

In [137]:
txt_cleaned_tokens_list = []

for tokens in text_tokens:
    txt_cleaned_tokens_list.append(remove_noise(tokens, stop_words)) ## Clean the messages.

In [138]:
texts_for_analysis = get_tweets_for_model(Texts)

msg_clas = []
for txts in texts_for_analysis:
    msg_clas.append(classifier.classify(dict([token, True] for token in txts))) ## Run the messages through a calssifier.

## Exploratory Analysis ##

In [139]:
FullTexts = list(zip(Texter,Texts, mess_date, msg_clas))
Mess_df = pd.DataFrame(FullTexts, columns = ['Texter','Texts', 'mess_date', 'msg_clas']) # Add the sentiment to a dataframe.

In [140]:
## Determine the number of positive and negative messages.

num_pos_msg = 0
for msg in msg_clas:
    if msg == 'Positive':
        num_pos_msg += 1

num_neg_msg = 0
for msg in msg_clas:
    if msg == 'Negative':
        num_neg_msg += 1

In [141]:
num_pos_msg/(num_pos_msg + num_neg_msg)*100 # Determine percentage of positive messages.

64.18524183235965

In [149]:
Mess_df.to_csv('Messages_Sentiment.csv') # Svae the file to a csv file.