## Appendix 10 - Binary Classifier

**Method/Code:** Chapter 8 of 'Real-World Machine Learning' by Brink, Richards & Fetherolf

**Dataset:** http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/  - 1.5m tweets labelled with sentiment 0 for negative or 1 for positive

In [1]:
# Import training dataset 
import pandas as pd
d = pd.read_excel("sentimentanalysisdataset.xlsx")

In [3]:
d

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1,0,Sentiment140,is so sad for my APL frie...,,,
1,2,0,Sentiment140,I missed the New Moon trail...,,,
2,3,1,Sentiment140,omg its already 7:30 :O,,,
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,,,
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,,,
5,6,0,Sentiment140,or i just worry too much?,,,
6,7,1,Sentiment140,Juuuuuuuuuuuuuuuuussssst Chillin!!,,,
7,8,0,Sentiment140,Sunny Again Work Tomorrow :-| ...,,,
8,9,1,Sentiment140,handed in my uniform today . i miss you ...,,,
9,10,1,Sentiment140,hmmmm.... i wonder how she my number @-),,,


In [4]:
# CLEAN THE TEXT:

import numpy as np
import re
import string

# Write regex pattern to remove all punctuation
remove = string.punctuation
remove = remove + "“”‘’"
punct_pattern = r"[{}]".format(remove)

# Write second regex pattern to remove all punctuation except '#' and '@'
remove = remove.replace("#", "")
remove = remove.replace("@", "")
special_punct_pattern = r"[{}]".format(remove)

d["clean_sentiment_text"] = ""

for i in range(len(d.index)):
    # Remove URLs, remove punctuation, identify hashtags and mentions, capitalise all for description
    if type(d.at[i,"SentimentText"]) == str:
        d.at[i,"SentimentText"] = re.sub(r"http\S+", "", d.at[i,"SentimentText"])  # Remove URLs
        d.at[i,"SentimentText"] = re.sub(special_punct_pattern, " ", d.at[i,"SentimentText"])  # Remove punctuation
        d.at[i,"SentimentText"] = d.at[i,"SentimentText"].upper()  # Make upper case
        d.at[i,"SentimentText"] = re.sub(' +',' ',d.at[i,"SentimentText"])  # Remove extraneous spaces

In [5]:
# Check cleaning
print(d.head)

<bound method NDFrame.head of           ItemID  Sentiment SentimentSource  \
0              1          0    Sentiment140   
1              2          0    Sentiment140   
2              3          1    Sentiment140   
3              4          0    Sentiment140   
4              5          0    Sentiment140   
5              6          0    Sentiment140   
6              7          1    Sentiment140   
7              8          0    Sentiment140   
8              9          1    Sentiment140   
9             10          1    Sentiment140   
10            11          0    Sentiment140   
11            12          1    Sentiment140   
12            13          0    Sentiment140   
13            14          0    Sentiment140   
14            15          0    Sentiment140   
15            16          0    Sentiment140   
16            17          0    Sentiment140   
17            18          1    Sentiment140   
18            19          0    Sentiment140   
19            20          0   

In [6]:
# Split into 70% training data and 30% testing data
split = 0.7
d_train = d[:int(split*len(d))]
d_test = d[int ((1-split)*len(d)):]

In [7]:
# Import the vectoriser. As a count vectoriser, it will use a simple
# Word count method for extracting features (generating a 'bag of words'
# From each row of the dataset)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [8]:
# Fit the dictionary and generates training set features
features = vectorizer.fit_transform(d_train.SentimentText.values.astype('U'))
# Generate features for the testing set
test_features = vectorizer.transform(d_test.SentimentText.values.astype('U'))

In [9]:
# Visualise a subset of the bag of words created. 
i=10000
j=10
words = vectorizer.get_feature_names()[i:i+10]
pd.DataFrame(features[j:j+7,i:i+10].todense(),columns=words)

Unnamed: 0,accent,accente,accentless,accents,accenttheparty,accentuate,accentuated,accentuates,accentuating,accentuations
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [10]:
# Use a Naive Bayes algorithm, specifically the multinomial, to build 
# The model from the features. Naive Baves algorithms work well for 
# Classification with sparse natural language processing features.

from sklearn.naive_bayes import MultinomialNB

model1 = MultinomialNB()
model1.fit(features, d_train.Sentiment)
pred1 = model1.predict_proba(test_features)

In [12]:
# We can test the model manually here by altering the string below. A positive sentence
# should come out as 1, a negative sentence should come out as 0

SentimentText = "TRUMP IS AWFUL"
print(model1.predict(vectorizer.transform([SentimentText]))[0])

0


In [13]:
# Test for accuracy (with testing data):
d_test["algosentiment"] = ""
d_test_sample = d_test.sample(1000)
d_test_sample = d_test_sample.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
# Run the model over the test data, putting its sentiment predicter in the algosentiment field
for i in range(len(d_test_sample)):
    ([d_test_sample.at[i, "algosentiment"]]) = model1.predict(vectorizer.transform([d_test_sample.at[i, "SentimentText"]])[0])

In [15]:
# Compare the predicted sentiment from the model with the actual sentiment originally marked in dataset

countOK = 0
countNotOK= 0

for i in range(len(d_test_sample)):
    if ([d_test_sample.at[i, "algosentiment"]]) == ([d_test_sample.at[i, "Sentiment"]]):
        countOK+=1
    else:
        countNotOK+=1
        
print(countOK, countNotOK)

833 167


We can see above that the model is 83% accurate when working on the test data from the same dataset it was trained from. Now to run it over our own dataset of SOTU tweets:

In [17]:
# Import our dataset
tweets = pd.read_excel("03sotu_with_states.xlsx", usecols = "J")

# Get a random sample of 1000 tweets from it
tweetsample = tweets.sample(1000)
tweetsample = tweetsample.reset_index()

In [18]:
# Run the model over the sample of our tweets, put the assigned sentiment in a new column
tweetsample["algosentiment"] = ""
for i in range(1,1000):
    tweetsample.at[i, "algosentiment"] = model1.predict(vectorizer.transform([tweetsample.at[i, "text"]]))[0]

In [19]:
# View results
tweetsample

Unnamed: 0,index,text,algosentiment
0,98658,TRUMP SINGLES OUT AN AMERICAN HERO AMAZING THA...,
1,64637,TRUMP WANTS TO USE HIS TO SELL OFF OUR ROADS A...,1
2,56364,IT S ADDICTION NOT EDICTION IT S SCOURGE NOT S...,0
3,85784,THAT LADY ON HER CELLPHONE LOL,1
4,54806,WE CAN T BLOW UP THE WORLD ENOUGH TIMES OVER F...,1
5,187472,MELANIA TRUMP ALWAYS LOOKS LIKE SHE S DAYDREAM...,1
6,78474,WHAT ABOUT LAS VEGAS YOU PIECE OF PURE FILTH,1
7,129729,I 100 AGREE WITH TRUMP ON PAID FAMILY LEAVE AN...,1
8,174286,ON THE VA I WILL NO STOP UNTIL OUR VETERANS AR...,0
9,188435,I M SO BORED RIGHT NOW APPLE PIE AND FLAGS REA...,0
