# Module 4 Project: NLP Model to Guage Tweet Sentiment

In [44]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist, regexp_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import string
import pandas as pd

In [3]:
# Read in CSV file ignoring and dropping non-utf8 characters
csv_file = open('judge-1377884607_tweet_product_company.csv', encoding='utf-8', errors='ignore')

In [4]:
df = pd.read_csv(csv_file)

In [5]:
# Rename columns with really long names
df.columns = ['tweet_text', 'emotion_directed_at', 'emotion_directed_at_brand']

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
tweet_text                   9092 non-null object
emotion_directed_at          3291 non-null object
emotion_directed_at_brand    9093 non-null object
dtypes: object(3)
memory usage: 213.2+ KB


In [7]:
df.head()

Unnamed: 0,tweet_text,emotion_directed_at,emotion_directed_at_brand
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [8]:
df.tweet_text

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    ϡ_ʋ΋ҋ⋁__RT @mention Google Tests Check-in Offe...
Name: tweet_text, Length: 9093, dtype: object

# Corpus Statistics Section

## Preprocessing data

In [9]:
# TODO: Change Regex because tokens aren't correct
# Pattern to capture words with apostrophes as one token
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

# Casting tweet_text to string or the tokenization doesn't work
df['tweet_text'] = df['tweet_text'].astype('str')

# Tokenizing tweets and inserting into df
df['raw_tokens'] = df['tweet_text'].map(lambda x: regexp_tokenize(x, pattern))

# Lowercase all tokens
df['raw_tokens'] = df['raw_tokens'].map(lambda x: [y.lower() for y in x])

In [10]:
# Verify that the tokenization worked correctly
df[['tweet_text', 'raw_tokens']].loc[:10]

Unnamed: 0,tweet_text,raw_tokens
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,"[wesley, i, have, a, g, iphone, after, hrs, tw..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,"[jessedee, know, about, fludapp, awesome, ipad..."
2,@swonderlin Can not wait for #iPad 2 also. The...,"[swonderlin, can, not, wait, for, ipad, also, ..."
3,@sxsw I hope this year's festival isn't as cra...,"[sxsw, i, hope, this, year's, festival, isn't,..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,"[sxtxstate, great, stuff, on, fri, sxsw, maris..."
5,@teachntech00 New iPad Apps For #SpeechTherapy...,"[teachntech, new, ipad, apps, for, speechthera..."
6,,[nan]
7,"#SXSW is just starting, #CTIA is around the co...","[sxsw, is, just, starting, ctia, is, around, t..."
8,Beautifully smart and simple idea RT @madebyma...,"[beautifully, smart, and, simple, idea, rt, ma..."
9,Counting down the days to #sxsw plus strong Ca...,"[counting, down, the, days, to, sxsw, plus, st..."


In [11]:
def bag_of_words_from_series(series):
    return [token for tweet in series for token in tweet]

bag_of_words_raw = bag_of_words_from_series(df.raw_tokens)

tweets_freq_dist = FreqDist(bag_of_words_raw)

In [12]:
tweets_freq_dist.most_common(25)

[('sxsw', 9649),
 ('mention', 7124),
 ('the', 4439),
 ('link', 4313),
 ('to', 3607),
 ('at', 3107),
 ('ipad', 2997),
 ('rt', 2970),
 ('for', 2550),
 ('google', 2484),
 ('a', 2363),
 ('apple', 2247),
 ('in', 1977),
 ('of', 1715),
 ('is', 1713),
 ('quot', 1696),
 ('and', 1639),
 ('iphone', 1586),
 ('store', 1490),
 ('on', 1335),
 ('up', 1273),
 ('i', 1143),
 ('new', 1091),
 ('austin', 968),
 ('you', 950)]

Clearly we can see that there is more preprocessing to do on our dataset. At first glance I can tell that there are stopwords that need to be removed.

In [13]:
# Remove stopwords
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)

df['raw_tokens_stopped'] = df['raw_tokens'].map(lambda x: [y for y in x if y not in stopwords_list])

In [14]:
# Now look at frequency distribution of new tokens
bag_of_words_stopped = bag_of_words_from_series(df.raw_tokens_stopped)

tweets_freq_dist = FreqDist(bag_of_words_stopped)

In [15]:
freq_dist_top_25 = tweets_freq_dist.most_common(25)
freq_dist_top_25

[('sxsw', 9649),
 ('mention', 7124),
 ('link', 4313),
 ('ipad', 2997),
 ('rt', 2970),
 ('google', 2484),
 ('apple', 2247),
 ('quot', 1696),
 ('iphone', 1586),
 ('store', 1490),
 ('new', 1091),
 ('austin', 968),
 ('amp', 836),
 ('app', 825),
 ('circles', 676),
 ('social', 670),
 ('launch', 653),
 ('android', 597),
 ('pop', 596),
 ('today', 574),
 ('network', 467),
 ('via', 436),
 ('line', 411),
 ('get', 395),
 ('free', 390)]

## What is the total vocabulary size of our dataset?

In [16]:
len(bag_of_words_stopped)

112020

## What is the normalized word frequency (percentage of total words)?

In [17]:
print('Normalized Word Frequencies')
for word in freq_dist_top_25:
    normalized_frequency = word[1] / len(bag_of_words_stopped)
    print(f'{word[0]:15}{normalized_frequency:.3}')

Normalized Word Frequencies
sxsw           0.0861
mention        0.0636
link           0.0385
ipad           0.0268
rt             0.0265
google         0.0222
apple          0.0201
quot           0.0151
iphone         0.0142
store          0.0133
new            0.00974
austin         0.00864
amp            0.00746
app            0.00736
circles        0.00603
social         0.00598
launch         0.00583
android        0.00533
pop            0.00532
today          0.00512
network        0.00417
via            0.00389
line           0.00367
get            0.00353
free           0.00348


# Making Bigrams from tweets to find common word phrases in our corpus

1.7  Creating Bigrams
Knowing individual word frequencies is somewhat informative, but in practice, some of these tokens are actually parts of larger phrases that should be treated as a single unit. Let's create some bigrams, and see which combinations of words are most telling.

In the cell below:

We'll begin by aliasing a particularly long method name to make it easier to call. Store nltk.collocations.BigramAssocMeasures() inside of the variable bigram_measures
Next, we'll need to create a finder. Pass macbeth_words_stopped into BigramCollocationFinder.from_words() and assign the result to macbeth_finder
Once we have a finder, we can use it to compute bigram scores, so we can see the combinations that occur most frequently. Call the macbeth_finder object's score_ngrams() method and pass in bigram_measures.raw_freq as the input
Display first 50 elements in the macbeth_scored list to see the 50 most common bigrams in macbeth

In [18]:
# from nltk.collocations import *
tweet_bigram_finder = BigramCollocationFinder.from_words(bag_of_words_stopped)

In [19]:
tweet_bigram_scored = tweet_bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)

In [20]:
tweet_bigram_scored[:50]

[(('rt', 'mention'), 0.02578111051597929),
 (('sxsw', 'link'), 0.008471701481878237),
 (('link', 'sxsw'), 0.007757543295840028),
 (('sxsw', 'rt'), 0.00604356364934833),
 (('mention', 'mention'), 0.005757900374933048),
 (('mention', 'sxsw'), 0.0054454561685413315),
 (('apple', 'store'), 0.005311551508659168),
 (('sxsw', 'mention'), 0.004686663095875737),
 (('link', 'rt'), 0.004570612390644528),
 (('mention', 'google'), 0.004222460274950901),
 (('social', 'network'), 0.004061774683092305),
 (('new', 'social'), 0.0037225495447241563),
 (('via', 'mention'), 0.00315122299589359),
 (('mention', 'rt'), 0.003070880199964292),
 (('store', 'sxsw'), 0.003026245313336904),
 (('pop', 'store'), 0.0029905374040349935),
 (('google', 'launch'), 0.0029280485627566507),
 (('ipad', 'sxsw'), 0.002919121585431173),
 (('network', 'called'), 0.002910194608105695),
 (('sxsw', 'apple'), 0.002910194608105695),
 (('called', 'circles'), 0.0028477057668273523),
 (('austin', 'sxsw'), 0.002829851812176397),
 (('sxsw'

# Using Mutual Information Scores

In [21]:
tweet_pmi_finder = BigramCollocationFinder.from_words(bag_of_words_stopped)
tweet_pmi_finder.apply_freq_filter(5)
tweet_pmi_scored = tweet_pmi_finder.score_ngrams(BigramAssocMeasures().pmi)

In [22]:
tweet_pmi_scored

[(('jc', 'penney'), 14.188434307113564),
 (('knitted', 'staircase'), 14.188434307113564),
 (('naomi', 'campbell'), 14.188434307113564),
 (('pauly', 'celebs'), 14.188434307113564),
 (('aron', 'pilhofer'), 13.96604188577712),
 (('alternate', 'routes'), 13.966041885777116),
 (('charlie', 'sheen'), 13.966041885777116),
 (('follower', 'swarm'), 13.966041885777116),
 (('lynn', 'teo'), 13.966041885777116),
 (('sheen', 'goddesses'), 13.966041885777116),
 (('swarm', 'ensues'), 13.966041885777116),
 (('wi', 'fi'), 13.966041885777116),
 (('policy', 'imacs'), 13.925399901279771),
 (('cameron', 'sinclair'), 13.773396807834722),
 (('etch', 'sketch'), 13.773396807834722),
 (('staircase', 'attendance'), 13.773396807834718),
 (('likeability', 'virgin'), 13.703007479943324),
 (('barton', 'hollow'), 13.60347180639241),
 (('khoi', 'vinh'), 13.60347180639241),
 (('launchrock', 'comp'), 13.60347180639241),
 (('participating', 'launchrock'), 13.60347180639241),
 (('bloody', 'banality'), 13.603471806392406),


# Text Classification - Lab section

In [25]:
# recreate documents after cleaning/processing step
df['cleaned_doc'] = df.raw_tokens_stopped.map(lambda x: ' '.join(x))

In [33]:
# Implement train/test split of dataset
X_train, X_test, y_train, y_test = train_test_split(
    df.cleaned_doc, df.emotion_directed_at_brand, test_size=0.20, random_state=42)

# Instantiate vectorizer object
vectorizer = TfidfVectorizer()

tf_idf_X_train = vectorizer.fit_transform(X_train)
tf_idf_X_test = vectorizer.transform(X_test)

In [34]:
tf_idf_X_train.shape

(7274, 8291)

In [35]:
non_zero_cols = tf_idf_X_train.nnz / float(tf_idf_X_train.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_X_train.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 11.47951608468518
Percentage of columns containing 0: 0.9986154244259214


In [39]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

In [42]:
nb_classifier.fit(tf_idf_X_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_X_train)
nb_test_preds = nb_classifier.predict(tf_idf_X_test)

In [47]:
rf_classifier.fit(tf_idf_X_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_X_train)
rf_test_preds = rf_classifier.predict(tf_idf_X_test)

In [49]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.7435 		 Testing Accuracy: 0.646

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9946 		 Testing Accuracy: 0.6712
