## Sentiment Analysis of Tweets

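### Introduction

This notebook classifies the sentiment expressed in tweets about brands and products (CrowdFlower "Brands and Product Emotions" dataset). The tweets are cleaned, tokenized, and lemmatized; scored with TextBlob and VADER; vectorized with TF-IDF; and finally used to train a random forest classifier on the labelled emotion.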

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist # looks at how frequent words are used
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
# from matplotlib import cm
from sklearn.ensemble import RandomForestClassifier #
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import string, re
string.punctuation
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### Import and Clean Dataset

In [2]:
# Load the labelled tweets into a DataFrame and preview the first rows.
# src: https://data.world/crowdflower/brands-and-product-emotions
# The file is decoded as ISO-8859-1 (presumably it is not valid UTF-8 -- confirm).
df = pd.read_csv(filepath_or_buffer='data/tweets.csv', encoding="ISO-8859-1")
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
# Shorten the verbose source column names to the ones used in the rest of
# the notebook (`tweet`, `product_`, `emotion`).
short_column_names = {
    'tweet_text': 'tweet',
    'emotion_in_tweet_is_directed_at': 'product_',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion',
}
df.rename(columns=short_column_names, inplace=True)

In [4]:
# Tweets with no labelled product are kept rather than dropped: their
# missing `product_` value is replaced by the placeholder category 'Unknown'.
df['product_'] = df['product_'].fillna('Unknown')

In [5]:
# Drop every row that still has a missing value in any column.
# `product_` was filled above, so per the original note this removes the
# single tweet whose text is null.
df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Class balance of the target variable.
# Open question from the original author: what the "I can't tell" label
# really represents.
df['emotion'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [7]:
# Assign target variable numbers.
# Encoding: 0 = negative, 1 = positive, 2 = "I can't tell", 3 = no emotion.
emotion_codes = {
    "I can't tell": 2,
    'No emotion toward brand or product': 3,
    'Positive emotion': 1,
    'Negative emotion': 0,
}
# Rewrite the column one label at a time; rows that do not match the label
# being processed keep whatever value they already hold.
for emotion_label, emotion_code in emotion_codes.items():
    df['emotion'] = np.where(df['emotion'] == emotion_label, emotion_code, df['emotion'])

In [8]:
# Update data type: store the numeric emotion codes as 64-bit integers.
emotion_as_int = df['emotion'].astype('int64')
df['emotion'] = emotion_as_int

In [9]:
# # tokenize words in tweets using regex
# tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

# items = []
# for item in df['tweet']:
#     item = tokenizer.tokenize(item)
#     items.append(item)
    
# df['tweet'] = items

In [10]:
# Create stop wordset
# stop_words=set(stopwords.words("english"))

In [11]:
## Stop words = NLTK English stop words + punctuation characters + a few
## hand-picked noise tokens seen in the tweets.
extra_noise_tokens = ['...','u','w','2',"i'm",'via',"we're",'6','3','hey']
stop_words = stopwords.words("english") + list(string.punctuation) + extra_noise_tokens

# Set version of the same tokens, used for membership tests in process_tweet
sw_set = set(stop_words)

In [12]:
def process_tweet(tweet):
    """Tokenize a tweet, lowercase the tokens and drop stop words.

    A token is either the literal ``iPad`` followed by one whitespace
    character and ``2``, or a run of letters/digits/hyphens, an optional
    apostrophe, and at least one further word character. Everything else
    (including single-character words) is not matched by the pattern.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lowercased tokens that are not in the notebook-level ``sw_set``,
        with any space characters inside a token removed (so the
        "iPad 2" token becomes "ipad2").
    """
    token_pattern = r"(iPad\s2|[a-zA-Z0-9-]+'?\w+)"

    kept_tokens = []
    for raw_token in RegexpTokenizer(token_pattern).tokenize(tweet):
        lowered = raw_token.lower()
        # The stop-word test is applied before spaces are stripped
        if lowered in sw_set:
            continue
        kept_tokens.append(lowered.replace(" ",""))
    return kept_tokens

In [13]:
# Quick sanity check of process_tweet on a plain sentence; the result is
# displayed as the cell output.
tweet = 'i went to the store'
process_tweet(tweet)

['went', 'store']

In [14]:
# Display the frame after renaming and label encoding; `tweet` is still raw text here
df

Unnamed: 0,tweet,product_,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,1
9089,"Wave, buzz... RT @mention We interrupt your re...",Unknown,3
9090,"Google's Zeiger, a physician never reported po...",Unknown,3
9091,Some Verizon iPhone customers complained their...,Unknown,3


In [15]:
# Replace each raw tweet string with its cleaned token list.
# NOTE: this overwrites the `tweet` column, so the raw text is gone afterwards
# and re-running the cell would pass token lists (not strings) to process_tweet.
items = [process_tweet(raw_text) for raw_text in df['tweet']]
df['tweet'] = items

In [16]:
# Display the frame again: `tweet` now holds token lists instead of raw text
df

Unnamed: 0,tweet,product_,emotion
0,"[wesley83, 3g, iphone, hrs, tweeting, rise_aus...",iPhone,0
1,"[jessedee, know, fludapp, awesome, ipad, iphon...",iPad or iPhone App,1
2,"[swonderlin, wait, ipad2, also, sale, sxsw]",iPad,1
3,"[sxsw, hope, year's, festival, crashy, year's,...",iPad or iPhone App,0
4,"[sxtxstate, great, stuff, fri, sxsw, marissa, ...",Google,1
...,...,...,...
9088,"[ipad, everywhere, sxsw, link]",iPad,1
9089,"[wave, buzz, rt, mention, interrupt, regularly...",Unknown,3
9090,"[google's, zeiger, physician, never, reported,...",Unknown,3
9091,"[verizon, iphone, customers, complained, time,...",Unknown,3


### Stemming

In [17]:
# ps = PorterStemmer()

# stemmed_tweets=[]
# for row in df['tweet']:
#     new_row = []
#     for w in row:
#         new_row.append(ps.stem(w))
#     stemmed_tweets.append(new_row)
        
# df['stemmed_tweets'] =  stemmed_tweets

In [18]:
# df

### Lemmatization

In [19]:
lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every tweet, keeping one list per tweet.
# No part-of-speech tag is passed, so the lemmatizer's default is used.
lemmatizer_tweets = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in df['tweet']
]

df['lemmatizer_tweets'] = lemmatizer_tweets

In [20]:
# Preview the lemmatized token lists
df['lemmatizer_tweets']

0       [wesley83, 3g, iphone, hr, tweeting, rise_aust...
1       [jessedee, know, fludapp, awesome, ipad, iphon...
2             [swonderlin, wait, ipad2, also, sale, sxsw]
3       [sxsw, hope, year's, festival, crashy, year's,...
4       [sxtxstate, great, stuff, fri, sxsw, marissa, ...
                              ...                        
9088                       [ipad, everywhere, sxsw, link]
9089    [wave, buzz, rt, mention, interrupt, regularly...
9090    [google's, zeiger, physician, never, reported,...
9091    [verizon, iphone, customer, complained, time, ...
9092    [rt, mention, google, test, check-in, offer, s...
Name: lemmatizer_tweets, Length: 9092, dtype: object

In [21]:
# Concat words in tweet series: each token list becomes one string.
# Every token is followed by a single space, so a non-empty result ends with
# a trailing space and an empty token list yields ''.
# NOTE: the column is overwritten in place (lists -> strings); running this
# cell a second time would iterate over characters instead of tokens.
new_lem_tweets = [
    ''.join(token + ' ' for token in tokens)
    for tokens in df['lemmatizer_tweets']
]

df['lemmatizer_tweets'] = new_lem_tweets

In [22]:
## Exploratory TF-IDF fitted on the whole lemmatized corpus.
## Original note: using this as the data target had better performance
## NOTE(review): `tf` / `text_tf` are not referenced by the modeling cells
## visible below, which fit their own vectorizer on the training split only.

tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['lemmatizer_tweets'])
# Show the stored (non-zero) TF-IDF weights of the sparse result
text_tf.data

array([0.04098343, 0.30909751, 0.3482146 , ..., 0.17451905, 0.1808793 ,
       0.07850679])

### TextBlob and VADER

In [23]:
# Score every lemmatized tweet with TextBlob and keep both sentiment fields
polarity, subjectivity = [], []
for text in df['lemmatizer_tweets']:
    blob_sentiment = TextBlob(text).sentiment
    polarity.append(blob_sentiment.polarity)
    subjectivity.append(blob_sentiment.subjectivity)

df['textblob_polarity'] = polarity
df['textblob_subjectivity'] = subjectivity


In [24]:
analyzer = SentimentIntensityAnalyzer()

# One VADER score dict (keys 'neg', 'neu', 'pos', 'compound') per tweet.
# NOTE(review): the input is the lowercased, stop-word-stripped text --
# confirm that scoring the cleaned text rather than the raw tweet is intended.
vader_results = [analyzer.polarity_scores(text) for text in df['lemmatizer_tweets']]

# Spread the four scores into their own columns
df['vs_neg'] = [result['neg'] for result in vader_results]
df['vs_neu'] = [result['neu'] for result in vader_results]
df['vs_pos'] = [result['pos'] for result in vader_results]
df['vs_compound'] = [result['compound'] for result in vader_results]

In [None]:
# Adding sentiments from varying libraries
import os

import py_files.sentiment_lib as sent

# One path for both the write and the read. Previously the result was saved
# to 'dataframe.csv' but loaded from 'data/dataframe.csv', so the freshly
# computed frame was never the one that got read back.
SENTIMENT_CACHE = 'data/dataframe.csv'

# Scoring takes about 25 minutes, so it only runs when no cached result
# exists. Delete the cache file to force a recomputation after changing any
# of the preprocessing above.
if not os.path.exists(SENTIMENT_CACHE):
    df = sent.get_sentiment(df)
    df.to_csv(SENTIMENT_CACHE)

# Always continue from the on-disk copy so fresh and cached runs see the same
# data. Side effects of the CSV round trip: the saved index comes back as an
# extra unnamed column, list-valued columns such as `tweet` come back as
# strings, and empty strings come back as NaN.
df = pd.read_csv(SENTIMENT_CACHE)

In [None]:
# Peek at the first two rows of the reloaded frame
df.head(2)

### Modeling

In [None]:
# Model input is the lemmatized tweet text; the label is the encoded emotion
data, target = df['lemmatizer_tweets'], df['emotion']

In [None]:
# creating a list with all lemmatized outputs (one plain string per tweet)
lemmatized_output = []

for listy in data:
    if isinstance(listy, str):
        # Already a single space-separated document: keep it unchanged
        # (this is what the previous ''.join over its characters produced)
        lemmed = listy
    elif isinstance(listy, (list, tuple)):
        # Token sequence: join with spaces so the words are not fused together
        lemmed = ' '.join(listy)
    elif pd.isna(listy):
        # Missing value, e.g. an empty text that was read back from CSV as NaN;
        # use an empty document instead of raising on a non-iterable float
        lemmed = ''
    else:
        lemmed = str(listy)
    lemmatized_output.append(lemmed)

In [None]:
# Feature documents and labels under the names used by the modeling cells
X_lem, y_lem = lemmatized_output, target

In [None]:
# Hold out 20% of the lemmatized tweets for testing (fixed seed for repeatability)
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(
    X_lem,
    y_lem,
    test_size=0.20,
    random_state=1,
)

# Unigram + bigram TF-IDF features, reusing the notebook's stop-word list
tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1,2))

# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that fitted vectorizer.
tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)
tfidf_data_test_lem = tfidf.transform(X_test_lem)

In [None]:
n_train_docs, n_features = tfidf_data_train_lem.shape

# Average number of non-zero elements per vectorized tweet
non_zero_cols = tfidf_data_train_lem.nnz / float(n_train_docs)
print(non_zero_cols)

# Average share of columns that are zero in a vectorized tweet
# (a fraction between 0 and 1, not a percentage)
percent_sparse = 1 - (non_zero_cols / float(n_features))
print(percent_sparse)

In [None]:
# Random forest with 100 trees; the seed is fixed so repeated fits match,
# and n_jobs=-1 requests parallel training.
rf_lem = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
)


In [None]:
# Train the forest on the TF-IDF features of the training split
rf_lem.fit(tfidf_data_train_lem, y_train_lem)

In [None]:
# Predict emotion codes for the held-out test split
rf_test_preds_lem = rf_lem.predict(tfidf_data_test_lem)

In [None]:
# Evaluate on the test split. Precision, recall and F1 are averaged across
# the classes with average='weighted'.
weighted_avg = dict(average='weighted')

rf_acc_score_lem = metrics.accuracy_score(y_test_lem, rf_test_preds_lem)
rf_f1_score_lem = metrics.f1_score(y_test_lem, rf_test_preds_lem, **weighted_avg)
rf_precision_score_lem = metrics.precision_score(y_test_lem, rf_test_preds_lem, **weighted_avg)
rf_recall_score_lem = metrics.recall_score(y_test_lem, rf_test_preds_lem, **weighted_avg)

score_report = (
    ('Accuracy:', rf_acc_score_lem),
    ('Precision:', rf_precision_score_lem),
    ('Recall:', rf_recall_score_lem),
    ('F1:', rf_f1_score_lem),
)
for score_label, score_value in score_report:
    print(score_label, score_value)