## Data Cleaning

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn import metrics
from sklearn.model_selection import train_test_split
import string, re

### Import and Clean Dataset

In [2]:
# Read the raw tweets CSV into `df`.
# src: https://data.world/crowdflower/brands-and-product-emotions
# The file is decoded as ISO-8859-1 (Latin-1) via the explicit `encoding` argument.
df = pd.read_csv('../data/tweets.csv', encoding = "ISO-8859-1")
# Preview the first rows.
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
# Replace the verbose source column names with short ones used in the rest of the notebook.
df = df.rename(columns={
    'tweet_text': 'tweet',
    'emotion_in_tweet_is_directed_at': 'product_',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion',
})

In [4]:
# Tweets with no product recorded are labelled 'Unknown' instead of being left null.
df['product_'] = df['product_'].fillna('Unknown')

In [5]:
# Drop every row that still contains a null in any column
# (per the original note, this removes the one tweet with no text).
df = df.dropna()

In [6]:
# Target variable: number of rows per emotion label (class balance).
df.emotion.value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [7]:
# Encode the target labels as integers.
#   0 = negative, 1 = positive, 2 = "I can't tell", 3 = no emotion toward the brand/product
EMOTION_CODES = {
    'Negative emotion': 0,
    'Positive emotion': 1,
    "I can't tell": 2,
    'No emotion toward brand or product': 3,
}

# A single pass over the column. Values that have no entry in the table
# (including integers produced by an earlier run of this cell) are kept as-is,
# so re-running the cell does not corrupt the column.
df['emotion'] = df['emotion'].map(lambda label: EMOTION_CODES.get(label, label))

In [8]:
# Store the encoded target column as 64-bit integers.
df = df.astype({'emotion': 'int64'})

In [9]:
## Stop words: NLTK's English list, every punctuation character,
## and a handful of extra tokens chosen for this dataset.
stop_words = [
    *stopwords.words("english"),
    *string.punctuation,
    '...', 'u', 'w', '2', "i'm", 'via', "we're", '6', '3', 'hey',
]

# Set form of the same entries, used for membership tests in process_tweet.
sw_set = set(stop_words)

In [10]:
def process_tweet(tweet):
    """Tokenize one tweet and return its lower-cased, stop-word-free tokens.

    A token is either the literal ``iPad`` followed by one whitespace character
    and ``2``, or a run of letters/digits/hyphens, an optional apostrophe and at
    least one more word character. Tokens are therefore at least two characters
    long, and characters outside that class (``@``, ``#``, ``.``, ``/`` ...)
    act as separators.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lower-cased tokens whose lower-cased form is not in the module-level
        ``sw_set``. Any whitespace inside a token is removed, so the
        ``iPad 2`` token becomes ``ipad2``.
    """
    tokenizer = RegexpTokenizer(r"(iPad\s2|[a-zA-Z0-9-]+'?\w+)")

    cleaned = []
    for token in tokenizer.tokenize(tweet):
        lowered = token.lower()
        if lowered in sw_set:
            continue
        # `\s` in the pattern accepts any whitespace (tab, newline, non-breaking
        # space, ...), so strip all of it rather than only the ASCII space;
        # otherwise those variants would not collapse to "ipad2".
        cleaned.append(''.join(lowered.split()))
    return cleaned

In [11]:
# Tokenize every tweet; the 'tweet' column then holds token lists instead of raw text.
items = [process_tweet(raw_tweet) for raw_tweet in df['tweet']]
df['tweet'] = items

### Stemming

In [12]:
# Stemming is disabled; the lemmatization step below is used instead. Code kept for reference.
# ps = PorterStemmer()

# stemmed_tweets=[]
# for row in df['tweet']:
#     new_row = []
#     for w in row:
#         new_row.append(ps.stem(w))
#     stemmed_tweets.append(new_row)
        
# df['stemmed_tweets'] =  stemmed_tweets

### Lemmatization

In [13]:
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token (lemmatize is called without a part-of-speech argument),
# keeping one token list per tweet.
lemmatizer_tweets = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in df['tweet']
]

df['lemmatizer_tweets'] = lemmatizer_tweets

In [14]:
# Preview the lemmatized token lists for the first rows.
df['lemmatizer_tweets'].head()

0    [wesley83, 3g, iphone, hr, tweeting, rise_aust...
1    [jessedee, know, fludapp, awesome, ipad, iphon...
2          [swonderlin, wait, ipad2, also, sale, sxsw]
3    [sxsw, hope, year's, festival, crashy, year's,...
4    [sxtxstate, great, stuff, fri, sxsw, marissa, ...
Name: lemmatizer_tweets, dtype: object

In [15]:
# Join each tweet's tokens into a single space-separated string
# (no trailing separator; an empty token list yields an empty string).
# Rows that are already strings are passed through unchanged, so re-running
# this cell does not split the text into single characters.
new_lem_tweets = [
    words if isinstance(words, str) else ' '.join(words)
    for words in df['lemmatizer_tweets']
]

df['lemmatizer_tweets'] = new_lem_tweets

In [16]:
## Vectorize the lemmatized tweet strings with TF-IDF, using TfidfVectorizer's default settings.
## NOTE(review): the original note read "using this as the data target had better
## performance" -- presumably these TF-IDF features gave better model results; confirm.
tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['lemmatizer_tweets'])
# Display the values stored in the resulting matrix.
text_tf.data

array([0.04098343, 0.30909751, 0.3482146 , ..., 0.17451905, 0.1808793 ,
       0.07850679])

### TextBlob and VADER

In [17]:
# Score every lemmatized tweet string with TextBlob and keep both sentiment components.
sentiments = [TextBlob(text).sentiment for text in df['lemmatizer_tweets']]
polarity = [sentiment.polarity for sentiment in sentiments]
subjectivity = [sentiment.subjectivity for sentiment in sentiments]

df['textblob_polarity'] = polarity
df['textblob_subjectivity'] = subjectivity


In [18]:
analyzer = SentimentIntensityAnalyzer()

# Score each lemmatized tweet string once; every result is a dict from which the
# 'neg', 'neu', 'pos' and 'compound' entries are read.
vader_scores = [analyzer.polarity_scores(text) for text in df['lemmatizer_tweets']]

# One DataFrame column per VADER score, added in the same order as before.
vader_columns = (
    ('vs_neg', 'neg'),
    ('vs_neu', 'neu'),
    ('vs_pos', 'pos'),
    ('vs_compound', 'compound'),
)
for column, key in vader_columns:
    df[column] = [score[key] for score in vader_scores]

In [19]:
# Add sentiment scores from several additional libraries.

# The computation below takes about 25 minutes to run, so it is left commented
# out and a previously saved result is loaded instead.
# import py_files.sentiment_lib as sent
# df = sent.get_sentiment(df)
# df.to_csv('dataframe.csv')
# NOTE(review): the commented-out save writes 'dataframe.csv' to the working
# directory, while the load below reads '../data/dataframe.csv' -- confirm the
# intended location before re-enabling it.

# Replaces `df` (built in the cells above) with the saved frame; the CSV's first
# column is used as the index.
df = pd.read_csv('../data/dataframe.csv', index_col=0)

In [20]:
# Preview the first two rows, including the added sentiment columns.
df.head(2)

Unnamed: 0,tweet,product_,emotion,lemmatizer_tweets,textblob_polarity,textblob_subjectivity,vs_neg,vs_neu,vs_pos,vs_compound,nrc_sentiment,gi_sentiment,henry_sentiment,huliu_sentiment,jockers_sentiment,lm_sentiment,senticnet_sentiment,sentiword_sentiment,socal_sentiment
0,"['wesley83', 'have', '3G', 'iPhone', '3', 'hrs...",iPhone,0,wesley83 have 3G iPhone 3 hr tweeting RISE Aus...,-0.2,0.4,0.223,0.777,0.0,-0.6486,0.0,-0.333333,0.0,-1.0,-1.0,0.0,-0.0952,-0.221875,-1.192154
1,"['jessedee', 'Know', 'fludapp', 'Awesome', 'iP...",iPad or iPhone App,1,jessedee Know fludapp Awesome iPad iPhone app ...,0.466667,0.933333,0.0,0.528,0.472,0.91,1.0,1.0,0.0,1.0,0.416667,0.0,0.475,0.175,2.17719
