In [18]:
import re
import string
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import seaborn as sns

In [20]:
pd.set_option('display.max_columns', None)

In [21]:
tweets_df = pd.read_csv(r'./tweets.csv')
tweets_df = tweets_df['Tweet'].astype('str')
tweets_df.head(10)

0    I have to say, Apple has by far the best custo...
1    iOS 7 is so fricking smooth & beautiful!! #Tha...
2                                        LOVE U @APPLE
3    Thank you @apple, loving my new iPhone 5S!!!!!...
4    .@apple has the best customer service. In and ...
5    @apple ear pods are AMAZING! Best sound from i...
6    Omg the iPhone 5S is so cool it can read your ...
7              the iPhone 5c is so beautiful <3 @Apple
8    #AttributeOwnership is exactly why @apple will...
9    Just checked out the specs on the new iOS 7......
Name: Tweet, dtype: object

In [22]:
new_stop_words = [
    'some', 'like', 'think', 'wow', 'one', 'http', 'web', 'really',
    'see', 'watch', 'apple', 'know', 'show', 'think', 'click', 'go', 'to', 'great',
    'very', 'good', 'many', 'more', 'people', 'made', 'technology', 'tech',
    'iphone', 'ipad', 'new', 'latest', 'phone', 'itunes', 'brand', 'ipod', 'iphones',
    'io', 'get', 'buy', 'purchase', 'make', 'im', "iam", 'dont', 'cant', 'promoipodplayerpromo',
    'ipodplayerpromo', 'player', 'itune']

def pre_process_text(df, custom_stopwords):
    translate_table = dict((ord(char), None) for char in string.punctuation)
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union(custom_stopwords)

    corpus = []
    for i, line in df.iteritems():
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(translate_table)
        line = line.split()

        # Lemmatizers reduces each word to its root/canonical form
        lm = WordNetLemmatizer()
        line = [lm.lemmatize(word) for word in line if not word in stop_words]
        line = " ".join(line)
        corpus.append(line)
    return corpus

In [23]:
def get_sentiment_score(sentence_series):
    df_dict = {}
    for i, response in sentence_series.iteritems():
        sent_analyzer = SentimentIntensityAnalyzer()
        scores = sent_analyzer.polarity_scores(response)
        df_dict[i] = scores
    df = pd.DataFrame.from_dict(df_dict)
    return df.T

In [24]:
tweets_series = pd.Series(pre_process_text(tweets_df, new_stop_words))
tweets_series

0       say far best customer care service ever receiv...
1                 io fricking smooth beautiful thanxapple
2                                                  love u
3                     thank loving pictwittercomxmhjcupcb
4                               best customer service min
                              ...                        
1176                                                freak
1177            freaking picture tl annoyed freak twitter
1178                                   freaking cow freak
1179                             hate working going freak
1180                   agounalakis thats nasty nasty brat
Length: 1181, dtype: object

In [28]:
df = get_sentiment_score(tweets_series)

In [29]:
df['Tweet'] = tweets_series.iloc[:]
df.head(10)

Unnamed: 0,neg,neu,pos,compound,Tweet
0,0.0,0.486,0.514,0.8126,say far best customer care service ever receiv...
1,0.0,0.489,0.511,0.6344,io fricking smooth beautiful thanxapple
2,0.0,0.0,1.0,0.6369,love u
3,0.0,0.135,0.865,0.7506,thank loving pictwittercomxmhjcupcb
4,0.0,0.417,0.583,0.6369,best customer service min
5,0.0,0.467,0.533,0.8402,ear pod amazing best sound inear headphone ive...
6,0.0,0.777,0.223,0.3182,omg cool read finger print unlock purchase wit...
7,0.0,0.0,1.0,0.5994,c beautiful
8,0.0,0.729,0.271,0.3818,attributeownership exactly always marketing ma...
9,0.0,1.0,0.0,0.0,checked spec io say wait update bravo


In [32]:
# Looking at the most positive tweets
df.sort_values(by='compound', ascending=False)['Tweet'].head(10)

63      ibrooklynb loved iphonec better cuz name aweso...
1033    nokia release amazing smartphones within year ...
232     darnit lol donza mishiza natz samsungsa perfec...
95      currently waiting review app hopefully asap th...
13      interesting seem almost willing demise whats g...
5       ear pod amazing best sound inear headphone ive...
644       google laughing laughing left love chrome stink
114     dear excellent ad submitted update please appr...
92      excited introducing touchid biometric security...
46      totally missed giving away iphoto imovie iwork...
Name: Tweet, dtype: object

In [None]:
df.insert(0, 'ID', range(0, len(df)))

In [37]:
# Group customers as either Promoters, Detractors, and Neutrals
df_group = df.groupby(by='ID').mean()
df_group.reset_index(inplace=True)
df_group.head(10)

Unnamed: 0,ID,neg,neu,pos,compound
0,0,0.0,0.486,0.514,0.8126
1,1,0.0,0.489,0.511,0.6344
2,2,0.0,0.0,1.0,0.6369
3,3,0.0,0.135,0.865,0.7506
4,4,0.0,0.417,0.583,0.6369
5,5,0.0,0.467,0.533,0.8402
6,6,0.0,0.777,0.223,0.3182
7,7,0.0,0.0,1.0,0.5994
8,8,0.0,0.729,0.271,0.3818
9,9,0.0,1.0,0.0,0.0


In [42]:
df_group.sort_values(by='compound', inplace=True)
print(df_group.head())
print('------------------------------------------')
print(df_group.tail(10))

        ID    neg    neu    pos  compound
1135  1135  0.840  0.160  0.000   -0.9413
1173  1173  0.714  0.153  0.133   -0.9246
1124  1124  0.714  0.286  0.000   -0.9100
1151  1151  0.914  0.086  0.000   -0.8910
1038  1038  0.529  0.471  0.000   -0.8779
------------------------------------------
        ID    neg    neu    pos  compound
46      46  0.121  0.389  0.490    0.8218
114    114  0.000  0.449  0.551    0.8225
92      92  0.000  0.455  0.545    0.8225
644    644  0.166  0.184  0.650    0.8360
5        5  0.000  0.467  0.533    0.8402
13      13  0.000  0.462  0.538    0.8519
95      95  0.000  0.359  0.641    0.8658
232    232  0.000  0.452  0.548    0.8658
1033  1033  0.000  0.466  0.534    0.8834
63      63  0.000  0.223  0.777    0.9313
