Business case:
- Consulting Apple and Google on most loved things (product, service, app, etc) for them to invest in promotion of based on SXSW tweets

**AVOID FALSE POSITIVE - MAXIMIZE PRECISION**
- False Positive: a negative or neutral tweet is classified as positive and company invests in promoting something that customers view negatively
    - False Positive is worse of the two because it leads to negative customer experience, lower NPS, and brand suffers in longer term. Basically spending money for customer to have negative experience.
- False Negative: a positive tweet is classified as negative, and company misses opportunity to invest in promoting something that customers view positively

In [895]:
import pandas as pd

import re #regex

import nltk
from nltk.stem import WordNetLemmatizer

# Tweet preprocessor - Source: https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e
import preprocessor as p

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import string

In [896]:
# Import file
raw_data = pd.read_csv('../data/judge-1377884607_tweet_product_company.csv', encoding= 'unicode_escape')
df = raw_data.copy()

# Preview file 
df.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [897]:
# Overview file
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
tweet_text                                            9092 non-null object
emotion_in_tweet_is_directed_at                       3291 non-null object
is_there_an_emotion_directed_at_a_brand_or_product    9093 non-null object
dtypes: object(3)
memory usage: 213.2+ KB


In [898]:
# Fill nulls
df['emotion_in_tweet_is_directed_at'].fillna('None', inplace=True)

In [899]:
# drop row 6, tweet_text null row; 9092 row is foreign characters
df.drop(labels=[6, 9092], axis=0, inplace=True)

# reset index post drop
df = df.reset_index(drop=True)

In [900]:
# Value counts exploration
df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5387
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [901]:
# Filter down emotions to Neutral
df['is_there_an_emotion_directed_at_a_brand_or_product'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map({"No emotion toward brand or product" : "Neutral",
                                                                                                                         "Positive emotion": "Positive",
                                                                                                                         "Negative emotion": "Negative",
                                                                                                                         "I can't tell": "Neutral"})

In [902]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Neutral     5543
Positive    2978
Negative     570
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [903]:
# Create target; Positive only - 32%
df['target'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map({"Positive": 1,
                                                                             "Neutral": 0,
                                                                             "Negative": 0})

In [904]:
# Check work
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9091 entries, 0 to 9090
Data columns (total 4 columns):
tweet_text                                            9091 non-null object
emotion_in_tweet_is_directed_at                       9091 non-null object
is_there_an_emotion_directed_at_a_brand_or_product    9091 non-null object
target                                                9091 non-null int64
dtypes: int64(1), object(3)
memory usage: 284.2+ KB


### Processing Tweets

In [905]:
# Createw new column for cleaned tweet text
df['clean_tweet'] = df['tweet_text'].copy()

#### Replace hashtags, links, rts, and mentions

In [906]:
# Using placeholder(ph) because if use {hashtag}, 
# tokenizing will be wrong, separates first { then rest of string

# Replace hashtags with placeholder(HASHPH)
df['clean_tweet'] = df['clean_tweet'].replace({'#':'hashph'}, regex=True)

# Replace links with placeholder(LINKPH)
df['clean_tweet'] = df['clean_tweet'].replace({'http':'linkph',
                                              '{link}':'linkph'}, regex=True)

# Replace RT with placeholder(RTPH)
df['clean_tweet'] = df['clean_tweet'].replace({'RT':'rtph'}, regex=True)
# Did not account for rt (stand alone string)

# Replace mentions with placeholder(MENPH)
df['clean_tweet'] = df['clean_tweet'].replace({'@mention':'menph'}, regex=True)
# Account for mentions that do not have '@mention' in the original text and have usernames
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r'@\w+', 'menph', (x)))

In [907]:
len(df[df['clean_tweet'].str.contains('hashph')])
# Almost all contain hashtags

9085

In [908]:
len(df[df['clean_tweet'].str.contains('linkph')])
# 51% contain links

4193

In [909]:
len(df[df['clean_tweet'].str.contains('rtph')])
# 29% contain RTs (systematic, lowercase rts not accounted for)

2686

In [910]:
len(df[df['clean_tweet'].str.contains('linkph')])
# 51% contain links

4193

In [911]:
len(df[df['clean_tweet'].str.contains('menph')])
# 54% contain mentions

4918

#### Cleaning html, removing punctuation, lowercasing

In [912]:
# Clean 'clean_tweet' column of HTML; there were things like &quot
html_ent_clean = re.compile('&.*?;')
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(html_ent_clean, '',x))

# Remove punctuation
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r'[^\w\s]', '', (x)))

# Source: https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

In [913]:
# Preview clean text column, sanity check
df['clean_tweet']

0       menph I have a 3G iPhone After 3 hrs tweeting ...
1       menph Know about menph  Awesome iPadiPhone app...
2       menph Can not wait for hashphiPad 2 also They ...
3       menph I hope this years festival isnt as crash...
4       menph great stuff on Fri hashphSXSW Marissa Ma...
                              ...                        
9086    menph Yup but I dont have a third app yet Im o...
9087                    Ipad everywhere hashphSXSW linkph
9088    Wave buzz rtph menph We interrupt your regular...
9089    Googles Zeiger a physician never reported pote...
9090    Some Verizon iPhone customers complained their...
Name: clean_tweet, Length: 9091, dtype: object

In [914]:
# Source: https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/
# Create our list of stopwords
nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [915]:
# Remove spacy stopwords
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: ' '.join(
    [word for word in x.split() if word.lower() not in (stop_words)]))

In [916]:
# Lowercase text
df['clean_tweet'] = df['clean_tweet'].str.lower()

In [917]:
# Create new column for tokenized tweets
df['token_tweet'] = ""

In [918]:
# Create function to tokenize with spacy

def tokenize_tweet(tweet):
    my_tweet = nlp(tweet) 
    token_list = []
    for token in my_tweet:
        token_list.append(token.text)
    return token_list

In [None]:
# Create token tweet values
df['token_tweet'] = df['clean_tweet'].apply(tokenize_tweet)

In [None]:
# Preview new column
df['token_tweet']

In [None]:
# Review df overview
df.info()

In [None]:
# Lemmatization
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

df['token_tweet'] = df['token_tweet'].apply(lemmatize_text)

# Rejoin in new column
df['clean_token_tweet'] = df['token_tweet'].map(lambda x: ' '.join(x))

# Source: https://stackoverflow.com/questions/59567357/lemmatize-tokenised-column-in-pandas

In [None]:
# Preview new column 
df['clean_token_tweet']

In [None]:
# Review df
df.info()

#### Playing with SpaCy

In [36]:
import spacy

In [99]:
df['tweet_text'][100]

'\x89ÛÏ@mention &quot;Apple has opened a pop-up store in Austin so the nerds in town for #SXSW can get their new iPads. {link} #wow'

In [185]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = df['tweet_text'][0]

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)
my_doc

.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.

In [186]:
# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['.@wesley83', 'I', 'have', 'a', '3', 'G', 'iPhone', '.', 'After', '3', 'hrs', 'tweeting', 'at', '#', 'RISE_Austin', ',', 'it', 'was', 'dead', '!', ' ', 'I', 'need', 'to', 'upgrade', '.', 'Plugin', 'stations', 'at', '#', 'SXSW', '.']


In [187]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [.@wesley83, 3, G, iPhone, ., 3, hrs, tweeting, #, RISE_Austin, ,, dead, !,  , need, upgrade, ., Plugin, stations, #, SXSW, .]


In [47]:
tweet = nlp(df['tweet_text'][100])
tweet

New buzz? &quot;@mention Google to Launch Major New Social Network Called Circles, Possibly Today {link} rt @mention #sxsw&quot;

In [48]:
from spacy import displacy

entities=[(i, i.label_, i.label) for i in tweet.ents]
entities

[]

#### Parsing out unique words in positive versus neutral/negative tweets

In [874]:
positive_tweet_text = df[df['target'] == 1]['tweet_text']
neut_neg_tweet_text = df[df['target'] == 0]['tweet_text']

In [875]:
pos_token_list = []

for tweet in positive_tweet_text:
    tweet = nlp(tweet)
    for token in tweet:
        pos_token_list.append(token.text)
        
pos_token_list_set = set(pos_token_list)

len(pos_token_list_set)

6769

In [879]:
neut_neg_token_list = []

for tweet in neut_neg_tweet_text:
    tweet = nlp(tweet)
    for token in tweet:
        neut_neg_token_list.append(token.text)
        
neut_neg_token_list_set = set(neut_neg_token_list)

len(neut_neg_token_list_set)
# 60% more unique than positive

10893

In [None]:
neut_neg_token_list_set

In [None]:
pos_token_list_unique = pos_token_list_set.difference(neut_neg_token_list_set)
# new set with elements in pos_token_list_set but not in neut_neg_token_list_set
len(pos_token_list_unique)

In [None]:
neut_neg_token_list_unique = neut_neg_token_list_set.difference(pos_token_list_set)
# new set with elements in neut_neg_token_list_set but not in pos_token_list_set
len(neut_neg_token_list_unique)

#### Try Parsing with Tokenized

In [877]:
positive_tweet_text2 = df[df['target'] == 1]['token_tweet']
neut_neg_tweet_text2 = df[df['target'] == 0]['token_tweet']

In [873]:
pos_token_list2 = []

for tweet in positive_tweet_text:
    for token in tweet:
        pos_token_list2.append(token)
        
len(set(pos_token_list2))

5112

In [883]:
pos_token_list_set2 = set(pos_token_list2)

In [878]:
neut_neg_token_list2 = []

for tweet in neut_neg_tweet_text2:
    for token in tweet:
        neut_neg_token_list2.append(token)
        
len(set(neut_neg_token_list2))

8173

In [884]:
neut_neg_token_list_set2 = set(neut_neg_token_list2)

In [887]:
pos_token_list_unique2 = pos_token_list_set2.difference(neut_neg_token_list_set2)
# new set with elements in pos_token_list_set but not in neut_neg_token_list_set
len(pos_token_list_unique2)

1922

In [888]:
pos_token_list_unique2

{'sitelinkphing',
 'hashphfandango',
 'sessionsnxt',
 'itslinkphing',
 'relaxinglinkphputer',
 'hashphdesignflaws',
 'inde',
 'everbody',
 'ripped',
 'passage',
 'applinkphe',
 'sprinkle',
 'offerlinkphpared',
 'salesperson',
 'singing',
 'á¾_î¾ð_____ôèï_ãöýçü¼¼',
 'iconbuffet',
 'goona',
 'measuring',
 'everyday',
 'incredibly',
 'drafthouse',
 'rigeur',
 'articulate',
 'presos',
 'amused',
 'nick',
 'smoke',
 'starbu',
 'hashphempowered',
 'enchanting',
 'twitpic',
 'itme',
 'hashphcircusmash',
 'wot',
 'filming',
 'scannercreators',
 'todo',
 'tenet',
 'wowûïmenph',
 '11th',
 'hashphagchathashphsxsw',
 'julie',
 'wpeople',
 'smooth',
 'hashphfxsw',
 'locationmap',
 'palette',
 'brilliance',
 'sxsurrogateslinkph',
 'appslinkphing',
 'hashphpopupstoreû',
 'v5',
 'installs',
 'methinks',
 'innovating',
 'penguin',
 'lineand',
 'hashphpseudoretweet',
 'attracting',
 'phew',
 'hashphgooglebread',
 'kenny',
 'mercy',
 'solves',
 'seereally',
 'spasmatics',
 'sunglass',
 'hashph4sqs',
 'ha

In [889]:
neut_neg_token_list_unique2 = neut_neg_token_list_set2.difference(pos_token_list_set2)
# new set with elements in neut_neg_token_list_set but not in pos_token_list_set
len(neut_neg_token_list_unique2)

4983

In [890]:
neut_neg_token_list_unique2

{'voluntary',
 'hashphtherocksreport',
 'snowflake',
 'ridley',
 'foråêhashphiphone',
 '459pm',
 'boothe',
 'checklist',
 'islinkphpeting',
 'hashphtabletwars',
 'hashphmusicmonday',
 'se',
 'balance',
 'sprinkler',
 'hashphwiiings',
 'ridonkulous',
 'math',
 'aggregated',
 'reference',
 'hashphproductive',
 'cartridge',
 'citymarket',
 'employed',
 'hulu',
 'imparted',
 'dudesjust',
 'jackie',
 'nonmacbook',
 'cellbots',
 'drew',
 'hashphsmccolumbus',
 'swimsuit',
 'insidious',
 'promos',
 'hashphehteam',
 'fanbois',
 'tribunelinkphe',
 'glitch',
 'lined',
 'hashphbeevil',
 'meanness',
 'pointing',
 'splendor',
 'advertisingcheckins',
 'contû',
 'paging',
 'plutopia',
 'den',
 'slope',
 'cellphone',
 'sex',
 'typ63mmam7w3',
 'twtng',
 'marcins',
 'discussing',
 'hashphbegger',
 'ingredient',
 'hashphcbs',
 'hashphgrauniad',
 'hashphlocal',
 'probar',
 '031311',
 'roasted',
 'arthaus',
 'mobilefirst',
 '3bil',
 'pake',
 'skype',
 'sheer',
 'ding',
 'hashphsaytextson',
 'hashphbadform',

In [None]:
### Testing other stuff

In [194]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

In [197]:
positive_tweet_text

1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
7       #SXSW is just starting, #CTIA is around the co...
8       Beautifully smart and simple idea RT @madebyma...
                              ...                        
9072    @mention your iPhone 4 cases are Rad and Ready...
9077    @mention your PR guy just convinced me to swit...
9079    &quot;papyrus...sort of like the ipad&quot; - ...
9085    I've always used Camera+ for my iPhone b/c it ...
9088                        Ipad everywhere. #SXSW {link}
Name: tweet_text, Length: 2978, dtype: object

In [225]:
import en_core_web_sm
nlp = en_core_web_sm.load()
sxsw = nlp('sxsw')
print(sxsw.vector.shape)
print(sxsw.vector)

(96,)
[ 1.5080006   0.20474844  0.5546193  -0.90579414 -0.54141414 -0.9824686
  0.5253784  -1.4117908  -0.8251685  -0.15485895  0.55209285  0.7007918
  0.48446506  0.34851265  0.19821697  0.6071049   0.92223513 -0.8365748
 -0.07147411  0.5213211  -0.45954704  0.73926437 -0.8330599   0.32836628
 -0.00541195  0.10246983 -0.28939652 -0.32321608 -0.07963422 -0.0561545
 -1.2349129  -0.5410965  -0.23048031 -0.9607055   0.59256613  0.54079735
 -1.8728933  -0.00399333 -0.06148595  0.6371069  -0.05743659 -0.6361483
 -0.80609286  1.0348423   0.2242944  -0.23417483  0.29439375 -0.35090202
  0.19317758 -0.3185281   0.51226294  1.1309446  -0.41559738  0.15457207
  1.6295617  -1.2565141   0.20126943  0.01892669  1.7639434   0.4675427
 -0.15381673 -0.40200907 -0.2869364  -0.19344324 -0.68605137  0.00293952
 -0.21419257 -0.29355788 -0.3246354  -0.36205465  1.5996232  -0.6480825
  0.4077111  -0.5152909  -0.64171624  0.29932743 -1.3886542  -0.5850692
 -0.10415816  1.2317246   0.77090836 -0.73733115  0.4

In [248]:
from collections import Counter

In [254]:
for tweet in df['tweet_text'][:5]:
    print(tweet)

.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.
@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW
@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.
@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw
@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)


In [253]:
count_all = Counter()
for tweet in df['tweet_text']:
    # Create a list with all the terms
    terms_all = [term for term in df['tweet_text']]
    # Update the counter
    count_all.update(terms_all)
# Print the first 5 most frequent words
print(count_all.most_common(5))

KeyboardInterrupt: 

In [205]:
#hashtags
# Count terms only once, equivalent to Document Frequency
terms_single = set(terms_all)
# Count hashtags only
terms_hash = [term for term in preprocess(tweet['text']) 
              if term.startswith('#')]
# Count terms only (no hashtags, no mentions)
terms_only = [term for term in preprocess(tweet['text']) 
              if term not in stop and
              not term.startswith(('#', '@'))] 
              # mind the ((double brackets))
              # startswith() takes a tuple (not a list) if 
              # we pass a list of inputs

[]
[]
[]
[]
[]
