In [142]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [143]:
# import file
df = pd.read_csv('judge-1377884607_tweet_product_company.csv', encoding= 'unicode_escape')

In [144]:
# Preview file 
df.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [145]:
# Overview file
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
tweet_text                                            9092 non-null object
emotion_in_tweet_is_directed_at                       3291 non-null object
is_there_an_emotion_directed_at_a_brand_or_product    9093 non-null object
dtypes: object(3)
memory usage: 213.2+ KB


In [146]:
# Value counts exploration
df['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [147]:
# Fill nulls
df['emotion_in_tweet_is_directed_at'].fillna('None', inplace=True)

In [148]:
# Recheck
df['emotion_in_tweet_is_directed_at'].value_counts()

None                               5802
iPad                                946
Apple                               661
iPad or iPhone App                  470
Google                              430
iPhone                              297
Other Google product or service     293
Android App                          81
Android                              78
Other Apple product or service       35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [149]:
# Value counts exploration
df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [150]:
# Filter down directed at to company in new column to preserve original detail
df['brand_directed_at'] = df['emotion_in_tweet_is_directed_at'].map({'iPad': 'Apple',
                                                                    'Apple':'Apple',
                                                                    'iPad or iPhone App': 'Apple',
                                                                    'Google': 'Google', 
                                                                    'iPhone':'Apple',
                                                                    'Other Google product or service': 'Google',
                                                                    'Android App': 'Google', 
                                                                    'Android': 'Google',
                                                                    'Other Apple product or service':'Apple',
                                                                    'None':'None'})

In [151]:
# Review value counts
df['brand_directed_at'].value_counts()

None      5802
Apple     2409
Google     882
Name: brand_directed_at, dtype: int64

In [152]:
# Filter down emotions to Neutral - may create new column for positive or not (30% positive, could use for classification)
df['emotion_directed_at'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map({"No emotion toward brand or product" : "Neutral",
                                                                                           "Positive emotion": "Positive",
                                                                                           "Negative emotion": "Negative",
                                                                                           "I can't tell": "Neutral"})

In [153]:
# Value count check
df['emotion_directed_at'].value_counts()

Neutral     5545
Positive    2978
Negative     570
Name: emotion_directed_at, dtype: int64

In [154]:
# New Preview
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,brand_directed_at,emotion_directed_at
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Apple,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Apple,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Apple,Positive
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Apple,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google,Positive


In [155]:
# Drop nulls
df['tweet_text'] = df['tweet_text'].dropna()

In [156]:
# Got error trying to pull out hastags, make sure Series values are strings
df['tweet_text']=df['tweet_text'].apply(str)

#### Feat Eng

In [157]:
# source: https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e
# Create new column and insert in hashtags
# df['hashtag'] = df['tweet_text'].apply(lambda x: re.findall(r'\B#\w*[a-zA-Z]+\w*', x)) <- keep hash
df['hashtag'] = df['tweet_text'].apply(lambda x: re.findall(r'#(\w+)', x)) #capture word group after hash sign; without hash

In [158]:
# Same for mentions
# df['mention'] = df['tweet_text'].apply(lambda x: re.findall(r'\B@\w*[a-zA-Z]+\w*', x))  <- keep @
df['mention'] = df['tweet_text'].apply(lambda x: re.findall(r'@(\w+)', x)) #capture word group after @ sign;without @

In [159]:
# How many tweets have contain URLs? Too small for feat eng
len(df[df['tweet_text'].str.contains('http') == True])

28

In [160]:
# How many tweets have contain RT?
len(df[df['tweet_text'].str.contains('RT', case=True) == True])

2677

In [161]:
# Label RTs
df['rt'] = [1 if 'RT' in df['tweet_text'][row] else 0 for row in df['tweet_text'].index]

In [162]:
# Check work
df['rt'].value_counts()

0    6416
1    2677
Name: rt, dtype: int64

In [163]:
df['reply'] = [1 if df['tweet_text'][row].startswith('@') else 0 for row in df['tweet_text'].index]

In [164]:
df['reply'].value_counts()

0    8447
1     646
Name: reply, dtype: int64

In [165]:
# Preview new
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,brand_directed_at,emotion_directed_at,hashtag,mention,rt,reply
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Apple,Negative,"[RISE_Austin, SXSW]",[wesley83],0,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Apple,Positive,[SXSW],"[jessedee, fludapp]",0,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Apple,Positive,"[iPad, SXSW]",[swonderlin],0,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Apple,Negative,[sxsw],[sxsw],0,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google,Positive,[SXSW],[sxtxstate],0,1


#### Processing

- remove URLs, Mentions, HTML, punctuation
- lower case

In [166]:
#Tweet preprocessor test
import preprocessor as p

In [167]:
p.clean(df['tweet_text'][0])

'. I have a G iPhone. After hrs tweeting at , it was dead! I need to upgrade. Plugin stations at .'

In [168]:
# Create new column for cleaned text - remove URLs, Mentions
df['clean_text'] = df['tweet_text'].apply(lambda x: p.clean(x))

# Lower case
df['clean_text'] = df['clean_text'].str.lower()

In [169]:
# Preview new 
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,brand_directed_at,emotion_directed_at,hashtag,mention,rt,reply,clean_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Apple,Negative,"[RISE_Austin, SXSW]",[wesley83],0,0,". i have a g iphone. after hrs tweeting at , i..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Apple,Positive,[SXSW],"[jessedee, fludapp]",0,1,know about ? awesome ipad/iphone app that you'...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Apple,Positive,"[iPad, SXSW]",[swonderlin],0,1,can not wait for also. they should sale them d...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Apple,Negative,[sxsw],[sxsw],0,1,i hope this year's festival isn't as crashy as...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google,Positive,[SXSW],[sxtxstate],0,1,"great stuff on fri : marissa mayer (google), t..."


In [170]:
# Pull tweet with HTML &quot;
df['clean_text'][9090]

"google's zeiger, a physician never reported potential ae. yet fda relies on physicians. &quot;we're operating w/out data.&quot;"

In [171]:
# Test cleaning
html_ent_clean = re.compile('&.*?;')
re.sub(html_ent_clean, '', df['clean_text'][9090])

"google's zeiger, a physician never reported potential ae. yet fda relies on physicians. we're operating w/out data."

In [172]:
# Clean 'clean_text' column of HTML
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(html_ent_clean, '',x))

In [173]:
# Check work
len(df[df['clean_text'].str.contains(html_ent_clean) == True])

0

In [174]:
# Remove punctuation
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'[^\w\s]', '', (x)))

In [175]:
# Check work 
df['clean_text'][9090]

'googles zeiger a physician never reported potential ae yet fda relies on physicians were operating wout data'

#### Remove stopwords

In [176]:
stop_words = set(stopwords.words('english'))

In [177]:
df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join(
    [word for word in x.split() if word.lower() not in (stop_words)]))

In [178]:
df['clean_text']

0       g iphone hrs tweeting dead need upgrade plugin...
1       know awesome ipadiphone app youll likely appre...
2                                          wait also sale
3        hope years festival isnt crashy years iphone app
4       great stuff fri marissa mayer google tim oreil...
                              ...                        
9088                                 ipad everywhere link
9089    wave buzz rt interrupt regularly scheduled gee...
9090    googles zeiger physician never reported potent...
9091    verizon iphone customers complained time fell ...
9092               ___rt google tests checkin offers link
Name: clean_text, Length: 9093, dtype: object

#### Tokenize

In [179]:
# Tokenize
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

df['clean_text'] = df['clean_text'].apply(lambda x: nltk.regexp_tokenize(x, pattern))

In [180]:
# Preview
df['clean_text']

0       [g, iphone, hrs, tweeting, dead, need, upgrade...
1       [know, awesome, ipadiphone, app, youll, likely...
2                                      [wait, also, sale]
3       [hope, years, festival, isnt, crashy, years, i...
4       [great, stuff, fri, marissa, mayer, google, ti...
                              ...                        
9088                             [ipad, everywhere, link]
9089    [wave, buzz, rt, interrupt, regularly, schedul...
9090    [googles, zeiger, physician, never, reported, ...
9091    [verizon, iphone, customers, complained, time,...
9092           [rt, google, tests, checkin, offers, link]
Name: clean_text, Length: 9093, dtype: object

#### Lemmatization

In [181]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

df['clean_text'] = df['clean_text'].apply(lemmatize_text)
# Source: https://stackoverflow.com/questions/59567357/lemmatize-tokenised-column-in-pandas

In [182]:
df['clean_text']

0       [g, iphone, hr, tweeting, dead, need, upgrade,...
1       [know, awesome, ipadiphone, app, youll, likely...
2                                      [wait, also, sale]
3       [hope, year, festival, isnt, crashy, year, iph...
4       [great, stuff, fri, marissa, mayer, google, ti...
                              ...                        
9088                             [ipad, everywhere, link]
9089    [wave, buzz, rt, interrupt, regularly, schedul...
9090    [google, zeiger, physician, never, reported, p...
9091    [verizon, iphone, customer, complained, time, ...
9092             [rt, google, test, checkin, offer, link]
Name: clean_text, Length: 9093, dtype: object

## STUCK 

In [190]:
' '.join(df['clean_text'])

TypeError: sequence item 1517: expected str instance, list found

In [140]:
all_words = ' '.join([text for text in df['clean_text']])
from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

TypeError: sequence item 0: expected str instance, list found

#### Bag of Words
commonly used for document classification applications where the occurrence of each word is used as a feature for training a classifier.
Source: https://towardsdatascience.com/how-to-build-a-twitter-sentiment-analysis-system-12b28dcbae56

In [139]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer(df['clean_text'], min_df=2,max_features=100000)
bow.fit(df['clean_text'])
tweets_processed = bow.transform(df['clean_text']).toarray()

NotFittedError: Vocabulary not fitted or provided

In [92]:
# a function that takes in a tokenized, cleaned tweet and returns a count vectorized 
# representation of it as a Python dictionary

def count_vectorize(tweet, vocab=None):
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(tweet))
    
    tweet_dict = {i:0 for i in unique_words}
    
    for word in tweet:
        tweet_dict[word] += 1
    
    return tweet_dict

test_vectorized = count_vectorize(df['clean_text'][0], vocab=None)
print(test_vectorized)

{'hr': 1, 'iphone': 1, 'upgrade': 1, 'dead': 1, 'plugin': 1, 'station': 1, 'need': 1, 'tweeting': 1, 'g': 1}


In [95]:
df['clean_text'].apply(lambda x: count_vectorize(x))

0       {'hr': 1, 'iphone': 1, 'upgrade': 1, 'dead': 1...
1       {'likely': 1, 'design': 1, 'giving': 1, 't': 1...
2                       {'wait': 1, 'sale': 1, 'also': 1}
3       {'iphone': 1, 'hope': 1, 'year': 2, 'isnt': 1,...
4       {'google': 1, 'great': 1, 'fri': 1, 'matt': 1,...
                              ...                        
9088              {'ipad': 1, 'link': 1, 'everywhere': 1}
9089    {'link': 1, 'programming': 1, 'interrupt': 1, ...
9090    {'google': 1, 'physician': 2, 'zeiger': 1, 'ne...
9091    {'verizon': 1, 'iphone': 1, 'attended': 1, 'co...
9092    {'google': 1, 'test': 1, 'checkin': 1, 'link':...
Name: clean_text, Length: 9093, dtype: object