## Contents

1. [Preprocessing Tweets](#preprocessing-tweets)

In [35]:
import pandas as pd
import nltk

# Fetch every NLTK resource this notebook asks for. A `for` statement produces no
# cell output, so the return value of the final download stays hidden.
for resource in ("wordnet", "stopwords", 'punkt', 'averaged_perceptron_tagger', 'words'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aungs_tko91wk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aungs_tko91wk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aungs_tko91wk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aungs_tko91wk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\aungs_tko91wk\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Preprocessing Tweets

In [36]:
# Location of the raw tweets and the short column names used from here on.
tweets_path = "data/tweets.csv"
column_names = ["unprocessed_tweet", "product", "emotion"]

# The file is read as ISO-8859-1; the original header row is replaced by `column_names`.
df = pd.read_csv(tweets_path, encoding='ISO-8859-1')
df.columns = column_names

df.head(3)

Unnamed: 0,unprocessed_tweet,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion


In [37]:
# Working Series for the cleaning steps: the raw tweet text column.
text = df.loc[:, "unprocessed_tweet"]
text

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...
Name: unprocessed_tweet, Length: 9093, dtype: object

In [38]:
from nltk.corpus import stopwords

# Corpus-specific tokens that are dropped together with the English stop words.
top_words = ['sxsw', 'mention', 'link', 'rt']

# Apostrophes are removed from the NLTK stop words so that they still match
# after punctuation has been stripped from the tweets (e.g. "isn't" -> "isnt").
stop_words = [
    word.replace("'", '') for word in stopwords.words("english")
] + top_words

In [39]:
# Pattern matching every character that is neither an ASCII letter nor whitespace.
# Written as a raw string: in a plain literal "\s" is an invalid escape sequence,
# which Python reports as a DeprecationWarning (SyntaxWarning on newer versions).
# The compiled pattern is unchanged.
ex = r"[^a-zA-Z\s]"

# Delete the matched characters (digits, punctuation, symbols), then lowercase.
# NOTE(review): characters are deleted rather than replaced by a space, so
# "iPad/iPhone" becomes "ipadiphone" - confirm this merging is intended.
text = text.str.replace(ex, "", regex=True)
text = text.str.lower()
text

0       wesley i have a g iphone after  hrs tweeting a...
1       jessedee know about fludapp  awesome ipadiphon...
2       swonderlin can not wait for ipad  also they sh...
3       sxsw i hope this years festival isnt as crashy...
4       sxtxstate great stuff on fri sxsw marissa maye...
                              ...                        
9088                            ipad everywhere sxsw link
9089    wave buzz rt mention we interrupt your regular...
9090    googles zeiger a physician never reported pote...
9091    some verizon iphone customers complained their...
9092    rt mention google tests checkin offers at sxsw...
Name: unprocessed_tweet, Length: 9093, dtype: object

In [40]:
from nltk.tokenize import word_tokenize

# Split every cleaned tweet into word tokens.
# Missing tweets are NaN and pass through the `.str` cleaning steps unchanged;
# calling str() on them would yield the literal text "nan", which then survives
# as a bogus token in the processed tweet. Replace them with an empty string so
# a missing tweet tokenizes to an empty list instead.
tokenized_text = text.fillna("").apply(word_tokenize)
tokenized_text

0       [wesley, i, have, a, g, iphone, after, hrs, tw...
1       [jessedee, know, about, fludapp, awesome, ipad...
2       [swonderlin, can, not, wait, for, ipad, also, ...
3       [sxsw, i, hope, this, years, festival, isnt, a...
4       [sxtxstate, great, stuff, on, fri, sxsw, maris...
                              ...                        
9088                       [ipad, everywhere, sxsw, link]
9089    [wave, buzz, rt, mention, we, interrupt, your,...
9090    [googles, zeiger, a, physician, never, reporte...
9091    [some, verizon, iphone, customers, complained,...
9092    [rt, mention, google, tests, checkin, offers, ...
Name: unprocessed_tweet, Length: 9093, dtype: object

In [41]:
from nltk.tag import pos_tag

# Membership tests against a list scan the whole list for every token; a set makes
# each lookup constant-time. `stop_words` itself is left as a list for other users.
stop_word_set = set(stop_words)

# Single pass per tweet: keep tokens longer than one character that are not stop
# words (single letters are leftovers such as the "g" of "3G" once digits are gone).
filtered_text = tokenized_text.apply(
    lambda tokens: [
        token for token in tokens
        if len(token) > 1 and token not in stop_word_set
    ]
)

filtered_text

0       [wesley, iphone, hrs, tweeting, riseaustin, de...
1       [jessedee, know, fludapp, awesome, ipadiphone,...
2                    [swonderlin, wait, ipad, also, sale]
3       [hope, years, festival, crashy, years, iphone,...
4       [sxtxstate, great, stuff, fri, marissa, mayer,...
                              ...                        
9088                                   [ipad, everywhere]
9089    [wave, buzz, interrupt, regularly, scheduled, ...
9090    [googles, zeiger, physician, never, reported, ...
9091    [verizon, iphone, customers, complained, time,...
9092                     [google, tests, checkin, offers]
Name: unprocessed_tweet, Length: 9093, dtype: object

In [42]:
# Part-of-speech tag each tweet's token list; every element becomes a list of
# (word, tag) pairs.
tagged_text = filtered_text.apply(pos_tag)

tagged_text

0       [(wesley, NN), (iphone, NN), (hrs, NN), (tweet...
1       [(jessedee, NN), (know, VBP), (fludapp, VBZ), ...
2       [(swonderlin, NN), (wait, NN), (ipad, NN), (al...
3       [(hope, NN), (years, NNS), (festival, JJ), (cr...
4       [(sxtxstate, NN), (great, JJ), (stuff, NN), (f...
                              ...                        
9088                       [(ipad, NN), (everywhere, RB)]
9089    [(wave, NN), (buzz, NN), (interrupt, VBP), (re...
9090    [(googles, NNS), (zeiger, RBR), (physician, JJ...
9091    [(verizon, NN), (iphone, NN), (customers, NNS)...
9092    [(google, NN), (tests, NNS), (checkin, VBP), (...
Name: unprocessed_tweet, Length: 9093, dtype: object

In [43]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Single WordNetLemmatizer instance, reused for every token lemmatized in this cell.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag (e.g. 'JJ', 'VBP', 'NNS', 'RB') into a WordNet POS constant.

    The tag is matched on its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def _lemmatize_tokens(tagged_tokens):
    """Lemmatize one tweet's (word, tag) pairs, using each tag to pick the WordNet POS."""
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]


lemmatized_text = tagged_text.apply(_lemmatize_tokens)

# Glue each tweet's lemmas back into one space-separated string.
lemmatized_str = lemmatized_text.apply(' '.join)

lemmatized_str

0       wesley iphone hr tweet riseaustin dead need up...
1       jessedee know fludapp awesome ipadiphone app l...
2                          swonderlin wait ipad also sale
3               hope year festival crashy year iphone app
4       sxtxstate great stuff fri marissa mayer google...
                              ...                        
9088                                      ipad everywhere
9089    wave buzz interrupt regularly schedule geek pr...
9090    google zeiger physician never report potential...
9091    verizon iphone customer complain time fell bac...
9092                            google test checkin offer
Name: unprocessed_tweet, Length: 9093, dtype: object

In [44]:
# Store the cleaned, lemmatized text next to the raw tweet it was derived from.
df.loc[:, "processed_tweet"] = lemmatized_str
df.head(5)

Unnamed: 0,unprocessed_tweet,product,emotion,processed_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley iphone hr tweet riseaustin dead need up...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee know fludapp awesome ipadiphone app l...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin wait ipad also sale
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,hope year festival crashy year iphone app
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff fri marissa mayer google...


In [45]:
# Number of missing values in each column.
df.isna().sum()

unprocessed_tweet       1
product              5802
emotion                 0
processed_tweet         0
dtype: int64

In [48]:
# Persist the full frame (all emotion labels) with the processed text column.
df.to_csv(path_or_buf="data/processed_tweets.csv")

In [33]:
# Integer codes for the two emotion labels kept in the binary dataset.
emotion_codes = {"Negative emotion": 0, "Positive emotion": 1}

# Keep only the positive/negative rows. The copy is taken AFTER filtering so that
# adding a column below writes to an independent frame rather than to a filtered
# slice of `df` (which is what triggers SettingWithCopyWarning).
df_binary = df[df["emotion"].isin(list(emotion_codes))].copy()

# An explicit mapping yields an integer column directly, instead of relying on
# chained string->int `replace` calls being downcast from object dtype.
df_binary["emotion_encoded"] = df_binary["emotion"].map(emotion_codes)
df_binary.to_csv("data/processed_tweets_binary.csv")

In [34]:
# How many processed tweets still contain the substring "html".
html_mask = df['processed_tweet'].str.contains('html')
len(df.loc[html_mask, 'processed_tweet'])

34

---