In [43]:
import pandas as pd
import nltk

# Later cells use the NLTK stop-word corpus and word_tokenize, which need
# their own data packages in addition to WordNet. Fetch them up front so the
# notebook survives Restart & Run All on a machine without cached NLTK data.
# "punkt_tab" is the tokenizer package requested by newer NLTK releases,
# "punkt" by older ones; downloading both keeps either version working.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aungs_c08ycsn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Load the tweet dataset, decoding the CSV as ISO-8859-1, and preview it.
df = pd.read_csv(filepath_or_buffer="data/tweets.csv", encoding="ISO-8859-1")

df.head(n=3)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion


In [40]:
# Pull out the raw tweet column as a Series; the following cells work on it.
text = df.loc[:, "tweet_text"]
text

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...
Name: tweet_text, Length: 9093, dtype: object

In [29]:
from nltk.corpus import stopwords

# English stop words with apostrophes stripped (e.g. "weren't" -> "werent"),
# built in a single pass over the NLTK list.
stop_words = [entry.replace("'", "") for entry in stopwords.words("english")]

# Peek at the tail of the list to confirm the apostrophes are gone.
stop_words[-5:]

['werent', 'won', 'wont', 'wouldn', 'wouldnt']

In [33]:
# Raw string so the regex escape \s reaches the regex engine untouched. In a
# normal string literal "\s" is an invalid escape sequence, which recent
# Python versions warn about. The resulting pattern is unchanged.
ex = r"[^a-zA-Z\s]"

# Delete every character that is not an ASCII letter or whitespace, then
# lowercase the result.
# NOTE: characters are deleted, not replaced by a space, so words separated
# only by punctuation are fused (e.g. "iPad/iPhone" becomes "ipadiphone").
text = text.str.replace(ex, "", regex=True).str.lower()
text

0       wesley i have a g iphone after  hrs tweeting a...
1       jessedee know about fludapp  awesome ipadiphon...
2       swonderlin can not wait for ipad  also they sh...
3       sxsw i hope this years festival isnt as crashy...
4       sxtxstate great stuff on fri sxsw marissa maye...
                              ...                        
9088                            ipad everywhere sxsw link
9089    wave buzz rt mention we interrupt your regular...
9090    googles zeiger a physician never reported pote...
9091    some verizon iphone customers complained their...
9092    rt mention google tests checkin offers at sxsw...
Name: tweet_text, Length: 9093, dtype: object

In [34]:
from nltk.tokenize import word_tokenize

# Split each tweet into word tokens. Any non-string value (e.g. NaN for a
# missing tweet) maps to an empty token list; passing it through str() would
# instead produce the bogus token "nan", which would then leak into the
# downstream vocabulary.
tokenized_text = text.apply(
    lambda t: word_tokenize(t) if isinstance(t, str) else []
)
tokenized_text

0       [wesley, i, have, a, g, iphone, after, hrs, tw...
1       [jessedee, know, about, fludapp, awesome, ipad...
2       [swonderlin, can, not, wait, for, ipad, also, ...
3       [sxsw, i, hope, this, years, festival, isnt, a...
4       [sxtxstate, great, stuff, on, fri, sxsw, maris...
                              ...                        
9088                       [ipad, everywhere, sxsw, link]
9089    [wave, buzz, rt, mention, we, interrupt, your,...
9090    [googles, zeiger, a, physician, never, reporte...
9091    [some, verizon, iphone, customers, complained,...
9092    [rt, mention, google, tests, checkin, offers, ...
Name: tweet_text, Length: 9093, dtype: object

In [37]:
# Use a set for constant-time membership tests; stop_words is a list, so
# checking every token against it would scan the whole list each time.
stop_word_set = set(stop_words)

# Keep tokens that are longer than one character and are not stop words.
# Both conditions are applied in a single pass over each token list.
filtered_text = tokenized_text.apply(
    lambda tokens: [
        token
        for token in tokens
        if len(token) > 1 and token not in stop_word_set
    ]
)
filtered_text

0       [wesley, iphone, hrs, tweeting, riseaustin, de...
1       [jessedee, know, fludapp, awesome, ipadiphone,...
2              [swonderlin, wait, ipad, also, sale, sxsw]
3       [sxsw, hope, years, festival, crashy, years, i...
4       [sxtxstate, great, stuff, fri, sxsw, marissa, ...
                              ...                        
9088                       [ipad, everywhere, sxsw, link]
9089    [wave, buzz, rt, mention, interrupt, regularly...
9090    [googles, zeiger, physician, never, reported, ...
9091    [verizon, iphone, customers, complained, time,...
9092    [rt, mention, google, tests, checkin, offers, ...
Name: tweet_text, Length: 9093, dtype: object

In [44]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize token by token. No part-of-speech argument is passed, so the
# lemmatizer's default part of speech applies to every word.
lemmatized_text = filtered_text.apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

# Re-join each token list into one space-separated string per tweet.
lemmatized_str = lemmatized_text.apply(lambda lemmas: " ".join(lemmas))
lemmatized_str

0       wesley iphone hr tweeting riseaustin dead need...
1       jessedee know fludapp awesome ipadiphone app l...
2                     swonderlin wait ipad also sale sxsw
3       sxsw hope year festival crashy year iphone app...
4       sxtxstate great stuff fri sxsw marissa mayer g...
                              ...                        
9088                            ipad everywhere sxsw link
9089    wave buzz rt mention interrupt regularly sched...
9090    google zeiger physician never reported potenti...
9091    verizon iphone customer complained time fell b...
9092       rt mention google test checkin offer sxsw link
Name: tweet_text, Length: 9093, dtype: object

In [50]:
# Store the space-joined lemmas as a new column next to the raw tweets,
# then preview the first five rows.
df["processed_text"] = lemmatized_str
df.head(n=5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,processed_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley iphone hr tweeting riseaustin dead need...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee know fludapp awesome ipadiphone app l...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin wait ipad also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw hope year festival crashy year iphone app...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff fri sxsw marissa mayer g...


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the space-joined lemmas and transform them into
# a document-term matrix (one row per tweet).
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
)
tfidf_matrix = vectorizer.fit_transform(lemmatized_str)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])