## 1. Import Packages and Files

In [1]:
import pandas as pd
import numpy as np
import nltk 

pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 100)

In [2]:
df=pd.read_csv('judge-1377884607_tweet_product_company.csv', encoding = 'unicode_escape')

## 2. Data Cleaning

In [3]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
df.emotion_in_tweet_is_directed_at.value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [5]:
df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [6]:
df.is_there_an_emotion_directed_at_a_brand_or_product.isna().sum()

0

In [7]:
df.emotion_in_tweet_is_directed_at.isna().sum()

5802

In [8]:
df.shape

(9093, 3)

In [9]:
df.columns=['tweet','company','emotion']

In [10]:
df[df.emotion=='I can\'t tell'].emotion="No emotion toward brand or product"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [11]:
df.emotion.value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [30]:
punctuation=[",",".","#","!","@","$","?","$","%","&","-","_"]
apple=['iPhone','iPad&quot;',"iphone ",'i Pad','i Pad',"Apple.","iPad?","#iPad","#iPhone",'iPad','Mac',"iphone,",'iPod','ipod','ipad','mac','Apple','apple','Apple\'s','apple\'s',"#Apple",'#apple','I-Pad']
google=['Google','google','android','Android','Google.','android,',"#android",'Google\'s','google\'s','googles' ,'#google','#Google']

In [13]:
df.company.unique()

array(['iPhone', 'iPad or iPhone App', 'iPad', 'Google', nan, 'Android',
       'Apple', 'Android App', 'Other Google product or service',
       'Other Apple product or service'], dtype=object)

In [14]:
# Renames emotion's column values
df['emotion'] = df['emotion'].apply(lambda x: "No emotion toward brand or product" if x == "I can't tell" else x)

In [15]:
# Lists with the company's values
apple_products = ['iPhone', 'iPad or iPhone App', 'iPad','Apple','Other Apple product or service']
google_products = ['Google','Android', 'Android App', 'Other Google product or service']

In [16]:
# Checks emotion values
df['emotion'].value_counts()

No emotion toward brand or product    5545
Positive emotion                      2978
Negative emotion                       570
Name: emotion, dtype: int64

In [17]:
# Drops null values from tweet column
df.tweet.dropna(inplace=True)

In [18]:
df[df.company.isna()].head()

Unnamed: 0,tweet,company,emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
16,Holler Gram for iPad on the iTunes App Store -...,,No emotion toward brand or product
32,"Attn: All #SXSW frineds, @mention Register fo...",,No emotion toward brand or product
33,Anyone at #sxsw want to sell their old iPad?,,No emotion toward brand or product


In [19]:
df[df.emotion=='No emotion toward brand or product'].company.value_counts()

iPad                               28
Apple                              23
Google                             16
Other Google product or service    10
iPad or iPhone App                 10
iPhone                             10
Android                             1
Android App                         1
Other Apple product or service      1
Name: company, dtype: int64

In [20]:
df[df.emotion=='No emotion toward brand or product'].company.isna().sum()

5445

In [21]:
for index in list(df[df.company.isna()].tweet.dropna().index):
    tweet = ''.join(i for i in df.tweet[index] if not i in punctuation)
    if bool(set(tweet.split())&set(apple)) == True:
        df.company[index]=list(set(tweet.split())&set(apple))[0]
    if bool(set(tweet.split())&set(google)) == True:
        df.company[index]=list(set(tweet.split())&set(google))[0]  

In [22]:
df['brand'] = df['company'].apply(lambda x: 'Apple' if x in apple_products else 'Google' if x in google_products else '')

In [23]:
df['brand'].value_counts()

Apple     4792
Google    2327
          1974
Name: brand, dtype: int64

### Remove Non-English Words

In [24]:
df.tweet[1]

"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW"

In [25]:
words = set(nltk.corpus.words.words())

sent = df.tweet[1]
" ".join(w for w in nltk.wordpunct_tokenize(sent) \
         if w.lower() in words or not w.isalpha())

"@ Know about @ ? Awesome / that you ' likely appreciate for its design . Also , they ' re giving free at #"

## Tokenization

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer

stopwords = ['am','he','i','the','hi']

vectorizer = TfidfVectorizer(ngram_range=(1,1),stop_words=stopwords, token_pattern=(r'[a-zA-Z]+'))

text_tf = vectorizer.fit_transform(df['tweet'].apply(lambda x: np.str_(x)))

# text_tf = vectorizer.fit_transform(df['tweet'])
text_tf.data

array([0.03746796, 0.33746827, 0.31834971, ..., 0.18673792, 0.08104572,
       0.17591944])

In [28]:
# df[['company','emotion','tweet']].groupby(['company','emotion']).count()

In [29]:
df.head()

Unnamed: 0,tweet,company,emotion,brand
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Apple
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Apple
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Apple
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Apple
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google


In [None]:
# # We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
# from sklearn.feature_extraction.text import CountVectorizer

# cv = CountVectorizer(stop_words='english')
# data_cv = cv.fit_transform(df.tweet)
# data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
# data_dtm.index = data_clean.index
# data_dtm

In [None]:
text_tf

In [None]:
pd.DataFrame(text_tf).head()

In [None]:
df = pd.DataFrame(text_tf.todense(),columns = vectorizer.get_feature_names())
df