In [1]:
# notebook for module 4 project - Kai Graham

In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# set random seed
np.random.seed(seed=23)

In [4]:
# need to think about validation and train-test splits

In [5]:
# load dataset and begin exploring
df = pd.read_csv('judge-1377884607_tweet_product_company.csv', encoding='latin_1')
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [6]:
# as we can see above, we have successfully loaded the dataset
# further information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [7]:
# rename columns so they are easier to work with 
df.columns = ['text', 'product', 'emotion']
df.head()

Unnamed: 0,text,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [8]:
# check for missing values
df.isna().sum()

text          1
product    5802
emotion       0
dtype: int64

In [9]:
# there appear to be quite a bit of missing product entries - examine further
missing_products = df.loc[df['product'].isna()]
missing_products.head()

Unnamed: 0,text,product,emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
16,Holler Gram for iPad on the iTunes App Store -...,,No emotion toward brand or product
32,"Attn: All #SXSW frineds, @mention Register fo...",,No emotion toward brand or product
33,Anyone at #sxsw want to sell their old iPad?,,No emotion toward brand or product


In [10]:
# see if there are any entries not listed as no emotion toward brand or product
missing_products['emotion'].unique()

array(['No emotion toward brand or product', 'Positive emotion',
       'Negative emotion', "I can't tell"], dtype=object)

In [11]:
missing_products['emotion'].value_counts()

No emotion toward brand or product    5298
Positive emotion                       306
I can't tell                           147
Negative emotion                        51
Name: emotion, dtype: int64

In [12]:
# examine the one missing text entry
df.loc[df['text'].isna()]

Unnamed: 0,text,product,emotion
6,,,No emotion toward brand or product


In [13]:
# looks like no text, we will drop this entry
clean_df = df.dropna(subset=['text'])
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9092 non-null   object
 1   product  3291 non-null   object
 2   emotion  9092 non-null   object
dtypes: object(3)
memory usage: 284.1+ KB


In [14]:
# remove all I can't tell from the dataset as we don't have proper labels for these
clean_df = clean_df.loc[clean_df['emotion'] != "I can't tell"]
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8936 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     8936 non-null   object
 1   product  3282 non-null   object
 2   emotion  8936 non-null   object
dtypes: object(3)
memory usage: 279.2+ KB


In [15]:
# for the time being we will ignore the product column as we are only focused on 
# emotion of the texts - drop the product column 
clean_df = clean_df.drop(['product'], axis=1)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8936 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     8936 non-null   object
 1   emotion  8936 non-null   object
dtypes: object(2)
memory usage: 209.4+ KB


In [16]:
clean_df.head()

Unnamed: 0,text,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [17]:
# check if any further missing values or duplicates
clean_df.isna().any()

text       False
emotion    False
dtype: bool

In [18]:
# check duplicates
clean_df.duplicated().sum()

22

In [19]:
# remove duplicates as there are only 22 in our dataset
clean_df = clean_df.drop_duplicates()

In [20]:
# check it worked
clean_df.duplicated().any()

False

In [21]:
# no more duplicates -- good to move on to the next stage

In [22]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8914 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     8914 non-null   object
 1   emotion  8914 non-null   object
dtypes: object(2)
memory usage: 208.9+ KB


In [23]:
# we have 8914 records remaining, see how may are listed as neutral
clean_df['emotion'].value_counts()

No emotion toward brand or product    5375
Positive emotion                      2970
Negative emotion                       569
Name: emotion, dtype: int64

In [24]:
# looks like the majority are no emotion toward brand or product, but to begin
# we will focus just on building a binary nlp model
# drop entries listed as no emotion
binary_clean_df = clean_df.loc[clean_df['emotion'] != 'No emotion toward brand or product']
binary_clean_df['emotion'].value_counts()

Positive emotion    2970
Negative emotion     569
Name: emotion, dtype: int64

In [39]:
# split dataset into train and test set
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(binary_clean_df)

In [40]:
# split into data and target
train_data = train_df['text']
train_target = train_df['emotion']

In [41]:
# import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string

In [42]:
# pull in stop words from english language
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

In [43]:
# create function to process a single tweet
def process_tweet(tweet):
    """
    Input: tweet of type str
    Function tokenizes tweet using function from nltk
    Lowercase every token, remove any stopwords found in stopwords_list from the tokenized article, 
    and return the results
    """
    tokens = nltk.word_tokenize(tweet)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed

In [44]:
# use map function to call process_tweet on our data
processed_data = list(map(process_tweet, train_data))

In [58]:
processed_data[:5]

[['think',
  'fell',
  'bit',
  'love',
  'google',
  'today',
  'thanks',
  'throwing',
  'nerd',
  'party',
  'speakeasy',
  'sxsw'],
 ['exclusive', 'shot', 'sxsw', 'popup', 'apple', 'store', 'link'],
 ['save',
  'cash',
  'techcrunch',
  'giveaway',
  'ipad',
  '2åê',
  'techcrunch',
  'link',
  'via',
  'mention',
  'winning',
  'ipad2',
  'sxsw',
  'apple'],
 ['want',
  'time',
  'said',
  'ipad',
  'rt',
  'mention',
  'schedule',
  'heavier',
  'laptop',
  "'s",
  'ipad',
  'already',
  'sxsw'],
 ['rt',
  'mention',
  'google',
  'launching',
  'check',
  'service',
  'sxsw',
  'providing',
  'deals',
  'fmsignal',
  'sxswi']]

In [57]:
train_data.head()

494     I think I fell a bit more in love with #google...
1082    exclusive shot of #SXSW popup Apple Store! {link}
5015    save me some cash! TechCrunch Giveaway: An iPa...
4291    I want some time with this said iPad! RT @ment...
5840    RT @mention Google launching check in service ...
Name: text, dtype: object

In [59]:
# looks like our tokenizing worked properly, as well as the removal of some stop words

In [60]:
# get total vocabulary size of our training set
total_vocab = set()
for tweet in processed_data:
    total_vocab.update(tweet)
len(total_vocab)

5374

In [61]:
# total number of unique words in our training set is 5374