In [34]:
# notebook for module 4 project - Kai Graham

In [35]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
# seed NumPy's global random number generator so random draws repeat across runs
np.random.seed(23)

In [37]:
# TODO: decide on a validation strategy and how to do the train-test split

In [38]:
# read the tweet CSV into a DataFrame (latin_1 encoding) and preview the first rows
df = pd.read_csv(
    'judge-1377884607_tweet_product_company.csv',
    encoding='latin_1',
)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [39]:
# the preview above shows the dataset loaded successfully;
# print column dtypes and non-null counts for a closer look
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [40]:
# rename columns so they are easier to work with (names are assigned by position):
#   tweet_text                                         -> text
#   emotion_in_tweet_is_directed_at                    -> product
#   is_there_an_emotion_directed_at_a_brand_or_product -> emotion
df.columns = ['text', 'product', 'emotion']
df.head()

Unnamed: 0,text,product,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [41]:
# count the missing (NaN) values in each column
df.isnull().sum()

text          1
product    5802
emotion       0
dtype: int64

In [42]:
# the product column has many missing entries - pull those rows out to examine further
missing_products = df[df['product'].isnull()]
missing_products.head()

Unnamed: 0,text,product,emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
16,Holler Gram for iPad on the iTunes App Store -...,,No emotion toward brand or product
32,"Attn: All #SXSW frineds, @mention Register fo...",,No emotion toward brand or product
33,Anyone at #sxsw want to sell their old iPad?,,No emotion toward brand or product


In [43]:
# list the distinct emotion labels among the rows with no product, to see
# whether any are something other than 'No emotion toward brand or product'
missing_products['emotion'].unique()

array(['No emotion toward brand or product', 'Positive emotion',
       'Negative emotion', "I can't tell"], dtype=object)

In [44]:
# count how many of the rows with no product carry each emotion label
missing_products['emotion'].value_counts()

No emotion toward brand or product    5298
Positive emotion                       306
I can't tell                           147
Negative emotion                        51
Name: emotion, dtype: int64

In [45]:
# show the single row whose tweet text is missing
df[df['text'].isnull()]

Unnamed: 0,text,product,emotion
6,,,No emotion toward brand or product


In [46]:
# that row has no text at all, so drop it and confirm the new row count
clean_df = df.dropna(axis='index', how='any', subset=['text'])
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9092 non-null   object
 1   product  3291 non-null   object
 2   emotion  9092 non-null   object
dtypes: object(3)
memory usage: 284.1+ KB


In [47]:
# rows labelled "I can't tell" have no usable sentiment label, so filter them out
clean_df = clean_df[clean_df['emotion'].ne("I can't tell")]
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8936 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     8936 non-null   object
 1   product  3282 non-null   object
 2   emotion  8936 non-null   object
dtypes: object(3)
memory usage: 279.2+ KB


In [48]:
# for the time being we only look at the emotion of the text, so the product
# column is not needed - drop it.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'product' has already been removed.
clean_df = clean_df.drop(columns=['product'], errors='ignore')
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8936 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     8936 non-null   object
 1   emotion  8936 non-null   object
dtypes: object(2)
memory usage: 209.4+ KB


In [49]:
# preview the remaining text and emotion columns
clean_df.head()

Unnamed: 0,text,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [50]:
# confirm that no column has any missing values left
clean_df.isnull().any()

text       False
emotion    False
dtype: bool

In [51]:
# check duplicates: count rows that exactly repeat an earlier row
clean_df.duplicated().sum()

22

In [52]:
# only 22 rows are duplicates, so discard them and keep the first occurrence of each
clean_df = clean_df.drop_duplicates(keep='first')

In [53]:
# check it worked: evaluates to False when no duplicate rows remain
clean_df.duplicated().any()

False

In [54]:
# no more duplicates -- good to move on to the next stage

In [55]:
# split dataset into train and test set (sklearn's default 75% / 25% split).
# random_state is passed explicitly (same value as the notebook's NumPy seed)
# so the split is identical no matter which cells ran, or were re-run, before
# this one; relying on the global seed alone ties the split to execution order.
# NOTE(review): consider stratify=clean_df['emotion'] if class balance matters.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(clean_df, random_state=23)

In [56]:
# TODO: still need to decide on a validation approach to go with this train-test split

In [57]:
# report how many rows ended up in the train and test splits, one count per line
print(len(train_data), len(test_data), sep='\n')

6685
2229


In [58]:
# preview the first rows of the training split
train_data.head()

Unnamed: 0,text,emotion
6453,RT @mention Preparing for #sxsw talk about #go...,Positive emotion
4150,#SXSW Go let's you rate &amp; review sessions ...,Negative emotion
3122,Session next - Designing iPad Interfaces - New...,No emotion toward brand or product
6979,RT @mention Yeah! Love the Google! Cheese w/ @...,Positive emotion
7527,[SoftLayer Blog] SoftLayer #SxSW Server Challe...,Positive emotion


In [60]:
# start cleaning datasets - stop words, lemmatizing, etc.