# This version will preprocess the data and save it for other versions to load

import files

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
from operator import itemgetter
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import ConfusionMatrixDisplay
from pathlib import Path

load

In [2]:
# Load the raw tweet CSV into a DataFrame.
# NOTE(review): the 'unicode_escape' codec also decodes backslash escape
# sequences that appear in the text; confirm that is intended for this file.
df = pd.read_csv('../data/tweets.csv', encoding='unicode_escape')

explore

In [3]:
# Summarize the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


rename

In [4]:
# Shorten the verbose source column names to the names used in the rest of
# the notebook.
column_aliases = {
    'tweet_text': 'text',
    'emotion_in_tweet_is_directed_at': 'company',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment',
}
df.rename(columns=column_aliases, inplace=True)

look at missing values

In [5]:
# Show the rows whose tweet text is missing.
df[df.text.isna()]

Unnamed: 0,text,company,sentiment
6,,,No emotion toward brand or product


can't do anything without the text of the tweet, so drop

In [6]:
# Remove rows that have no tweet text (mutates df in place).
df.dropna(subset=['text'], inplace=True)

check duplicates

In [7]:
# Count rows that exactly repeat an earlier row in every column (True)
# versus all other rows (False).
df.duplicated().value_counts()

False    9070
True       22
Name: count, dtype: int64

drop duplicates based on the tweet text alone: rows with identical text are dropped even if their company or sentiment labels differ (only the first occurrence is kept)

In [8]:
# De-duplicate on the tweet text only; the first row for each distinct text
# is kept (pandas default) and df is mutated in place.
df.drop_duplicates(subset=['text'], inplace=True)

edit, simplify, rename

In [9]:
# Distribution of the raw sentiment labels.
df.sentiment.value_counts()

sentiment
No emotion toward brand or product    5372
Positive emotion                      2968
Negative emotion                       569
I can't tell                           156
Name: count, dtype: int64

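simplify sentiment into a binary label (1 = positive emotion, 0 = everything else: negative, no emotion, or can't tell); this also reduces the class imbalance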

In [10]:
# Collapse the four raw labels into a binary target:
#   1 = positive emotion, 0 = everything else (negative, none, can't tell).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['sentiment']: an in-place method on a
# selected column is chained assignment, which pandas deprecates and which
# no longer updates df once Copy-on-Write is enabled.
sentiment_codes = {
    'No emotion toward brand or product': 0,
    'Positive emotion': 1,
    'Negative emotion': 0,
    "I can't tell": 0,
}
df['sentiment'] = df['sentiment'].replace(sentiment_codes)

In [11]:
# Class balance of the sentiment column after the binary recoding.
df.sentiment.value_counts()

sentiment
0    6097
1    2968
Name: count, dtype: int64

look at company

In [12]:
# Distribution of the raw brand/product labels; value_counts() leaves out
# missing values by default, so unlabeled rows are not shown here.
df.company.value_counts()

company
iPad                               943
Apple                              659
iPad or iPhone App                 469
Google                             428
iPhone                             296
Other Google product or service    293
Android App                         80
Android                             77
Other Apple product or service      35
Name: count, dtype: int64

In [13]:
# Group the fine-grained product labels into 'apple' / 'google', and mark
# rows without a label as 'other'.
# The chain is assigned back to the column rather than using inplace=True on
# df['company']: in-place methods on a selected column are chained
# assignment, which pandas deprecates and which no longer updates df once
# Copy-on-Write is enabled.
apple_labels = ['iPad', 'Apple', 'iPad or iPhone App', 'iPhone',
                'Other Apple product or service']
google_labels = ['Google', 'Other Google product or service', 'Android App',
                 'Android']
df['company'] = (
    df['company']
    .replace(apple_labels, 'apple')
    .replace(google_labels, 'google')
    .fillna('other')
)

In [14]:
# Company counts after grouping the labels into apple / google / other.
df.company.value_counts()

company
other     5785
apple     2402
google     878
Name: count, dtype: int64

deal with missing company
missing company values are inferred from the text, and lower-casing the text first simplifies the keyword matching
this is no big deal because we want all-lower-case text for train and test anyway

In [15]:
# Lower-case every tweet so the lower-case keyword lists used below can match.
df['text'] = df['text'].str.lower()

In [16]:
# Lower-case keywords whose presence in a tweet marks it as being about
# Apple or Google respectively.
apple_words = ['ipad', 'apple', 'iphone', 'itunes', 'ipad2']
google_words = ['google', 'android', 'blogger']

# Token pattern: runs of two or more word characters delimited by word
# boundaries, so single-character tokens and punctuation are discarded.
basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

def company_fix(text, company, apple_terms=None, google_terms=None, tokenize=None):
    """Infer the company for a tweet whose company label is ``'other'``.

    A label other than ``'other'`` is returned unchanged. Otherwise the text
    is tokenized and compared, token by token, against the Apple and Google
    keyword lists.

    Parameters
    ----------
    text : str
        Tweet text. Matching is case-sensitive, so the text should already be
        in the same case as the keywords.
    company : str
        Current company label for the tweet.
    apple_terms, google_terms : iterable of str, optional
        Keywords identifying each company. Default to the module-level
        ``apple_words`` and ``google_words``.
    tokenize : callable, optional
        Function mapping a string to an iterable of tokens. Defaults to
        ``tokenizer.tokenize`` from the module-level tokenizer.

    Returns
    -------
    str
        ``company`` itself when it is not ``'other'``; otherwise one of
        ``'apple'``, ``'google'``, ``'both'`` or ``'neither'``.
    """
    if company != 'other':
        return company

    # Fall back to the notebook-level keyword lists and tokenizer.
    if apple_terms is None:
        apple_terms = apple_words
    if google_terms is None:
        google_terms = google_words
    if tokenize is None:
        tokenize = tokenizer.tokenize

    # Whole-token comparison: 'pineapple' must not count as 'apple'.
    tokens = set(tokenize(text))
    mentions_apple = any(term in tokens for term in apple_terms)
    mentions_google = any(term in tokens for term in google_terms)

    # Plain boolean logic instead of bitwise `&` / `~`: `~True` is -2 (still
    # truthy), and `~` on bool is deprecated as of Python 3.12.
    if mentions_apple and mentions_google:
        return 'both'
    if mentions_apple:
        return 'apple'
    if mentions_google:
        return 'google'
    return 'neither'

# Re-derive the company label row by row from each tweet's text and its
# current label.
df['company'] = [
    company_fix(tweet, label)
    for tweet, label in zip(df['text'], df['company'])
]

In [17]:
# Company counts after inferring the 'other' labels from the tweet text.
df.company.value_counts()

company
apple      5390
google     2783
neither     716
both        176
Name: count, dtype: int64

could do more here to explore the neither and both values

move on to language processing

train-test split

In [19]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,text,company,sentiment
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,apple,0
1,@jessedee know about @fludapp ? awesome ipad/i...,apple,1
2,@swonderlin can not wait for #ipad 2 also. the...,apple,1
3,@sxsw i hope this year's festival isn't as cra...,apple,0
4,@sxtxstate great stuff on fri #sxsw: marissa m...,google,1


In [18]:
# Write the processed frame to disk, creating the target directory first if
# it does not exist yet. The DataFrame index is written as the first column.
filepath = Path('../data/df.csv')
output_dir = filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath)