import files

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
from operator import itemgetter
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import ConfusionMatrixDisplay

load

In [2]:
df = pd.read_csv('../data/tweets.csv', encoding='unicode_escape')

explore

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


rename

In [4]:
df.rename(columns={'tweet_text': 'text',
                   'emotion_in_tweet_is_directed_at': 'company',
                   'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'},
          inplace = True)

look at missing values

In [5]:
df[df.text.isna()]

Unnamed: 0,text,company,sentiment
6,,,No emotion toward brand or product


can't do anything without the text of the tweet, so drop

In [6]:
df.dropna(subset=['text'], inplace=True)

check duplicates

In [7]:
df.duplicated().value_counts()

False    9070
True       22
Name: count, dtype: int64

drop duplicates, just text, doesn't matter if same text with different sentiment, etc. (still drop)

In [8]:
df.drop_duplicates(subset=['text'], inplace=True)

edit, simplify, rename

In [9]:
df.sentiment.value_counts()

sentiment
No emotion toward brand or product    5372
Positive emotion                      2968
Negative emotion                       569
I can't tell                           156
Name: count, dtype: int64

simplify sentiment into binary, reduce class imbalance

In [10]:
df['sentiment'].replace({'No emotion toward brand or product': 0,
                         'Positive emotion': 1,
                         'Negative emotion': 0,
                         "I can't tell": 0
                        }, inplace=True)

In [11]:
df.sentiment.value_counts()

sentiment
0    6097
1    2968
Name: count, dtype: int64

look at company

In [12]:
df.company.value_counts()

company
iPad                               943
Apple                              659
iPad or iPhone App                 469
Google                             428
iPhone                             296
Other Google product or service    293
Android App                         80
Android                             77
Other Apple product or service      35
Name: count, dtype: int64

In [13]:
df['company'].replace(['iPad', 'Apple', 'iPad or iPhone App', 'iPhone', 'Other Apple product or service'], 'apple',
                     inplace=True)
df['company'].replace(['Google', 'Other Google product or service', 'Android App', 'Android'], 'google',
                     inplace=True)
df['company'].fillna('other',
                    inplace=True)

In [14]:
df.company.value_counts()

company
other     5785
apple     2402
google     878
Name: count, dtype: int64

deal with missing company
missing company values are informed by the text, and the text should be all lower case to simplify this
no big deal because we want all lower case for train and test anyway

In [15]:
df['text'] = df['text'].str.lower()

In [16]:
apple_words = ['ipad', 'apple', 'iphone', 'itunes', 'ipad2']
google_words = ['google', 'android', 'blogger']

basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

def company_fix(text, company):
    if company != 'other':
        return company
    else:
        apple, google = False, False
        text_tokenized = tokenizer.tokenize(text)
        for word in apple_words:
            if word in text_tokenized:
                apple = True
                break
        for word in google_words:
            if word in text_tokenized:
                google = True
                break
        if apple & ~google:
            return 'apple'
        elif google & ~apple:
            return 'google'
        elif apple & google:
            return 'both'
        else:
            return 'neither'

df['company'] = df.apply(lambda x: company_fix(x.text, x.company), axis=1)

In [17]:
df.company.value_counts()

company
apple      5390
google     2783
neither     716
both        176
Name: count, dtype: int64

could do more here to explore the neither and both values

move on to language processing

train-test split

In [18]:
X, y = df['text'].to_frame(), df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

add label to X_train for research purposes .. obviously don't include this in the model

reset index to anticipate future problems ... or not reset the index???

In [19]:
X_train.loc[:, 'label'] = [y_train.loc[val] for val in X_train.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'label'] = [y_train.loc[val] for val in X_train.index]


perfunctory exploring should happen here

top ten visualizations for pos. and non-pos.

size of vocabulary

more?

In [20]:
X_train.loc[:, 'text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)
vocab_raw = set(X_train['text_tokenized'].explode())
print('Size of raw vocabulary:', len(vocab_raw))   

Size of raw vocabulary: 8876


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)


gonna need naive bayes, might not do any other models (markov, etc.)

In [21]:
baseline_model = MultinomialNB()

look at plurality winner to see score to beat

In [22]:
plurality_cv = round(y_train.value_counts(normalize=True)[0],4)
y_train.value_counts(normalize=True)

sentiment
0    0.672366
1    0.327634
Name: proportion, dtype: float64

first model, just ten features

In [23]:
tfidf = TfidfVectorizer(
    max_features = 10
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

baseline_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:', plurality_cv,
      '\nBaseline: ',baseline_cv)

Plurality: 0.6724 
Baseline:  0.6724


an absolutely miniscule improvement

let's try all words, not just max_features = 10

In [27]:
tfidf = TfidfVectorizer()

X_train_vectorized = tfidf.fit_transform(X_train['text'])

all_words_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:', plurality_cv,
      '\nBaseline: ', baseline_cv,
      '\nAll Words:', all_words_cv)

Plurality: 0.6724 
Baseline:  0.6724 
All Words: 0.7005


an actual improvement

let's look at which 10 terms were least and most associated with positive sentiment

In [24]:
# instantiate the vectorizer
tfidf = TfidfVectorizer()
# fit the vectorizer on X_train and transform it
X_train_vectorized = tfidf.fit_transform(X_train['text'])

# create array of the word list from this vectorizer with new index
feature_names = np.array(tfidf.get_feature_names())
# create array of the indices of the feature_names array, ordered by tfidf score
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
# create frequency distribution (dictionary) of 1-grams
all_words_freq_dist = FreqDist(X_train['text_tokenized'].explode())

In [44]:
# show the words with the top 10 tfidf values, and their tfidf values
for n in range(-1,-11,-1):
    print(round(X_train_vectorized.max(0).toarray()[0][sorted_tfidf_index[n]],4),
          feature_names[sorted_tfidf_index[n]]
         )      

0.9622 worship
0.9278 rocks
0.9184 hmmmm
0.9014 covet
0.8927 orly
0.8842 location
0.8691 charging
0.8686 whoooooo
0.8622 applestore
0.8585 deleting


In [50]:
# show all instances of this word
X_train[X_train['text'].str.contains('worship')]

Unnamed: 0,text,label,text_tokenized
77,i worship @mention {link} #sxsw,0,"[worship, mention, link, sxsw]"


In [36]:
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['á¾_î¾ð' '_ã' '_ô' 'documents' '¼¼' 'çü' 'öý' 'èï' 'sxsw' 'karaoke']

Largest tfidf: 
['worship' 'rocks' 'hmmmm' 'covet' 'orly' 'location' 'charging' 'whoooooo'
 'applestore' 'deleting']


we can and will explore stopwords, but it seems clear we can stem or lemmatize

In [51]:
stemmer = SnowballStemmer(language="english")

def stem_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [stemmer.stem(token) for token in tokens]

create stemmed vocabulary

In [52]:
X_train.loc[:, 'text_stemmed'] = X_train.loc[:, 'text'].apply(stem_and_tokenize)
vocab_stemmed = set(X_train['text_stemmed'].explode())
print('Size of raw vocabulary:    ', len(vocab_raw))
print('Size of stemmed vocabulary:', len(vocab_stemmed))

Size of raw vocabulary:     8876
Size of stemmed vocabulary: 7016


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'text_stemmed'] = X_train.loc[:, 'text'].apply(stem_and_tokenize)


In [53]:
tfidf = TfidfVectorizer(
    tokenizer = stem_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

stemmed_words_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:    ', plurality_cv,
      '\nBaseline:     ', baseline_cv,
      '\nAll Words:    ', all_words_cv,
      '\nStemmed Words:', stemmed_words_cv
     )

Plurality:     0.6724 
Baseline:      0.6724 
All Words:     0.7005 
Stemmed Words: 0.6995


Stemming is worse by about one tenth of a percent

top 10 terms

In [54]:
feature_names = np.array(tfidf.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['¼¼' 'á¾_î¾ð' 'çü' 'èï' 'öý' '_ô' '_ã' 'primo' 'nowher' 'visto']

Largest tfidf: 
['worship' 'hmmmm' 'rock' 'covet' 'locat' 'whoooooo' 'applestor' 'money'
 'delet' 'atx']


In [55]:
lemmatizer = WordNetLemmatizer()

def lemmatize_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [lemmatizer.lemmatize(token) for token in tokens]

In [56]:
X_train.loc[:, 'text_lemmatized'] = X_train.loc[:, 'text'].apply(lemmatize_and_tokenize)
vocab_lemmatized = set(X_train['text_lemmatized'].explode())
print('Size of raw vocabulary:       ', len(vocab_raw),
      '\nSize of stemmed vocabulary:   ', len(vocab_stemmed),
      '\nSize of lemmatized vocabulary:', len(vocab_lemmatized))

Size of raw vocabulary:        8876 
Size of stemmed vocabulary:    7016 
Size of lemmatized vocabulary: 8208


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'text_lemmatized'] = X_train.loc[:, 'text'].apply(lemmatize_and_tokenize)


In [57]:
tfidf = TfidfVectorizer(
    tokenizer = lemmatize_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

lemmatized_words_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:       ', plurality_cv,
      '\nAll Words:       ', all_words_cv,
      '\nStemmed Words:   ', stemmed_words_cv,
      '\nLemmatized Words:', lemmatized_words_cv
     )

Plurality:        0.6724 
All Words:        0.7005 
Stemmed Words:    0.6995 
Lemmatized Words: 0.6984


lemmatizing makes it worse by another tenth of a percent

look at top 10

In [58]:
feature_names = np.array(tfidf.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['document' '_ã' 'öý' '_ô' 'èï' 'çü' 'á¾_î¾ð' '¼¼' 'primo' 'visto']

Largest tfidf: 
['worship' 'hmmmm' 'rock' 'covet' 'orly' 'location' 'whoooooo' 'charging'
 'applestore' 'deleting']


explore bigrams

let's try n-grams, n from 2 to 7, using all words

In [59]:
def make_ngrams(word_list, n):
    length = len(word_list)
    if length < n:
        return None
    else:
        ngram_list = []
        for i in range(length - n + 1):
            ngram = ''
            for j in range(i, i+n):
                if j > i:
                    ngram += ' '
                ngram += word_list[j]
            ngram_list.append(ngram)
        return ngram_list

In [76]:
n = 4

tfidf = TfidfVectorizer(
    ngram_range = (n,n)
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

score = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

feature_names = np.array(tfidf.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()



# create array of the word list from this vectorizer with new index
feature_names = np.array(tfidf.get_feature_names())
# create array of the indices of the feature_names array, ordered by tfidf score
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

X_train.loc[:, str(n)+'_grams'] = X_train.loc[:, 'text_tokenized'].apply(lambda x: make_ngrams(x, n))

# create frequency distribution (dictionary) of 1-grams
bigrams_freq_dist = FreqDist(X_train['text_tokenized'].explode())

bigrams = X_train.loc[:, str(n)+'_grams'].explode()
bigrams_freq_dist = FreqDist(bigrams)
    
#     smallest.append(feature_names[sorted_tfidf_index[:10]])
#     largest.append(feature_names[sorted_tfidf_index[:-11:-1]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, str(n)+'_grams'] = X_train.loc[:, 'text_tokenized'].apply(lambda x: make_ngrams(x, n))


In [77]:
# show the words with the top 10 tfidf values, and their tfidf values
for n in range(-1,-11,-1):
    print(round(X_train_vectorized.max(0).toarray()[0][sorted_tfidf_index[n]],4),
          feature_names[sorted_tfidf_index[n]]
         )      

1.0 mention sxsw ipad rocks
1.0 my sxsw iphone screen
1.0 worship mention link sxsw
1.0 essential sxsw tools link
1.0 iphone sharing sxsw shareable
1.0 google circles sxsw orly
0.8038 at apple store at
0.7342 ipad line sxsw link
0.7229 in hand sxsw thisisdare
0.7229 covet new ipad link


In [None]:
smallest, largest = [], []

for n in range(1,8):
    
    if n > 1:
        tfidf = TfidfVectorizer(
            ngram_range = (n,n)
        )
    else:
        tfidf = TfidfVectorizer()
    
    X_train_vectorized = tfidf.fit_transform(X_train['text'])
    
    score = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)
    
    print(str(n)+'-gram', 'score:', score)

    feature_names = np.array(tfidf.get_feature_names())

    sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
    
    smallest.append(feature_names[sorted_tfidf_index[:10]])
    largest.append(feature_names[sorted_tfidf_index[:-11:-1]])

top ten only looks at top ten most frequent, so this is useless until you give it a stop words list, probably not useful at all

now i have a way to find the top and bottom tfidf scores, i need to take this one step further to see how frequent those terms are, this will help me to decide to ignore n > 4 for example

I will make a function to use with lambda to generate a column of ngrams

In [None]:
def make_ngrams(word_list, n):
    length = len(word_list)
    if length < n:
        return None
    else:
        ngram_list = []
        for i in range(length - n + 1):
            ngram = ''
            for j in range(i, i+n):
                if j > i:
                    ngram += ' '
                ngram += word_list[j]
            ngram_list.append(ngram)
        return ngram_list

In [None]:
# it might stop throwing an error if you FIRST establish these columns, THEN assign values to them??
for n in range(3,4):
    title = str(n) + '_grams'
    X_train.loc[:, title] = X_train.loc[:, 'text_tokenized'].apply(lambda x: make_ngrams(x, n))

In [None]:
all_words = X_train.loc[:, '3_grams'].explode()
all_words_freq_dist = FreqDist(all_words)

all_words_set = set(all_words)

all_words_ordered = list(zip(*all_words_freq_dist.most_common(10)))

all_words_ordered

In [None]:
for item in largest[2]:
    print(all_words_freq_dist[item], item)

In [None]:
error pls

looks like n-grams help up to about n = 4, maybe more

let's explore the top 10 n-grams for each n

In [None]:
stopwords_list = stopwords.words('english')

def remove_stopwords(token_list):
    stopwords_removed = [token for token in token_list if token not in stopwords_list]
    return stopwords_removed

In [None]:
X_train["text_without_stopwords"] = X_train["text_tokenized"].apply(remove_stopwords)

In [None]:
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stopwords_list
)

X_train_vectorized = tfidf.fit_transform(X_train["text"])

pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

In [None]:
stopwords_removed_cv = run_test(baseline_model, X_train_vectorized, y_train)
stopwords_removed_cv

still very bad

In [None]:
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stopwords_removed_cv.mean())

In [None]:
stemmer = SnowballStemmer(language="english")

def stem_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [stemmer.stem(token) for token in tokens]

In [None]:
stemmed_stopwords = [stemmer.stem(word) for word in stopwords_list]

In [None]:
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stemmed_stopwords,
    tokenizer=stem_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train["text"])

pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

In [None]:
stemmed_cv = run_test(baseline_model, X_train_vectorized, y_train)
stemmed_cv

In [None]:
print("Stopwords removed:", stopwords_removed_cv.mean())
print("Stemmed:          ", stemmed_cv.mean())

getting absolutely nowhere

In [None]:
X_train['num_words'] = X_train['text_tokenized'].apply(lambda x: len(x))

In [None]:
X_train["num_sentences"] = X_train["text"].apply(lambda x: len(sent_tokenize(x)))

In [None]:
X_train["label"] = [y_train[val] for val in X_train.index]

def plot_words(column, title):

    fig = plt.figure(figsize=(15, 5))
    fig.set_tight_layout(True)
    gs = fig.add_gridspec(1, 2)
    ax1 = fig.add_subplot(gs[0, :1])
    ax2 = fig.add_subplot(gs[0, 1:2])

    axes = [ax1, ax2]

    for index, category in enumerate(y_train.unique()):

        all_words = X_train[X_train["label"] == category][column].explode()
        freq_dist = FreqDist(all_words)
        top_10 = list(zip(*freq_dist.most_common(10)))
        tokens = top_10[0]
        counts = top_10[1]

        ax = axes[index]
        ax.bar(tokens, counts)

        ax.set_title(f"{title} {category}")
        ax.set_ylabel("Count")
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.tick_params(axis="x", rotation=90)

In [None]:
plot_words('text_without_stopwords', 'TITLE_HERE')

brainstorm feature engineering:

- whether / how many times the product is mentioned
- number of words
- contains an emoji
- look at bigrams (37.08)



In [None]:
def vectorized_plus_features()

pd.concat([
    X_test_vectorized_df, X_test[["num_sentences", "contains_price", "conta ], axis=1)

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
tweet_bigram_finder = BigramCollocationFinder.from_words(macbeth_words_stopped)

begin editing text
start with tokenizing

In [None]:
X_train['text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)

explore complete vocabulary

In [None]:
all_words = X_train['text_tokenized'].explode()
all_words_freq_dist = FreqDist(all_words)

all_words_set = set(all_words)

all_words_ordered = list(zip(*all_words_freq_dist.most_common()))[0]

separate pos from neg

actually what I've done here is probably redundant to tfidf

In [None]:
all_pos_words = X_train[X_train.label == 'positive']['text_tokenized'].explode()
all_pos_words_freq_dist = FreqDist(all_pos_words)

In [None]:
all_pos_words_set = set(all_pos_words)

In [None]:
all_pos_words_ordered = list(zip(*all_pos_words_freq_dist.most_common()))[0]

In [None]:
all_non_words = X_train[X_train.label == 'not positive']['text_tokenized'].explode()
all_non_words_freq_dist = FreqDist(all_non_words)

In [None]:
all_non_words_set = set(all_non_words)

In [None]:
all_non_words_ordered = list(zip(*all_non_words_freq_dist.most_common()))[0]

In [None]:
word_differential = {}

for word in all_words_set:
    word_differential[word] = all_pos_words_freq_dist[word] - all_non_words_freq_dist[word]

In [None]:
sorted(word_differential.items(), key=itemgetter(1))

In [None]:
all_words_set - all_non_words_set

explore top ten frequency

consider reprogramming this since my all_words stuff makes it partially redundant

In [None]:
train_freq_dist = FreqDist(X_train["text_tokenized"].explode())

In [None]:
def visualize_top_10(freq_dist, title):
    # Extract data for plotting
    top_10 = list(zip(*freq_dist.most_common(10)))
    tokens = top_10[0]
    counts = top_10[1]
    
    # Set up plot and plot data
    fig, ax = plt.subplots()
    ax.bar(tokens, counts)
    
    # Customize plot appearance
    ax.set_title(title)
    ax.set_ylabel("Count")
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis="x", rotation=90)
    
# visualize_top_10(train_freq_dist, "Top 10 Most Common Words")