import files

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import ConfusionMatrixDisplay

load

In [2]:
df = pd.read_csv('../data/tweets.csv', encoding='unicode_escape')

explore

In [None]:
df.info()

rename

In [3]:
df.rename(columns={'tweet_text': 'text',
                   'emotion_in_tweet_is_directed_at': 'company',
                   'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'},
          inplace = True)

look at missing values

In [None]:
df[df.text.isna()]

can't do anything without the text of the tweet, so drop

In [4]:
df.dropna(subset=['text'], inplace=True)

check duplicates

In [None]:
df.duplicated().value_counts()

drop duplicates, just text, doesn't matter if same text with different sentiment, etc. (still drop)

In [5]:
df.drop_duplicates(subset=['text'], inplace=True)

begin editing text

In [6]:
df['text'] = df['text'].str.lower()

In [None]:
df.sentiment.value_counts()

edit, simplify, rename

In [7]:
df['sentiment'].replace({'No emotion toward brand or product': 'neutral',
                         'Positive emotion': 'positive',
                         'Negative emotion': 'negative',
                         "I can't tell": 'other'
                        }, inplace=True)

In [None]:
df.sentiment.value_counts()

In [8]:
df['company'].replace(['iPad', 'Apple', 'iPad or iPhone App', 'iPhone', 'Other Apple product or service'], 'apple',
                     inplace=True)
df['company'].replace(['Google', 'Other Google product or service', 'Android App', 'Android'], 'google',
                     inplace=True)
df['company'].fillna('other',
                    inplace=True)

In [None]:
df.company.value_counts()

deal with missing company after tokenizing, etc. follow code below

In [9]:
apple_words = ['ipad', 'apple', 'iphone', 'itunes', 'ipad2']
google_words = ['google', 'android', 'blogger']

basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

def company_fix(text, company):
    if company != 'other':
        return company
    else:
        apple, google = False, False
        text_tokenized = tokenizer.tokenize(text)
        for word in apple_words:
            if word in text_tokenized:
                apple = True
                break
        for word in google_words:
            if word in text_tokenized:
                google = True
                break
        if apple & ~google:
            return 'apple'
        elif google & ~apple:
            return 'google'
        elif apple & google:
            return 'both'
        else:
            return 'neither'

df['company'] = df.apply(lambda x: company_fix(x.text, x.company), axis=1)

In [None]:
df.company.value_counts()

could do more here to explore the neither and both values

move on to language processing

train-test split

In [10]:
X, y = df['text'].to_frame(), df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

tokenize

In [11]:
X_train['text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)


explore complete vocabulary

In [24]:
vocabulary = set()

for row in X_train['text_tokenized']:
    vocabulary.update(set(row))
    
len(vocabulary)

8876

In [27]:
sorted(vocabulary)

['00',
 '000',
 '00am',
 '00pm',
 '01am',
 '03',
 '0310apple',
 '06',
 '08',
 '10',
 '100',
 '100s',
 '100tc',
 '101',
 '106',
 '10am',
 '10k',
 '10x',
 '10x2',
 '11',
 '1100',
 '1154',
 '11am',
 '11ntc',
 '11p',
 '11pm',
 '11th',
 '12',
 '120',
 '1223',
 '125',
 '128',
 '12am',
 '12b',
 '12bn',
 '12th',
 '13',
 '130',
 '1300',
 '14',
 '1406',
 '1408',
 '141164002609303',
 '1413',
 '1415',
 '1422',
 '1443',
 '14th',
 '15',
 '150',
 '1500',
 '150m',
 '150mm',
 '157',
 '15am',
 '15k',
 '15pm',
 '15slides',
 '16',
 '16162',
 '165',
 '169',
 '16gb',
 '16mins',
 '17',
 '18',
 '19',
 '1980',
 '1980s',
 '1985',
 '1986',
 '1990style',
 '1991',
 '1k',
 '1m',
 '1pm',
 '1s',
 '1st',
 '20',
 '200',
 '2009',
 '2010',
 '2011',
 '2012',
 '206',
 '206k',
 '20s',
 '21',
 '210',
 '2100',
 '214',
 '22',
 '22sxsw',
 '23',
 '230',
 '24',
 '24587',
 '25',
 '250',
 '250k',
 '26svo3m',
 '27',
 '270',
 '285',
 '29',
 '2b',
 '2day',
 '2g',
 '2h',
 '2honor',
 '2moro',
 '2nd',
 '2nite',
 '2rd',
 '2s',
 '2wks',
 '

explore top ten frequency

In [None]:
train_freq_dist = FreqDist(X_train["text_tokenized"].explode())

In [13]:
def visualize_top_10(freq_dist, title):
    # Extract data for plotting
    top_10 = list(zip(*freq_dist.most_common(10)))
    tokens = top_10[0]
    counts = top_10[1]
    
    # Set up plot and plot data
    fig, ax = plt.subplots()
    ax.bar(tokens, counts)
    
    # Customize plot appearance
    ax.set_title(title)
    ax.set_ylabel("Count")
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis="x", rotation=90)
    
# visualize_top_10(train_freq_dist, "Top 10 Most Common Words")

explore vectorized data with just top ten frequent words

In [14]:
tfidf = TfidfVectorizer(max_features=8000)
X_train_vectorized = tfidf.fit_transform(X_train['text'])
pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

Unnamed: 0,00,000,00am,00pm,01am,03,0310apple,06,08,10,...,ûïwin,ûò,ûòand,ûó,ûócan,ûójust,ûólewis,ûólots,ûómy,ûóthe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


perform naive bayes on this vectorized set to make first model and look at results

In [15]:
baseline_model = MultinomialNB()

below is a failed attempt to make a function that automates the testing, maybe return to this later

In [16]:
## iteration 1
def run_test(model, preprocessed_X_train, y_train):
    return cross_val_score(model, preprocessed_X_train, y_train).mean()

# def run_test(columns = [], model = baseline_model, tfidf = tfidf):
    
#     X_train_vectorized = tfidf.fit_transform(X_train['text'])
    
#     preprocessed_X_train = pd.concat([
#         X_train_vectorized, X_train[columns]
#     ], axis=1)
    
#     return cross_val_score(model, preprocessed_X_train, y_train).mean()

here is the traditional way to run the test

In [17]:
baseline_cv = run_test(baseline_model, X_train_vectorized, y_train)
baseline_cv

0.633480168254949

look at plurality winner to see how first model (naive bayes) compares

In [18]:
y_train.value_counts(normalize=True)

sentiment
neutral     0.591975
positive    0.327634
negative    0.062879
other       0.017512
Name: proportion, dtype: float64

naive bayes is essentially exactly same as plurality

first model is bad

introduce and remove stopwords, redo model same as before

In [None]:
stopwords_list = stopwords.words('english')

def remove_stopwords(token_list):
    stopwords_removed = [token for token in token_list if token not in stopwords_list]
    return stopwords_removed

In [None]:
X_train["text_without_stopwords"] = X_train["text_tokenized"].apply(remove_stopwords)

In [None]:
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stopwords_list
)

X_train_vectorized = tfidf.fit_transform(X_train["text"])

pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

In [None]:
stopwords_removed_cv = run_test(baseline_model, X_train_vectorized, y_train)
stopwords_removed_cv

still very bad

In [None]:
print("Baseline:         ", baseline_cv.mean())
print("Stopwords removed:", stopwords_removed_cv.mean())

In [None]:
stemmer = SnowballStemmer(language="english")

def stem_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [stemmer.stem(token) for token in tokens]

In [None]:
stemmed_stopwords = [stemmer.stem(word) for word in stopwords_list]

In [None]:
tfidf = TfidfVectorizer(
    max_features=10,
    stop_words=stemmed_stopwords,
    tokenizer=stem_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train["text"])

pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())

In [None]:
stemmed_cv = run_test(baseline_model, X_train_vectorized, y_train)
stemmed_cv

In [None]:
print("Stopwords removed:", stopwords_removed_cv.mean())
print("Stemmed:          ", stemmed_cv.mean())

getting absolutely nowhere

In [None]:
X_train['num_words'] = X_train['text_tokenized'].apply(lambda x: len(x))

In [None]:
X_train["num_sentences"] = X_train["text"].apply(lambda x: len(sent_tokenize(x)))

In [None]:
X_train["label"] = [y_train[val] for val in X_train.index]

def plot_words(column, title):

    fig = plt.figure(figsize=(15, 9))
    fig.set_tight_layout(True)
    gs = fig.add_gridspec(2, 2)
    ax1 = fig.add_subplot(gs[0, :1])
    ax2 = fig.add_subplot(gs[0, 1:2])
    ax3 = fig.add_subplot(gs[1, :1])
    ax4 = fig.add_subplot(gs[1, 1:2])

    axes = [ax1, ax2, ax3, ax4]

    for index, category in enumerate(y_train.unique()):

        all_words = X_train[X_train["label"] == category][column].explode()
        freq_dist = FreqDist(all_words)
        top_10 = list(zip(*freq_dist.most_common(10)))
        tokens = top_10[0]
        counts = top_10[1]

        ax = axes[index]
        ax.bar(tokens, counts)

        ax.set_title(f"{title} {category}")
        ax.set_ylabel("Count")
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.tick_params(axis="x", rotation=90)

In [None]:
plot_words('text_without_stopwords', 'fuckoff')

brainstorm feature engineering:

- whether / how many times the product is mentioned
- number of words
- contains an emoji
- look at bigrams (37.08)



In [None]:
X_train