In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk import ConditionalProbDist

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import keras
import tensorflow as tf
from tensorflow.keras import models, layers, Sequential, regularizers
from tensorflow.keras.callbacks import EarlyStopping

# from gensim.models import Word2Vec

%run twitter.py
# %run ./gensim/gensim.py

## DATA COLLECTION

In [2]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
raw_csv_path = './data/judge-1377884607_tweet_product_company.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


## DATA SCRUBBING

In [5]:
# The brand/product column is not used in this notebook, so remove it in place.
df.drop(columns=['emotion_in_tweet_is_directed_at'], inplace=True)

In [6]:
# Positional rename of the two remaining columns: tweet text -> tweet_raw, emotion label -> sentiment.
df.columns=['tweet_raw', 'sentiment']

In [7]:
def _short_sentiment(raw_label):
    """Collapse a raw annotation string to a short lower-case sentiment label.

    The exact string 'No emotion toward brand or product' becomes 'neutral';
    labels that lower-case to 'positive emotion' / 'negative emotion' become
    'positive' / 'negative'; any other label is only lower-cased.
    """
    if raw_label == 'No emotion toward brand or product':
        return 'neutral'
    lowered = raw_label.lower()
    shortened = {'positive emotion': 'positive', 'negative emotion': 'negative'}
    return shortened.get(lowered, lowered)


df.sentiment = df.sentiment.apply(_short_sentiment)

In [8]:
# Remove, in place, every row that still contains a missing value.
df.dropna(inplace=True)

In [9]:
# Preview the frame after renaming and relabelling.
df.head()

Unnamed: 0,tweet_raw,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,positive
2,@swonderlin Can not wait for #iPad 2 also. The...,positive
3,@sxsw I hope this year's festival isn't as cra...,negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,positive


In [10]:
# Re-check row count and null counts after dropna.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_raw  9092 non-null   object
 1   sentiment  9092 non-null   object
dtypes: object(2)
memory usage: 213.1+ KB


## PREPROCESSING

In [11]:
# Fetch the NLTK resources requested by this notebook (word list, stop words,
# punkt tokenizer models, WordNet). Each call downloads to the local nltk_data folder
# or reports that the package is already up to date.
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package words to /Users/boula/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/boula/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/boula/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/boula/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# `url_extractor` is not defined in the visible cells; presumably it comes from
# `%run twitter.py` and collects the URLs found in the raw tweets — TODO confirm.
urls = url_extractor(df.tweet_raw)

In [13]:
# `hashtags` is not defined in the visible cells; presumably it comes from
# `%run twitter.py` and collects the hashtags of the raw tweets — TODO confirm.
hashtag_list = hashtags(df.tweet_raw)

In [14]:
# Store a cleaned copy of the text in a new `tweet` column, keeping `tweet_raw` untouched.
# NOTE(review): `CleanUp` is presumably provided by `%run twitter.py`; what it strips
# (URLs, mentions, case, ...) cannot be seen from this notebook — confirm.
df['tweet'] = CleanUp(df.tweet_raw)

In [15]:
# Split each cleaned tweet into runs of ASCII letters/digits; every other character
# acts as a separator and is discarded.
alnum_pattern = '[a-zA-Z0-9]+'
tokenizer = RegexpTokenizer(alnum_pattern)
df.tweet = df.tweet.apply(tokenizer.tokenize)

In [16]:
stop_words_en = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in the English stop-word set (exact, case-sensitive match)."""
    return [token for token in tokens if token not in stop_words_en]


df.tweet = df.tweet.apply(_without_stopwords)

In [17]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, using the lemmatizer's default settings."""
    return [lemmatizer.lemmatize(token) for token in tokens]


df.tweet = df.tweet.apply(_lemmatize_tokens)

In [18]:
# Turn each token list back into a single space-separated string.
single_space = ' '
df.tweet = df.tweet.apply(single_space.join)
# df.tweet_lemmatized = df.tweet_lemmatized.apply(lambda tweet: ' '.join(tweet))

## FEATURES ENGINEERING

In [19]:
# `Twitter` is not defined in the visible cells; presumably it is a helper class
# from `%run twitter.py` wrapping the cleaned tweets — TODO confirm.
twitt = Twitter(df.tweet)

In [None]:
# New feature column taken from the helper's `Lexical_Diversity` attribute.
# NOTE(review): assumes the attribute is aligned with df's index — confirm.
df['lexical_diversity']= twitt.Lexical_Diversity

In [None]:
# New feature column taken from the helper's `WordsCount` attribute.
# NOTE(review): assumes the attribute is aligned with df's index — confirm.
df['word_count'] = twitt.WordsCount

## Modeling

In [20]:
# One-hot encode the sentiment labels so they can feed a softmax output layer.
# scikit-learn renamed the `sparse` keyword to `sparse_output` and later removed the
# old spelling, so try the current keyword first and fall back for older installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# The encoder expects a 2-D input, hence the reshape to a single column.
Y_ohe = ohe.fit_transform(df.sentiment.values.reshape(-1,1))
print(f"{'Y_ohe Categories:':20}{ohe.categories_[0]}")
print(f"{'Y_ohe Shape':20}{Y_ohe.shape}")

Y_ohe Categories:   ["i can't tell" 'negative' 'neutral' 'positive']
Y_ohe Shape         (9092, 4)


## Word Embedding - Gensim Word2Vec

In [None]:
import gensim

In [21]:
from gensim.models import Word2Vec

Instantiate a Word2Vec model to obtain a word-embedding layer. This creates a 7,570 x 100 matrix, from which we look up a 100-length vector representation for each of the 7,570 words.

Furthermore, we train the Word2Vec model with the skip-gram method (`sg=1`), which uses each word to predict its surrounding context words.

In [22]:
# Train a skip-gram (sg=1) Word2Vec model with negative sampling on the cleaned tweets.
# Each tweet string is split on whitespace to give one token list per tweet.
tokenised_tweets = df.tweet.apply(str.split).tolist()

w2v_params = dict(
    window=5,      # context window on each side of the target word
    min_count=1,   # keep every word, even those seen once
    sg=1,          # skip-gram
    hs=0,          # no hierarchical softmax ...
    negative=5,    # ... use negative sampling with 5 noise words instead
    workers=4,
)

# gensim 4 renamed the embedding-size keyword from `size` to `vector_size`.
# Try the current name first and fall back to the old one so the cell runs on both.
try:
    w2v_model = Word2Vec(tokenised_tweets, vector_size=100, **w2v_params)
except TypeError:
    w2v_model = Word2Vec(tokenised_tweets, size=100, **w2v_params)

In [23]:
# Run one additional training epoch over the same whitespace-tokenised tweets.
tweets_as_tokens = df.tweet.apply(str.split)
w2v_model.train(tweets_as_tokens, total_examples=w2v_model.corpus_count, epochs=1)

(67411, 84160)

In [32]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# The original cell called `LabelEncoder().fit_transform()` with no argument, which
# raises a TypeError, so `X_w2v` was never created. The split that consumes `X_w2v`
# pairs it with `Y_ohe`, so it needs exactly one row per tweet. Build that matrix by
# averaging the Word2Vec vectors of each tweet's tokens.
def _mean_embedding(tokens, keyed_vectors):
    """Return the mean embedding of `tokens`, or a zero vector if none is in the vocabulary.

    tokens        -- iterable of word strings for one tweet
    keyed_vectors -- the trained model's `wv` object (supports `in`, indexing, `vector_size`)
    """
    found = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not found:
        # Empty tweet (or no known word): keep the row so X and Y stay aligned.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(found, axis=0)


X_w2v = np.vstack([_mean_embedding(tweet.split(), w2v_model.wv) for tweet in df.tweet])

In [36]:
# Raw embedding matrix stored on the trained model's keyed vectors.
# NOTE(review): presumably one row per vocabulary word, not per tweet, so it cannot be
# paired row-for-row with Y_ohe — confirm against the shapes.
X_gensim  = w2v_model.wv.vectors
# ohe_w2v   = OneHotEncoder(sparse=False)
# X_w2v     = ohe_w2v.fit_transform(X_gensim)

In [None]:
# Compare the number of tweets with the number of rows in the embedding matrix.
print(f"dataframe shape {df.shape[0]}\nWord2Vec  shape {X_gensim.shape[0]}")

In [None]:
# Hold out 15% of the rows for testing and an equally sized slice for validation.
Test_Size = int(X_w2v.shape[0]*.15)
split_options = dict(test_size=Test_Size, random_state=67)

# First cut: train+validation vs. test.
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, Y_ohe, **split_options
)

# Second cut: carve the validation set out of the remaining training rows.
X_train_w2v, X_val_w2v, y_train_w2v, y_val_w2v = train_test_split(
    X_train_w2v, y_train_w2v, **split_options
)

split_report = (
    ("Train", X_train_w2v, y_train_w2v),
    ("Test", X_test_w2v, y_test_w2v),
    ("Validation", X_val_w2v, y_val_w2v),
)
for split_name, features, targets in split_report:
    print(f"{split_name}\n\t{'X :':5}{features.shape}\n\t{'Y :':5}{targets.shape}")

## MODELING

In [None]:
# Test_Size = int(df.shape[0]*.15)

# cVectorizer = CountVectorizer() # ngram_range=(1,2)

# X_train, X_test, y_train, y_test = train_test_split(df.tweet, Y_ohe, test_size=Test_Size, random_state=67)

# X_train   = cVectorizer.fit_transform(X_train)
# X_train   = X_train.toarray()

# X_test    = cVectorizer.transform(X_test)
# X_test    = X_test.toarray()

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=Test_Size, random_state=67)

# print(f"Train\n\t{'X :':5}{X_train.shape}\n\t{'Y :':5}{y_train.shape}")
# print(f"Test\n\t{'X :':5}{X_test.shape}\n\t{'Y :':5}{y_test.shape}")
# print(f"Validation\n\t{'X :':5}{X_val.shape}\n\t{'Y :':5}{y_val.shape}")

In [None]:
# Hold out 15% of the tweets for testing and an equally sized slice for validation.
Test_Size = int(df.shape[0]*.15)

tfidfVectorizer = TfidfVectorizer() #ngram_range=(1,2)

# Split the raw text BEFORE vectorising. Previously the vectorizer was fitted on
# train+validation rows and the validation set was carved out afterwards, so the
# vocabulary and IDF weights had already seen the validation tweets (data leakage).
# The same sizes and random_state are used, so the row membership of each split
# is unchanged.
X_train, X_test, y_train, y_test = train_test_split(df.tweet, Y_ohe, test_size=Test_Size, random_state=67)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=Test_Size, random_state=67)

# Fit on the training tweets only; validation and test are transformed with the
# training vocabulary. Dense arrays are used because the Keras models below
# consume NumPy input.
X_train   = tfidfVectorizer.fit_transform(X_train).toarray()
X_val     = tfidfVectorizer.transform(X_val).toarray()
X_test    = tfidfVectorizer.transform(X_test).toarray()

print(f"Train\n\t{'X :':5}{X_train.shape}\n\t{'Y :':5}{y_train.shape}")
print(f"Test\n\t{'X :':5}{X_test.shape}\n\t{'Y :':5}{y_test.shape}")
print(f"Validation\n\t{'X :':5}{X_val.shape}\n\t{'Y :':5}{y_val.shape}")

In [None]:
# Shape of a single training row, i.e. the number of TF-IDF features per tweet.
X_train[0].shape

In [None]:
# Start an empty Sequential model; layers are added in the following cell.
model = Sequential()

In [None]:
# NOTE(review): unfinished experiment. Only the filter count (32) is passed to Conv2D;
# Keras' Conv2D also requires a kernel size, so this call is expected to fail as
# written — confirm, then complete or remove. `model` is rebuilt from scratch by a
# later cell, so nothing downstream depends on this layer.
model.add(
    layers.Conv2D(32,
    )
)

In [None]:
# tf.keras.losses.BinaryCrossentropy

In [None]:
# [np.argmax(p) for p in model.predict(X_train)]

In [None]:
# np.argmax([p for p in model.predict(X_train)[0]])

In [None]:
# prediction(model, ohe, X_train[[600]])

In [None]:
# Baseline classifier: 64-unit input layer, a 32/16/8 funnel of ReLU layers,
# and a 4-way softmax (one unit per one-hot sentiment column).
model = Sequential()
model.add(layers.Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

for width in (32, 16, 8):
    model.add(layers.Dense(units=width, activation='relu'))

model.add(layers.Dense(units=4, activation='softmax'))

compile_options = dict(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC'],
)
model.compile(**compile_options)
model.summary()

In [None]:
# Full-batch training: batch_size equals the number of training rows, so each of the
# 20 epochs is a single gradient step. Early stopping is left disabled for this run.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=X_train.shape[0],
    # callbacks=[EarlyStopping(patience=5, restore_best_weights=True, verbose=1)]
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Spot-check the model on the first training row (double brackets keep it 2-D).
# NOTE(review): `prediction` is presumably provided by `%run twitter.py` — confirm.
prediction(model, ohe, X_train[[0]])

In [None]:
# Plot the training curves of the run above.
# NOTE(review): `plot_history` is presumably provided by `%run twitter.py` — confirm.
plot_history(history)

In [None]:
# NOTE(review): this cell contained only the unfinished expression below (a trailing
# dot with no attribute name), which is a SyntaxError and stops "Run All". It is kept
# as a comment until the intended optimizer lookup is filled in or the cell is removed.
# tf.keras.optimizers.

In [None]:
# Same 64/32/16/8 funnel as the baseline, with L2 weight regularisation (default
# strength) on the last hidden layer only, followed by a 4-way softmax.
model = Sequential()
model.add(layers.Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

model.add(layers.Dense(units=32, activation='relu'))
model.add(layers.Dense(units=16, activation='relu'))
model.add(layers.Dense(units=8, activation='relu', kernel_regularizer=regularizers.l2()))

model.add(layers.Dense(units=4, activation='softmax'))
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    # Passed as a list, like the other compile() calls in this notebook; newer Keras
    # releases reject a bare string for `metrics`.
    metrics=['AUC']
    )
model.summary()

In [None]:
# Full-batch training for up to 200 epochs; stop once the monitored metric has not
# improved for 5 epochs and roll back to the best weights.
stop_early = EarlyStopping(patience=5, restore_best_weights=True, verbose=1)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=X_train.shape[0],
    callbacks=[stop_early],
)

In [None]:
# Spot-check the regularised model on the first training row (kept 2-D by [[0]]).
# NOTE(review): `prediction` is presumably provided by `%run twitter.py` — confirm.
prediction(model, ohe, X_train[[0]])

In [None]:
# Plot the training curves of the early-stopped run above.
# NOTE(review): `plot_history` is presumably provided by `%run twitter.py` — confirm.
plot_history(history)

In [None]:
# 64/32/16/8 funnel with L2 weight regularisation (default strength) on all three
# hidden layers after the input layer, trained with RMSProp.
model = Sequential()
model.add(layers.Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

model.add(layers.Dense(units=32, activation='relu', kernel_regularizer=regularizers.l2()))
model.add(layers.Dense(units=16, activation='relu', kernel_regularizer=regularizers.l2()))
model.add(layers.Dense(units=8, activation='relu', kernel_regularizer=regularizers.l2()))

model.add(layers.Dense(units=4, activation='softmax'))
model.compile(
    optimizer='RMSProp',
    loss='categorical_crossentropy',
    # Passed as a list, like the other compile() calls in this notebook; newer Keras
    # releases reject a bare string for `metrics`.
    metrics=['accuracy']
    )
model.summary()

In [None]:
# Full-batch training for up to 400 epochs with early stopping (patience 5,
# best weights restored).
stop_early = EarlyStopping(patience=5, restore_best_weights=True, verbose=1)
training_rows = X_train.shape[0]
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=400,
    batch_size=training_rows,
    callbacks=[stop_early],
)

In [None]:
# Spot-check the RMSProp model on the first training row (kept 2-D by [[0]]).
# NOTE(review): `prediction` is presumably provided by `%run twitter.py` — confirm.
prediction(model, ohe, X_train[[0]])

In [None]:
# Plot the training curves of the RMSProp run above.
# NOTE(review): `plot_history` is presumably provided by `%run twitter.py` — confirm.
plot_history(history)

## Stochastic Gradient Descent

In [None]:
# Small network for the per-sample training experiment: 4 -> 8 -> 16 -> 8 ReLU units
# and a 4-way softmax, optimised with RMSProp and tracked with AUC.
model = Sequential([
    layers.Dense(units=4, activation='relu', input_dim=X_train.shape[1]),
    layers.Dense(units=8, activation='relu'),
    layers.Dense(units=16, activation='relu'),
    layers.Dense(units=8, activation='relu'),
    layers.Dense(units=4, activation='softmax'),
])
model.compile(optimizer='RMSProp', loss='categorical_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
# batch_size=1: the weights are updated after every single training example.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=1,
    epochs=25,
)

In [None]:
# Spot-check the batch-size-1 model on the first training row (kept 2-D by [[0]]).
# NOTE(review): `prediction` is presumably provided by `%run twitter.py` — confirm.
prediction(model, ohe, X_train[[0]])

In [None]:
# Plot the training curves of the batch-size-1 run above.
# NOTE(review): `plot_history` is presumably provided by `%run twitter.py` — confirm.
plot_history(history)

In [None]:
# Widening network: 4 -> 8 -> 16 -> 32 ReLU units and a 4-way softmax,
# optimised with Adam and tracked with AUC.
model = Sequential()
for position, width in enumerate((4, 8, 16, 32)):
    if position == 0:
        # Only the first layer declares the input width (number of TF-IDF features).
        model.add(layers.Dense(units=width, activation='relu', input_dim=X_train.shape[1]))
    else:
        model.add(layers.Dense(units=width, activation='relu'))

model.add(layers.Dense(units=4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
# Mini-batches of two examples, 15 epochs.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=2,
    epochs=15,
)

In [None]:
# Plot the training curves of the batch-size-2 run above.
# NOTE(review): `plot_history` is presumably provided by `%run twitter.py` — confirm.
plot_history(history)

In [None]:
# 4-unit input layer followed by 4 -> 8 -> 16 ReLU layers, each with L2 weight
# regularisation (default strength), and a 4-way softmax; Adam, accuracy metric.
model = Sequential()
model.add(layers.Dense(units=4, activation='relu', input_dim=X_train.shape[1]))

for width in (4, 8, 16):
    model.add(
        layers.Dense(units=width, activation='relu', kernel_regularizer=regularizers.l2())
    )

model.add(layers.Dense(units=4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Mini-batches of 15 examples, 15 epochs.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=15,
    epochs=15,
)

In [None]:
# Spot-check the regularised small model on the first training row (kept 2-D by [[0]]).
# NOTE(review): `prediction` is presumably provided by `%run twitter.py` — confirm.
prediction(model, ohe, X_train[[0]])

In [None]:
# Plot the training curves of the batch-size-15 run above.
# NOTE(review): `plot_history` is presumably provided by `%run twitter.py` — confirm.
plot_history(history)

In [None]:
# Raw softmax output for the first held-out test row (double brackets keep it 2-D).
model.predict(X_test[[0]])

In [None]:
# Pair every sentiment category known to the encoder with the softmax score the model
# assigns it for the first test row, and show the result as a table.
first_test_row = X_test[[0]]
class_scores = model.predict(first_test_row)[0]
label_score_pairs = list(zip(ohe.categories_[0], class_scores))
predictions_df = pd.DataFrame(label_score_pairs, columns=['Target', 'Softmax'])
predictions_df