# 3.3 A simple text classifier

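Dataset: Sentiment140 tweets, from https://www.kaggle.com/kazanova/sentiment140. The CSV has no header row, so columns are addressed by position: this notebook uses column 0 as the sentiment label and column 5 as the tweet text.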

## Data loading and preprocessing

In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the raw CSV. It ships without a header row, so pandas labels the
# columns 0..5 by position.
DATA_URL = 'https://storage.googleapis.com/activation-function/csv/noemoticon.csv'
SEED = 42  # Fixed seed so Restart & Run All reproduces the same shuffle (and the same split).

data = pd.read_csv(DATA_URL,
                   encoding='latin-1',
                   header=None)

# Shuffle all rows; the original row labels are kept as the index.
data = data.sample(frac=1, random_state=SEED)
data.head()

Unnamed: 0,0,1,2,3,4,5
649564,0,2237327813,Fri Jun 19 05:36:31 PDT 2009,NO_QUERY,AwesomeAmanda76,Awww.... looks like #peterfacinelli won't make...
1519237,4,2176056744,Mon Jun 15 02:47:16 PDT 2009,NO_QUERY,HighFashionSan,Omg only 3 days even less
220439,0,1976624671,Sat May 30 18:36:45 PDT 2009,NO_QUERY,alycekeli,@splattt_twloha the beginning of the end has c...
1405730,4,2055241769,Sat Jun 06 08:57:54 PDT 2009,NO_QUERY,xoAlexHeartsxo,@DAY26addict LOL Thanks!!
843509,4,1563885449,Mon Apr 20 00:43:03 PDT 2009,NO_QUERY,agynamix,Yesterday I declared the bicycle season open w...


In [3]:
# Text (column 5) of the row whose index label is 0; this is a label lookup,
# so it is independent of where the shuffle moved the row.
print(data.loc[0, 5])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D


In [4]:
# Train-test split
# Positional 70/30 cut over the rows of `data` (column 5 = text, column 0 = label).
train_size = int(0.7*len(data))
features = data[5]
targets = data[0]
X_train, X_test = features.values[:train_size], features.values[train_size:]

# Re-encode the labels 0/2/4 as the contiguous class ids 0/1/2 used by the
# 3-unit output layer. The remap runs on a private copy: writing through
# `targets.values` would modify `data` itself, so re-running this cell would
# remap already-remapped labels (the positive class 2 would become 1).
labels = targets.to_numpy(copy=True)
labels[labels == 2] = 1  # Must run before the next line, otherwise 4 -> 2 -> 1.
labels[labels == 4] = 2

y_train, y_test = labels[:train_size], labels[train_size:]

In [5]:
# Bag-of-words features: learn the vocabulary on the training text only,
# then apply the same vocabulary to the test text.
count_vectorizer = CountVectorizer(max_features=100)  # Cap the vocabulary to limit memory consumption.
train_counts_sparse = count_vectorizer.fit_transform(X_train)
test_counts_sparse = count_vectorizer.transform(X_test)

# Turn the sparse matrices into dense arrays with toarray().
X_train_num = train_counts_sparse.toarray()
X_test_num = test_counts_sparse.toarray()

In [6]:
# Show one training tweet followed by its count vector, one per line.
instance = 46544
print(X_train[instance], X_train_num[instance], sep='\n')

@brodiejay oh oh me too!!! I thinking of getting the box set 
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [7]:
# Sanity check on the encoded labels: distinct values, then the first 20.
distinct_labels = set(y_train)
leading_labels = y_train[:20]
print(distinct_labels)
print(leading_labels)

{0, 2}
[0 2 0 2 2 2 0 0 2 2 0 2 2 2 0 0 0 0 0 0]


## Simple Fully-Connected Network

In [8]:
# Fully-connected classifier over the bag-of-words count vectors.
model = Sequential()
model.add(Dense(256, activation=tf.nn.leaky_relu))  # x if x > 0 else (alpha * x)
model.add(Dropout(0.5))
model.add(Dense(3))  # One raw score (logit) per class; no softmax here.

# With from_logits=True the loss applies the softmax itself, so the
# output layer does not need a softmax or sigmoid activation.
model.compile(optimizer=Adam(),
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# 10% of the training arrays is held out for validation during fitting.
fit_options = dict(batch_size=64,
                   epochs=3,
                   validation_split=0.1,
                   verbose=1)
history = model.fit(X_train_num, y_train, **fit_options)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Embedding with Bidirectional LSTM

In [9]:
vocab_size = 10000
embedding_dim = 32
# NOTE(review): lowered from 500. Every sample is padded to this many steps,
# and the inputs are tweets, so 500 would mostly feed padding to the LSTM and
# make the precomputed id matrix very large. Confirm 64 against the actual
# token-length distribution.
max_length = 64

# An Embedding layer looks up one vector per token id, so it needs ordered
# sequences of vocabulary indices. The bag-of-words matrix X_train_num holds
# word counts (and no word order), so it must not be fed to this model;
# tokenize the raw text instead.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,                 # Keep ids inside the Embedding's input_dim.
    output_mode='int',
    output_sequence_length=max_length)     # Pad / truncate every tweet to max_length ids.
text_vectorizer.adapt(X_train, batch_size=1024)  # Vocabulary is learned from training text only.
X_train_seq = text_vectorizer(X_train).numpy()

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(32)),
    Dense(32, activation='relu'),
    Dense(3)  # Raw logits; the loss below applies the softmax.
])

model.compile(loss=SparseCategoricalCrossentropy(from_logits=True),
              optimizer=Adam(),
              metrics=['accuracy'])

history = model.fit(X_train_seq,
                    y_train,
                    batch_size=64,
                    epochs=3,
                    validation_split=0.1,
                    verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
