In [106]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [107]:
twitter = pd.read_csv("preprocessed_data.csv")
twitter_test = pd.read_csv("twitter_test_data.csv")

In [108]:
twitter

Unnamed: 0,_score,tweet_id,text,identification,emotion,Category
0,299,0x2e7caf,my sunday lineup girlstrip power üí™üèæüòé,train,fear,1
1,784,0x2eff99,therell never be a time than now to what you...,train,trust,2
2,358,0x3813a3,taluuluu riiiiiiiiiiiiiiiiight üôÑ,train,anger,3
3,1021,0x362eaa,never give up on your dream me to make your ...,train,anticipation,4
4,367,0x286e23,pearlharper youre the most responsible adult i...,train,surprise,5
...,...,...,...,...,...,...
317658,887,0x27028e,i better go eat something because i havent eat...,train,anticipation,4
317659,574,0x30ffbb,russia had no bearing on electionpresident did...,train,anger,3
317660,91,0x29ccd4,artziiflower beingsalmankhan that meansnow u d...,train,sadness,8
317661,532,0x345ac6,talkmaster now in greenville schigh of enjoy...,train,fear,1


In [109]:
twitter_test

Unnamed: 0,_score,tweet_id,text,identification
0,232,0x28b412,"Confident of your obedience, I write to you, k...",test
1,989,0x2de201,"""Trust is not the same as faith. A friend is s...",test
2,66,0x218443,When do you have enough ? When are you satisfi...,test
3,104,0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
4,310,0x26289a,"In these tough times, who do YOU turn to as yo...",test
...,...,...,...,...
411967,602,0x2913b4,"""For this is the message that ye heard from th...",test
411968,598,0x2a980e,"""There is a lad here, which hath five barley l...",test
411969,827,0x316b80,When you buy the last 2 tickets remaining for ...,test
411970,368,0x29d0cb,I swear all this hard work gone pay off one da...,test


### Tokenizer

In [115]:
#Dividing the dataframe into a train and a test sections
from sklearn.model_selection import train_test_split

#Choose either Category or emotion
x_train,x_test,y_train,y_test = train_test_split(twitter,twitter.Category,
                                                test_size=0.2, random_state = 42)

In [116]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((254130, 6), (63533, 6), (254130,), (63533,))

In [117]:
y_train

255439    5
59813     8
101110    6
212739    2
43393     1
         ..
119879    1
259178    7
131932    6
146867    3
121958    6
Name: Category, Length: 254130, dtype: int64

In [114]:
## deal with label (string -> one-hot)
import keras
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

def label_encode(le, labels):
    enc = le.transform(labels)
    #return keras.utils.to_categorical(enc)
    return keras.utils.np_utils.to_categorical(enc)   #Allison/Moo said so, because of version

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)


check label:  [1 2 3 4 5 6 7 8]

## Before convert
y_train[0:4]:
 255439    5
59813     8
101110    6
212739    2
Name: Category, dtype: int64

y_train.shape:  (254130,)
y_test.shape:  (63533,)


## After convert
y_train[0:4]:
 [[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]

y_train.shape:  (254130, 8)
y_test.shape:  (63533, 8)


In [125]:
#create tokenizer for our data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token='<UNK>')
tokenizer.fit_on_texts(x_train["text"])


#convert text data to numerical indexes
train_seqs=tokenizer.texts_to_sequences(x_train["text"])
test_seqs=tokenizer.texts_to_sequences(x_test["text"])

train_seqs=tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen=256, padding="post")
test_seqs=tf.keras.preprocessing.sequence.pad_sequences(test_seqs, maxlen=256, padding="post")

### Model

In [126]:
train_seqs.shape, y_train.shape

((254130, 256), (254130,))

In [122]:
embedding_dim = 16

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(10000, embedding_dim),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(1, activation='sigmoid'),
  tf.keras.layers.Dense(1, activation='sigmoid')])

model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max')
callbacks = [es] #Early Stopping

BATCH_SIZE = 25
EPOCHS = 20

history = model.fit(train_seqs, y_train
                    , batch_size = BATCH_SIZE
                    , epochs = EPOCHS
                    , validation_split = 0.1
                    , callbacks = callbacks)

model.evaluate(test_seqs, y_test)

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, None, 16)          160000    
                                                                 
 global_average_pooling1d_17  (None, 16)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_17 (Dense)            (None, 1)                 17        
                                                                 
 dense_18 (Dense)            (None, 1)                 2         
                                                                 
Total params: 160,019
Trainable params: 160,019
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20


[-63.159420013427734, 0.12450222671031952]

## Continue from here:
https://towardsdatascience.com/tensorflow-2-0-data-transformation-for-text-classification-b86ee2ad8877