# Neural Network Model

In [1]:
import pandas as pd
import numpy as np
import keras

#### Upload the CSV created from Kaggle Competition Preprocessing.csv

In [2]:
%%time
#twitter = pd.read_csv("preprocessed_data.csv")    #normal one used before
#twitter = pd.read_csv("sorted_preprocessed_data.csv")  #with lowest emotion scores
#twitter = pd.read_csv("greater_than_some_score.csv")
twitter = pd.read_csv("final_dataset.csv")

Wall time: 11.7 s


In [3]:
#pd.set_option('display.max_rows', 1000)
twitter.head()

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
0,883,0x292d69,forever daddys little girl when daddy goes lo...,train,sadness,forev daddi littl girl when daddi goe look for...,8
1,391,0x271dff,the day before a big event and my team has eve...,train,trust,the day befor a big event and my team ha every...,2
2,507,0x305ac3,nikkinicolex i was going to do that this year 😂😩,train,sadness,nikkinicolex i wa go to do that thi year 😂😩,8
3,560,0x2415c8,donaldtrump of obstruction of justice and i...,train,sadness,donaldtrump of obstruct of justic and interfer...,8
4,348,0x378a4c,is holy and just therefore he must hate and p...,train,anticipation,is holi and just therefor he must hate and pun...,4


#### Dividing the dataframe into a train and a test sections. For the input section I tried with both "text" and "text_stemmed"

In [4]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(twitter.text,twitter.emotion,
                                                test_size=0.2, random_state = 42)

#### Make a BOW for the top 20k - 25k (best results were in this range) max features. Using nltk.word_tokenize to accept emojis in the bag of words.

In [5]:
%%time
import nltk
from sklearn.feature_extraction.text import CountVectorizer

BOW = CountVectorizer(tokenizer=nltk.word_tokenize, max_features=25000)

BOW.fit(x_train)



Wall time: 5min 16s


CountVectorizer(max_features=25000,
                tokenizer=<function word_tokenize at 0x000001AA8618FEE0>)

#### Transform the "text" for both train and testing data

In [6]:
x_train = BOW.transform(x_train)
#y_train = y_train

x_test = BOW.transform(x_test)
#y_test = y_test

#### Use 1 hot encoding to deal with strings:

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.np_utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

In [8]:
# I/O check
input_shape = x_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  25000
output_shape:  8


#### Now I proceed to build the model. The lab model with hyperparameters modifications gave the best result. Even so, I also tried several other models you can see in the following pictures, but unfortunately this one gave me the best result. I also added another hidden layer, too.

Other tries:
![Snapshot](Models_tried.PNG)

In [9]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))  # 10000
X = model_input

# 1st hidden layer
X_W1 = Dense(units=32)(X)  # Original: 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=32)(H1)  # Original: 64
H2 = ReLU()(H1_W2)

# 3rd hidden layer   (CREATED BY ME)
H1_W4 = Dense(units=32)(H2)  # Original: 64
H4 = ReLU()(H1_W4)

# output layer
#Original output layer
# H2_W3 = Dense(units=output_shape)(H2)  # 8
# H3 = Softmax()(H2_W3)

H2_W3 = Dense(units=output_shape)(H4)  # 8
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 25000)]           0         
                                                                 
 dense (Dense)               (None, 32)                800032    
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 re_lu_1 (ReLU)              (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                                 
 re_lu_2 (ReLU)              (None, 32)                0     

#### I tried also some different hyperparameters here, and added Early Stopping, which allow us 2 things. One is to stop overfitting by stopping the unnecesary epochs (when validation loss starts to increase), and also decreases training time by reducing those unnecesary epochs. In this model only 2 epochs run before it overfits, so then the model stops training.

In [11]:
from keras.callbacks import CSVLogger
import tensorflow as tf

epochs = 5 #25
batch_size = 64 #32, 100

es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max')
callbacks = [es] #Early Stopping

# training!
history = model.fit(x_train, y_train, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    callbacks=callbacks,
                    validation_data = (x_test, y_test))
print('training finish')

Epoch 1/5




Epoch 2/5
training finish


#### Predict on 'testing' data to see its accuracy:

In [12]:
## predict
pred_result = model.predict(x_test, batch_size=128) #128
pred_result[:5]

array([[7.2603702e-04, 9.2258877e-01, 3.3979362e-03, 3.9225798e-03,
        4.4723894e-02, 5.8233822e-03, 2.6032305e-03, 1.6214149e-02],
       [5.0909184e-03, 7.9433107e-01, 3.7745996e-03, 2.9500829e-02,
        1.1765296e-01, 1.4022427e-02, 1.1131980e-02, 2.4495170e-02],
       [2.2109637e-02, 1.2432319e-01, 3.0477325e-02, 3.3597231e-02,
        6.0769755e-01, 8.4985696e-02, 1.2158559e-02, 8.4650829e-02],
       [1.1541225e-02, 1.9554177e-01, 3.7337534e-02, 4.9067251e-02,
        3.1124482e-01, 9.6911915e-02, 2.3946233e-02, 2.7440926e-01],
       [1.7788600e-04, 2.3848298e-01, 3.1241053e-04, 3.7969176e-03,
        2.6607934e-01, 9.4746356e-04, 1.1908304e-03, 4.8901212e-01]],
      dtype=float32)

#### Decode it to see the actual labels:

In [13]:
pred_result = label_decode(label_encoder, pred_result)
pred_result[:5]

array(['anticipation', 'anticipation', 'joy', 'joy', 'trust'],
      dtype=object)

#### Final training set accuracy:

In [14]:
from sklearn.metrics import accuracy_score

print('testing accuracy: {}'.format(round(accuracy_score(label_decode(label_encoder, y_test), pred_result), 4)))

#Best until now 0.5526

testing accuracy: 0.5495


## Predicting on test data

#### Now i just upload the twitter_test_data.csv we got from the Kaggle Competition Preprocessing jupyter notebook and apply the recently acquired model.

In [15]:
twitter_test_data = pd.read_csv("twitter_test_data.csv")

#### Transforming the testing data into a BOW and then predicting:

In [16]:
x_test_twitter = BOW.transform(twitter_test_data['text'])

pred_result_test_data = model.predict(x_test_twitter, batch_size=128)

print('x_test.shape: ', x_test_twitter.shape)
pred_result_test_data[:5]

x_test.shape:  (411972, 25000)


array([[7.2134670e-04, 5.8111531e-01, 3.5499751e-03, 2.2595141e-03,
        1.9456419e-01, 5.0527016e-03, 2.0227081e-03, 2.1071419e-01],
       [6.0479221e-04, 9.1848934e-01, 4.7078723e-04, 1.4382222e-03,
        1.4037020e-02, 3.7493396e-03, 5.4701329e-03, 5.5740338e-02],
       [2.3755964e-03, 5.7129842e-01, 1.2180364e-02, 5.7038465e-03,
        3.4989572e-01, 1.4165925e-02, 5.2738823e-03, 3.9106205e-02],
       [5.4947520e-04, 5.9566838e-01, 1.7403333e-03, 1.7238709e-03,
        2.7456447e-01, 2.3667265e-03, 2.1701441e-03, 1.2121662e-01],
       [1.7981682e-02, 4.0549445e-01, 3.6158517e-02, 3.4267507e-02,
        1.4144489e-01, 7.1892999e-02, 1.1970902e-02, 2.8078905e-01]],
      dtype=float32)

#### Decoding the values to then be able to make a dataframe with meaning

In [17]:
pred_result_test_data = label_decode(label_encoder, pred_result_test_data)
pred_result_test_data[:5]

array(['anticipation', 'anticipation', 'anticipation', 'anticipation',
       'anticipation'], dtype=object)

#### Making the DataFrame that will be uploaded to KAGGLE:

In [18]:
upload_df = pd.DataFrame(columns = [["id","emotion"]])
upload_df["id"] = twitter_test_data["tweet_id"]
upload_df["emotion"] = pred_result_test_data
upload_df

Unnamed: 0,id,emotion
0,0x28b412,anticipation
1,0x2de201,anticipation
2,0x218443,anticipation
3,0x2939d5,anticipation
4,0x26289a,anticipation
...,...,...
411967,0x2913b4,anticipation
411968,0x2a980e,sadness
411969,0x316b80,anticipation
411970,0x29d0cb,anger


In [19]:
upload_df['emotion']
upload_df.emotion.nunique()

emotion    8
dtype: int64

## Final Upload to KAGGLE:

In [20]:
upload_df.to_csv("./uploads/Keras_25k.csv",index=False)

# -------------------------------------------------------------

#### Extra try with logistic regression:

## Logistic Regression

Use this one with only the BOW and the x_train, y_train BOW transformed. (But in the target here i needed the category column).

##### To use this one, change "twitter.emotion" to "twitter.Category" in the train-split data part, and avoid modeling the data with the NN.

In [None]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(x_train, y_train)
score = classifier.score(x_test, y_test)

print("Accuracy:", score)