In [1]:
#Importing Libraries
import numpy as np
import pandas as pd
import emoji

In [3]:
mapping = pd.read_csv("/content/drive/MyDrive/dataset/Mapping.csv")
train = pd.read_csv("/content/drive/MyDrive/dataset/Train.csv")
test = pd.read_csv("/content/drive/MyDrive/dataset/Test.csv", header = None)
out = pd.read_csv("/content/drive/MyDrive/dataset/OutputFormat.csv")

## Pre-Processing

In [4]:
mapping = mapping.drop(['Unnamed: 0'], axis = 1)

In [5]:
emoticons = mapping['emoticons'].tolist()

In [21]:
mapping_dict = {}
for emo in range(0,len(emoticons)):
    mapping_dict[emo]= emoticons[emo]

In [22]:
mapping_dict

{0: '😜',
 1: '📸',
 2: '😍',
 3: '😂',
 4: '😉',
 5: '🎄',
 6: '📷',
 7: '🔥',
 8: '😘',
 9: '❤',
 10: '😁',
 11: '🇺🇸',
 12: '☀',
 13: '✨',
 14: '💙',
 15: '💕',
 16: '😎',
 17: '😊',
 18: '💜',
 19: '💯'}

In [11]:
train = train.drop(['Unnamed: 0'], axis = 1)
test = test.drop(['Unnamed: 0'], axis = 1)

In [13]:
train

Unnamed: 0,TEXT,Label
0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,Been friends since 7th grade. Look at us now w...,2
3,This is what it looks like when someone loves ...,3
4,RT @user this white family was invited to a Bl...,3
...,...,...
69995,"Yes, I call Galina ""my Bubie"" Go follow my bea...",3
69996,"I SEA you, Seattle @ Ballard Seafood Festival\n",16
69997,If one of my daughters is wearing this and ask...,2
69998,Guess who whoop people on THEIR homecoming?! #...,3


In [16]:
x_train = train['TEXT'].values
y_train = train['Label'].values

In [18]:
x_train.shape

(70000,)

## Embeddings

In [11]:
f = open("/content/drive/MyDrive/embedding/glove.6B.50d.txt", encoding='utf8')

In [12]:
embedding_index = {}

for line in f:
    values = line.split()
    word = values[0]
    emb = np.array(values[1:], dtype ='float')
    embedding_index[word] = emb

In [13]:
embedding_index['america'].shape

(50,)

In [14]:
def get_embedding_output(X):
    maxLen = 20
    embedding_output = np.zeros((len(X), maxLen, 50))

    for ix in range(X.shape[0]):
        my_example = X[ix].split()

        for ij in range(len(my_example)):
            if (embedding_index.get(my_example[ij].lower()) is not None) and (ij<maxLen):
                embedding_output[ix][ij] = embedding_index[my_example[ij].lower()]

    return embedding_output

In [15]:
x_train_embed = get_embedding_output(x_train)

In [16]:
x_train_embed.shape

(70000, 20, 50)

In [17]:
from keras.utils import to_categorical

In [18]:
y_train = to_categorical(y_train)

In [19]:
y_train[1]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

## SMOTE - Oversampling Technique

In [20]:
from imblearn.over_sampling import SMOTE

In [21]:
x_train_embed.shape

(70000, 20, 50)

In [22]:
x_train_embed = x_train_embed.reshape(-1, 1000)

In [23]:
y_train

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [24]:
oversample = SMOTE()
X, y = oversample.fit_resample(x_train_embed, y_train)

In [25]:
X = X.reshape(-1, 20, 50)
X.shape, y.shape

((301820, 20, 50), (301820, 20))

## Split

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
x1, x2, y1, y2 = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
x1.shape

(241456, 20, 50)

## LSTM Model

In [29]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional

In [30]:
model = Sequential()
model.add(Bidirectional(LSTM(units = 512, return_sequences=True), input_shape = (20,50)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(units=256)))
model.add(Dropout(0.3))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=20, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 20, 1024)          2306048   
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 20, 1024)          0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 512)               2623488   
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dense_1 (Dense)             (None, 64)                8

In [31]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics =['accuracy'])

In [32]:
hist = model.fit(x1, y1, validation_split=0.2, shuffle=True, batch_size=64, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
##

## Evaluation & Testing

In [33]:
model.evaluate(x2, y2)



[0.782325804233551, 0.8165296912193298]

In [37]:
test

Unnamed: 0,0,1,2
0,,id,TEXT
1,0.0,0,Thought this was cool...#Repost (get_repost)・・...
2,1.0,1,Happy 4th! Corte madera parade. #everytownusa ...
3,2.0,2,Luv. Or at least something close to it. @ Unio...
4,3.0,3,There's a slice of pie under that whipped crea...
...,...,...,...
25954,7493.0,29993,"Rest In Peace, Nana. thanks for playing cards ..."
25955,7494.0,29994,"I jus wan EAT #chowtime #swt30 @ Hartford, Con..."
25956,7495.0,29995,Playing Bloon SuperMonkey 2 #bloonsupermonkey2...
25957,7498.0,29998,Best Snow-cone I've EVER had!!! #bahamabucks #...


In [36]:
X_test = get_embedding_output(test[2].values)

In [38]:
predicted = model.predict(X_test)



In [39]:
predicted

array([[8.1792850e-06, 2.6179688e-02, 7.9176627e-02, ..., 2.5778337e-04,
        6.5214466e-05, 3.5340514e-03],
       [2.6043470e-10, 9.9992692e-01, 1.4386666e-07, ..., 3.5088412e-09,
        1.6890539e-09, 1.5152231e-08],
       [4.6679142e-04, 5.4583390e-04, 8.7764324e-04, ..., 9.4674271e-01,
        1.2883934e-05, 1.1311165e-05],
       ...,
       [1.4761041e-05, 1.0880732e-01, 6.7311717e-04, ..., 1.1505117e-04,
        5.6129134e-05, 2.9814748e-06],
       [2.3294226e-04, 6.9344766e-04, 2.7463993e-02, ..., 8.1523118e-05,
        1.3381093e-05, 9.7060757e-04],
       [2.8141070e-04, 2.9742900e-02, 5.4898858e-03, ..., 4.2968546e-03,
        1.0675840e-02, 8.0930585e-01]], dtype=float32)

In [40]:
classes = np.argmax(predicted, axis = 1)

In [46]:
classes

array([ 9,  1, 17, ...,  6,  7, 19])

In [71]:
for i in range(50):
    print(test[2].iloc[i], emoji.emojize(mapping_dict[classes[i]]))

TEXT ❤
Thought this was cool...#Repost (get_repost)・・・Colorview. by shay_images…
 📸
Happy 4th! Corte madera parade. #everytownusa #merica @ Perry's on…
 😊
Luv. Or at least something close to it. @ Union Hill, Richmond, Virginia
 😍
There's a slice of pie under that whipped cream. #HouseofPies @ House of Pies
 😂
#thankyou for your thank you We adore you both + plan on moreeeee! Hosting your #wedding was…
 ✨
the SPECIAL4U Lyric video will be posted on my youtube channel today at 6PM EST ! #Z…
 🔥
Momma Tanya's In town ! Awesome dinner @user with friends! @ Perch
 ❤
Thing 1 and Thing 2 @ Huron, Ohio
 ✨
Bday girl and some random @ Sheraton New York Times Square
 😂
Always fun with my forever wedding date Congrats @user &amp; @user
 ✨
La La Land @ Griffith Park Observatory-Los Angeles ,CA
 🎄
Friends. #Seattle @ Seattle, Washington
 💜
#GETIT #GOTIT #GOOD #WHATEVERIWANT #BOW @ Oakland, California
 😍
Hanging with my bestie for the day ️#mtlove #hyalite #daysoff #fishing #endofsummer…
 💜
Stoked to

In [66]:
messages = np.array(['I love you', 'send me pictures', 'I am feeling good'])

In [67]:
emb_messages = get_embedding_output(messages)

In [68]:
predict_messages = model.predict(X_test)



In [69]:
class_messages = np.argmax(predicted, axis = 1)

In [70]:
for i in range(len(messages)):
    print(messages[i], emoji.emojize(mapping_dict[class_messages[i]]))

I love you ❤
send me pictures 📸
I am feeling good 😊
