<a href="https://colab.research.google.com/github/ali-kmirzaei/NLP/blob/main/05-emojify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# libs

In [None]:
!pip install emoji

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
import csv
import emoji

%matplotlib inline

In [44]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.layers import Embedding

# read data

In [None]:
!unzip emoji.zip

In [4]:
def read_csv(filename):
  phrase = []
  emoji = []
  with open (filename) as csvDataFile:
    csvReader = csv.reader(csvDataFile)
    for row in csvReader:
      phrase.append(row[0])
      emoji.append(row[1])
    X = np.asarray(phrase)
    Y = np.asarray(emoji, dtype=int)
    return X, Y

In [7]:
X_train, Y_train = read_csv('train_emoji.csv')
X_test, Y_test = read_csv('tesss.csv')

In [8]:
maxlen = len(max(X_train, key=len).split())
maxlen

10

# label to emoji

In [9]:
emoji_dict = {
    "0" : ":red_heart:",
    "1" : ":baseball:",
    "2" : "\U0001F604",
    "3" : "\U0001F61E",
    "4" : ":fork_and_knife:"
}

def label_to_emoji(label):
  return emoji.emojize( emoji_dict[ str(label) ] )

In [None]:
# Test
for i in range(10):
  print(X_train[i], label_to_emoji(Y_train[i]))

# read glove file as pretrained embeddings

In [11]:
def read_glove_vecs(glove_file):
  with open(glove_file, encoding="utf8") as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      line = line.strip().split()
      curr_word = line[0]
      words.add(curr_word)
      
      if '-1.377.0693' in line:
        c = 0
        for t in line:
          if t == '-1.377.0693':
            line[c] = '-1.377'
          c += 1
      word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float)

    i = 1
    words_to_index = {}
    index_to_words = {}
    for w in sorted(words):
      words_to_index[w] = i
      index_to_words[i] = w
      i += 1
    return words_to_index, index_to_words, word_to_vec_map

In [None]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [None]:
# Test
word = "ali"
index = 113317
word_to_index[word], index_to_word[index], word_to_vec_map[word]

# Sentence Embedding

In [14]:
def sentence_embedding(sentence, word_to_vec_map):
  words = sentence.lower().split()
  avg = np.zeros((50,))
  for w in words:
    avg += word_to_vec_map[w]
  avg /= len(words)
  return avg

In [None]:
# Test
sentence_embedding("Ali go to home", word_to_vec_map)

# Model v1

In [26]:
def softmax(x):
  e_x = np.exp(x-np.max(x))
  return e_x/e_x.sum()

def predict(X, Y, W, b, word_to_vec_map):
  m = X.shape[0]
  pred = np.zeros((m, 1))
  for j in range(m):
    sentence = X[j]
    avg = sentence_embedding(sentence, word_to_vec_map)
    Z = np.dot(W, avg) + b
    A = softmax(Z)
    pred[j] = np.argmax(A)
  print("ACC: "+str(np.mean((pred[:] == Y.reshape(Y.shape[0], 1)[:]))))
  return pred

def model(X, Y, word_to_vec_map, learning_rate=0.01, num_iterations=401):
  np.random.seed(1)
  m = Y.shape[0] #num of training samples
  n_y = 5 #num of calsses
  n_h = 50 #dim of glove vectors
  W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
  b = np.zeros((n_y,))
  Y_oh = keras.utils.np_utils.to_categorical(Y, n_y)
  for t in range(num_iterations):
    for i in range(m):
      avg = sentence_embedding(X[i], word_to_vec_map)
      z = np.dot(W, avg) + b
      a = softmax(z)
      cost = -np.sum(Y_oh[i] * np.log(a))
      dz = a - Y_oh[i]
      dW = np.dot(dz.reshape(n_y, 1), avg.reshape(1, n_h))
      db = dz
      W = W - learning_rate * dW
      b = b - learning_rate * db
      if t % 100 == 0:
        print("Epoch: "+str(t)+" --- cost= "+str(cost))
        pred = predict(X, Y, W, b, word_to_vec_map)
  return pred, W, b

In [None]:
pred, W, b = model(X_train, Y_train, word_to_vec_map)

# Evaluate on test set

In [None]:
predict(X_train, Y_train, W, b, word_to_vec_map)
predict(X_test, Y_test, W, b, word_to_vec_map)

In [36]:
def print_pred(X, pred):
  print()
  for i in range(X.shape[0]):
    print(X[i], label_to_emoji(int(pred[i])))

In [38]:
X_my_sentence = np.array(["i adore you", "i love you", "funny lol", "lets play with a ball"])
Y_my_labels = np.array([[0], [0], [2], [1], [4], [3]])
pred = predict(X_my_sentence, Y_my_labels, W, b, word_to_vec_map)
print_pred(X_my_sentence, pred)

ACC: 0.0

i adore you ❤️
i love you ❤️
funny lol 😄
lets play with a ball ⚾


  
