In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gensim.models as gsm
import gensim.downloader
import pandas as pd
import time
import string
import nltk
nltk.download('punkt')
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Emoji2Vec Download**

In [None]:
# Load the pretrained emoji2vec embeddings from Drive (binary word2vec format).
e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/2021_NLU/emoji2vec.bin', binary=True)

In [None]:
e2v.vector_size

300

In [None]:
# Sanity check: look up a single emoji and display the shape of its vector.
happy_vector = e2v['😂']  
happy_vector.shape

(300,)

In [None]:
# print(len(e2v.vocab))
# print(e2v.vocab.keys())

**Word2Vec Download**

In [None]:
# Fetch the 'word2vec-google-news-300' vectors through gensim's downloader.
word2vec = gensim.downloader.load('word2vec-google-news-300')



In [None]:
# Cache the downloaded vectors on Drive. The context manager closes the file
# handle even if pickling fails; passing a bare open() to pickle.dump leaked it.
with open('/content/drive/MyDrive/2021_NLU/data/full_data/word2vec.pkl', 'wb') as f:
    pickle.dump(word2vec, f)

In [None]:
# Reload the word2vec vectors from the pickle cache on Drive.
with open('/content/drive/MyDrive/2021_NLU/data/full_data/word2vec.pkl', 'rb') as f:  
    word2vec = pickle.load(f)

## FULL DATA (SINGLE AND MULTI) - Download, Preprocess, Create Vectors

In [None]:
# Training split of the full (single + multi emoji) dataset.
df_train = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/full_data/emoji_nsp_dataset_train.csv')

In [None]:
# Validation split of the full dataset.
df_val = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/full_data/emoji_nsp_dataset_valid.csv')

In [None]:
# Test split of the full dataset.
df_test = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/full_data/emoji_nsp_dataset_test.csv')

In [None]:
df_train.head()

Unnamed: 0,index,tweets,emoji_sentence,follows?
0,50553,The dababy memes make no sense and that’s why ...,😭,1
1,74541,a year ago today i would be holding my breath ...,😵,0
2,50992,I told my mama about how the music industry is...,💯,1
3,95343,[USER] [USER] Thankyou guys,💯,0
4,60555,You want new SUBS? Like ️ Retweet Follow me R...,😂😂,0


In [None]:
df_train.count()

index             15540
tweets            15540
emoji_sentence    15540
follows?          15540
dtype: int64

In [None]:
df_train["emoji_sentence"]

0         😎
1         🤭
2         🥺
3         🙏
4         🙃
         ..
15535     🤷
15536     🤣
15537     🥺
15538    🙌🍺
15539    🙄😋
Name: emoji_sentence, Length: 15540, dtype: object

In [None]:
df_train["emoji_sentence"].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
14396    False
14397    False
14398    False
14399    False
14400    False
Name: emoji_sentence, Length: 14401, dtype: bool

In [None]:
# Report the split sizes, drop rows with missing values and exact duplicate rows,
# then report the sizes again.
for frame in (df_train, df_val, df_test):
  print(len(frame))

df_train, df_val, df_test = (
  frame.dropna().drop_duplicates() for frame in (df_train, df_val, df_test)
)

for frame in (df_train, df_val, df_test):
  print(len(frame))

15540
2199
4438
15540
2199
4438


### Preprocessing

In [None]:
def make_lowercase(data, debug=False, sample_to_print=5):
	'''
	Lowercase every document in the corpus.

	- input: data - list of documents (strings)
	         debug - if True, print the corpus size and the first few documents first
	         sample_to_print - number of leading documents shown by the debug print
	- output: data - list of documents after lowercasing everything
	'''
	# `sample_to_print` used to be an undefined global, so debug=True raised
	# NameError; it is now an explicit keyword argument with a default.
	if debug:
		print("data_sample out of ",len(data))
		print(data[:sample_to_print])
	start = time.time()
	data = [doc.lower() for doc in data]
	end = time.time()
	print('\n       ##### Lowercasing Done! Time Taken - ',end-start)
	return data


def punctuation_removal(data, debug=False, sample_to_print=5):
	'''
	Replace every character of string.punctuation with a single space.

	- input: data - list of documents (strings)
	         debug - if True, print the corpus size and the first few documents first
	         sample_to_print - number of leading documents shown by the debug print
	- output: data - list of documents after removing punctuation
	'''
	# `sample_to_print` used to be an undefined global, so debug=True raised
	# NameError; it is now an explicit keyword argument with a default.
	if debug:
		print("data_sample out of ",len(data))
		print(data[:sample_to_print])
	start = time.time()
	# Build the translation table once instead of once per document.
	# Only the characters in string.punctuation are mapped; anything else
	# (e.g. the curly apostrophe in the tweets) passes through unchanged.
	table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
	data = [doc.translate(table) for doc in data]
	end = time.time()
	print('\n       ##### Punctuation removed! Time Taken - ',end-start)
	return data

def whitespace_removal(data, debug=False, sample_to_print=5):
	'''
	Collapse every run of whitespace to one space and trim both ends.

	- input: data - list of documents (strings)
	         debug - if True, print the corpus size and the first few documents first
	         sample_to_print - number of leading documents shown by the debug print
	- output: data - list of documents with normalized whitespace
	'''
	# `sample_to_print` used to be an undefined global, so debug=True raised
	# NameError; it is now an explicit keyword argument with a default.
	if debug:
		print("data_sample out of ",len(data))
		print(data[:sample_to_print])
	start = time.time()
	# split() with no argument splits on any whitespace run and drops empty
	# pieces, so re-joining with ' ' both collapses and strips.
	data = [' '.join(doc.split()) for doc in data]
	end = time.time()
	print('\n       ##### Whitespace removed! Time Taken - ',end-start)
	return data

# TOKENIZATION with NLTK
def tokenization_nltk(data, debug=False, sample_to_print=5):
	'''
	Split every document into tokens with nltk.word_tokenize.

	- input: data - list of documents (strings)
	         debug - if True, print the corpus size and the first few documents first
	         sample_to_print - number of leading documents shown by the debug print
	- output: data - list of token lists, one per document
	'''
	# `sample_to_print` used to be an undefined global, so debug=True raised
	# NameError; it is now an explicit keyword argument with a default.
	if debug:
		print("data_sample out of ",len(data))
		print(data[:sample_to_print])
	# Using NLTK
	start = time.time()
	data = [nltk.word_tokenize(doc) for doc in data]
	end = time.time()
	# Using Spacy - Spacy takes too much time
	#data = [[token.text for token in nlp_spacy(i)] for i in data]
	print('\n       ##### Tokenization Done using NLTK! Time Taken - ', end-start)
	return data

# #used to search in nltk stop_words
# def BinarySearch(a, x): 
# 	i = bisect_left(a, x) 
# 	if i != len(a) and a[i] == x:
# 		return i 
# 	else: 
# 		return -1

# def stopwords_removal(data, stop_words_nltk, debug=False):
# 	'''
# 	- input: data - 
# 	- output: data - 
# 	'''
# 	if(debug):
# 		print("stopwords_removal_nltk data_sample out of ",len(data))
# 		print(data[:sample_to_print])
# 	#using NLTK
# 	start = time.time()
# 	data = [[j for j in doc if (BinarySearch(stop_words_nltk,j)<0)] for doc in data]
# 	data = [[x for x in word if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())] for word in data]
# 	end = time.time()
# 	print('\n       ##### Stopwords Removed using NLTK! Time Taken - ',end-start)
# 	return data

In [None]:
def clean_text(sample, debug=False):
  '''
  Run the text-cleaning pipeline over a list of documents.

  Stages, in order: lowercase -> punctuation to spaces -> whitespace
  normalization -> NLTK tokenization. With debug=True the first two
  documents are printed after every stage.

  Returns a pair (tokenized, normal): `tokenized` is a list of token lists
  (used for Word2Vec) and `normal` is the same tokens re-joined with single spaces.
  '''
  # Optional stages that are currently disabled: non-ASCII filtering,
  # tweet-specific preprocessing, spaCy tokenization / lemmatization,
  # NLTK stop-word removal and gensim bigrams (min_count=10, threshold=10).
  pipeline = (make_lowercase, punctuation_removal, whitespace_removal, tokenization_nltk)
  for stage in pipeline:
    sample = stage(sample)
    if debug:
      print(sample[:2])

  sample_normal = [' '.join(tokens) for tokens in sample]
  return sample, sample_normal

In [None]:
# sample_tokenized, sample_normal = clean_text(sample)
# Clean and tokenize the tweet text of each split: *_tokenized holds the token
# lists, *_normal the same tokens re-joined with single spaces.
x_train_tokenized, x_train_normal = clean_text(df_train['tweets'].values)
x_val_tokenized, x_val_normal = clean_text(df_val['tweets'].values)
x_test_tokenized, x_test_normal = clean_text(df_test['tweets'].values)



       ##### Lowercasing Done! Time Taken -  0.012780904769897461

       ##### Punctuation removed! Time Taken -  0.09380507469177246

       ##### Whitespace removed! Time Taken -  0.02262401580810547

       ##### Tokenization Done using NLTK! Time Taken -  2.0198376178741455

       ##### Lowercasing Done! Time Taken -  0.00092315673828125

       ##### Punctuation removed! Time Taken -  0.013142824172973633

       ##### Whitespace removed! Time Taken -  0.003053903579711914

       ##### Tokenization Done using NLTK! Time Taken -  0.2526061534881592

       ##### Lowercasing Done! Time Taken -  0.0018680095672607422

       ##### Punctuation removed! Time Taken -  0.02589106559753418

       ##### Whitespace removed! Time Taken -  0.006514072418212891

       ##### Tokenization Done using NLTK! Time Taken -  0.5394716262817383


In [None]:
# Targets: the 'follows?' column of each split, as numpy arrays.
y_train = df_train['follows?'].values
y_val = df_val['follows?'].values
y_test = df_test['follows?'].values

In [None]:
# Train split: keep only examples whose tweet produced at least one token and
# record the positions of the ones that were dropped.
y_train_2 = []
x_train_tokenized_2 = []
x_train_normal_2 = []
empty_indices_train = []

for i, tokens in enumerate(x_train_tokenized):
  if len(tokens) == 0:
    empty_indices_train.append(i)
    continue
  x_train_tokenized_2.append(tokens)
  x_train_normal_2.append(x_train_normal[i])
  y_train_2.append(y_train[i])

In [None]:
# Validation split: keep only examples whose tweet produced at least one token
# and record the positions of the ones that were dropped.
y_val_2 = []
x_val_tokenized_2 = []
x_val_normal_2 = []
empty_indices_val = []

for i, tokens in enumerate(x_val_tokenized):
  if len(tokens) == 0:
    empty_indices_val.append(i)
    continue
  x_val_tokenized_2.append(tokens)
  x_val_normal_2.append(x_val_normal[i])
  y_val_2.append(y_val[i])

In [None]:
# Test split: keep only examples whose tweet produced at least one token and
# record the positions of the ones that were dropped.
y_test_2 = []
x_test_tokenized_2 = []
x_test_normal_2 = []
empty_indices_test = []

for i, tokens in enumerate(x_test_tokenized):
  if len(tokens) == 0:
    empty_indices_test.append(i)
    continue
  x_test_tokenized_2.append(tokens)
  x_test_normal_2.append(x_test_normal[i])
  y_test_2.append(y_test[i])

In [None]:
# Per split: the empty-tweet positions, how many there are, and the corpus
# sizes before and after filtering.
for empties, tokenized, tokenized_2, normal_2, labels_2 in (
  (empty_indices_train, x_train_tokenized, x_train_tokenized_2, x_train_normal_2, y_train_2),
  (empty_indices_val, x_val_tokenized, x_val_tokenized_2, x_val_normal_2, y_val_2),
  (empty_indices_test, x_test_tokenized, x_test_tokenized_2, x_test_normal_2, y_test_2),
):
  print(empties)
  print(len(empties))
  print(len(tokenized))
  print(len(tokenized_2))
  print(len(normal_2))
  print(len(labels_2))

[]
0
15540
15540
15540
15540
[]
0
2199
2199
2199
2199
[]
0
4438
4438
4438
4438


### Save tokens

In [None]:
# Persist every preprocessing artifact of the full dataset in a single pickle.
# The order of this list must match the unpacking order used when the file is loaded.
with open('/content/drive/MyDrive/2021_NLU/data/full_data/full_data.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([x_train_tokenized, x_train_normal, x_val_tokenized, x_val_normal, x_test_tokenized, x_test_normal, empty_indices_train, x_train_tokenized_2, \
                 x_train_normal_2, empty_indices_val, x_val_tokenized_2, x_val_normal_2, y_train, y_val, y_train_2, y_val_2, y_test, y_test_2,\
                 empty_indices_test, x_test_normal_2, x_test_tokenized_2 ], f)

In [None]:
# Restore the preprocessing artifacts; the unpacking order mirrors the list that was pickled.
with open('/content/drive/MyDrive/2021_NLU/data/full_data/full_data.pkl', 'rb') as f:  
    x_train_tokenized, x_train_normal, x_val_tokenized, x_val_normal, x_test_tokenized, x_test_normal, empty_indices_train, x_train_tokenized_2, \
                 x_train_normal_2, empty_indices_val, x_val_tokenized_2, x_val_normal_2, y_train, y_val, y_train_2, y_val_2, y_test, y_test_2,\
                 empty_indices_test, x_test_normal_2, x_test_tokenized_2 = pickle.load(f)

In [None]:
# def create_df(xdata, emojidata, ydata):
#   temp=[" ".join(i) for i in xdata]
#   df_new = pd.DataFrame(temp)
#   df_new["Emoji"] = emojidata
#   df_new["Target"] = ydata

#   df_new.columns = ["Tweet", "Emoji", "Target"]
#   return df_new


In [None]:
# df_new_val = create_df(x_val_tokenized_2,  y_val_2)
# df_new_train = create_df(x_train_tokenized_2, y_train_2)
# df_new_test = create_df(x_test_tokenized_2, df_test['emoji_sentence'], y_test_2)

In [None]:
# df_new_test.head()

In [None]:
# Attach the token lists to the test frame and summarize tweet length in tokens.
df_test['tokenized_tweets'] = x_test_tokenized
df_test['tokenized_len'] = df_test['tokenized_tweets'].apply(len)
for stat in (df_test['tokenized_len'].mean(), df_test['tokenized_len'].median()):
  print(stat)

14.242000901306895
10.0


In [None]:
# # Saving the objects:
# with open('/content/drive/MyDrive/2021_NLU/data/full_data/df_new.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump([df_new_val, df_new_train, df_new_test], f)

In [None]:
# with open('/content/drive/MyDrive/2020 NLP/Project/df_new.pkl', 'rb') as f:  
#     df_new_val, df_new_train, df_new_test = pickle.load(f)

### Create Vectors

In [None]:
def convert_word2vec(model, corpus, strategy):
  '''
  Pool the token embeddings of each sentence into one fixed-size vector.

  - input: model - embedding lookup exposing `vector_size` and `model[token]`;
                   a token without a vector is expected to raise KeyError
           corpus - iterable of sentences, each a sequence of tokens
           strategy - 'mean' divides the summed vector by the number of tokens
                      in the sentence (tokens without a vector still count in
                      the denominator); 'add' or any other value returns the sum
  - output: list with one numpy array of length `model.vector_size` per sentence
  '''
  output = []
  for sentence in corpus:
    vector_ = np.zeros(model.vector_size)
    for token in sentence:
      try:
        vector_ = vector_ + model[token]
      except KeyError:
        # Out-of-vocabulary token: contributes nothing to the sum. Only
        # KeyError is caught so real errors (shape mismatch, interrupts)
        # are no longer silently swallowed.
        continue
    # Guard the division: an empty sentence keeps its all-zero vector
    # instead of turning into NaN through 0/0.
    if strategy == 'mean' and len(sentence) > 0:
      vector_ = vector_ / len(sentence)
    output.append(vector_)
  # output is a list
  return output

In [None]:
# Mean-pooled word2vec sentence vectors, built from the *_2 token lists
# (i.e. after tweets with no tokens were removed).
X_train_w2vec = convert_word2vec(word2vec, x_train_tokenized_2, strategy='mean')
X_val_w2vec = convert_word2vec(word2vec, x_val_tokenized_2, strategy='mean')
X_test_w2vec = convert_word2vec(word2vec, x_test_tokenized_2, strategy='mean')

In [None]:
# Split each emoji string into a list of its characters (Unicode code points).
# NOTE(review): list() splits by code point, so modifier sequences such as
# skin-tone emojis become several entries - confirm this matches the emoji2vec vocabulary.
#
# The tweet-side vectors and labels are built from the *_2 lists, which skip the
# positions recorded in empty_indices_*. The same positions are removed here so
# the emoji vectors stay row-aligned with the word2vec vectors and labels
# (otherwise averaging/concatenating them fails as soon as one tweet is empty).
df_train['emoji_sentence_list'] = df_train['emoji_sentence'].apply(list)
emoji_corpus_train = np.delete(df_train['emoji_sentence_list'].values, empty_indices_train)

df_val['emoji_sentence_list'] = df_val['emoji_sentence'].apply(list)
emoji_corpus_val = np.delete(df_val['emoji_sentence_list'].values, empty_indices_val)

df_test['emoji_sentence_list'] = df_test['emoji_sentence'].apply(list)
emoji_corpus_test = np.delete(df_test['emoji_sentence_list'].values, empty_indices_test)

In [None]:
def convert_emoji2vec(model, corpus, strategy):
  '''
  Pool the emoji embeddings of each emoji sequence into one fixed-size vector.

  - input: model - embedding lookup exposing `vector_size` and `model[emoji]`;
                   an emoji without a vector is expected to raise KeyError
           corpus - iterable of emoji sequences (each a sequence of characters)
           strategy - 'mean' divides the summed vector by the number of entries
                      in the sequence (entries without a vector still count in
                      the denominator); 'add' or any other value returns the sum
  - output: list with one numpy array of length `model.vector_size` per sequence
  '''
  output = []
  for emojis in corpus:
    vector_ = np.zeros(model.vector_size)
    for emoji in emojis:
      try:
        vector_ = vector_ + model[emoji]
      except KeyError:
        # Entry without an embedding: contributes nothing to the sum. Only
        # KeyError is caught so real errors are no longer silently swallowed.
        continue
    # Guard the division: an empty sequence keeps its all-zero vector
    # instead of turning into NaN through 0/0.
    if strategy == 'mean' and len(emojis) > 0:
      vector_ = vector_ / len(emojis)
    output.append(vector_)
  # output is a list
  return output

In [None]:
# Mean-pooled emoji2vec vectors, one per emoji sequence in each split.
X_train_e2vec = convert_emoji2vec(e2v, emoji_corpus_train, strategy='mean')
X_val_e2vec = convert_emoji2vec(e2v, emoji_corpus_val, strategy='mean')
X_test_e2vec = convert_emoji2vec(e2v, emoji_corpus_test, strategy='mean')

In [None]:
# Number of word2vec sentence vectors per split and the length of one vector.
for vectors in (X_train_w2vec, X_val_w2vec, X_test_w2vec):
  print(len(vectors))
  print(len(vectors[4]))

15540
300
2199
300
4438
300


In [None]:
# Number of emoji2vec vectors per split and the length of one vector.
for vectors in (X_train_e2vec, X_val_e2vec, X_test_e2vec):
  print(len(vectors))
  print(len(vectors[4]))

15540
300
2199
300
4438
300


#### Averaged Vectors 

In [None]:
# Element-wise average of the tweet vector and the emoji vector of every example.
# Multiplying by 0.5 gives exactly the same floats as dividing by 2.
X_train_vec = 0.5 * (np.array(X_train_w2vec) + np.array(X_train_e2vec))
X_val_vec = 0.5 * (np.array(X_val_w2vec) + np.array(X_val_e2vec))
X_test_vec = 0.5 * (np.array(X_test_w2vec) + np.array(X_test_e2vec))

In [None]:
X_train_w2vec[0][:4]

array([ 0.04621582, -0.00656535,  0.04308268,  0.0743042 ])

In [None]:
X_train_e2vec[0][:4]

array([ 0.03221022,  0.03802984, -0.00126745,  0.07279918])

In [None]:
X_train_vec[0][:4]

array([0.03921302, 0.01573225, 0.02090762, 0.07355169])

In [None]:
len(X_train_vec)

15540

#### Concatenated Vectors

In [None]:
# Join the tweet vector and the emoji vector of every example side by side (axis=1).
X_train_vec_concat = np.concatenate((X_train_w2vec, X_train_e2vec), axis=1) 
X_val_vec_concat  = np.concatenate((X_val_w2vec, X_val_e2vec), axis=1) 
X_test_vec_concat  = np.concatenate((X_test_w2vec, X_test_e2vec), axis=1) 

In [None]:
# Row counts of the train vectors, then the per-row width before and after concatenation.
for rows in (X_train_w2vec, X_train_e2vec, X_train_vec_concat):
  print(len(rows))
for rows in (X_train_w2vec, X_train_vec_concat):
  print(len(rows[4]))

15540
15540
15540
300
600


In [None]:
len(X_train_vec_concat)

15540

In [None]:
X_train_vec_concat.shape

(15540, 600)

### Save Vectors

In [None]:
# Save the averaged vectors of the full dataset (train, val, test - in that order).
with open('/content/drive/MyDrive/2021_NLU/data/full_data/full_vec.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([X_train_vec, X_val_vec, X_test_vec], f)

In [None]:
# Reload the averaged vectors of the full dataset.
with open('/content/drive/MyDrive/2021_NLU/data/full_data/full_vec.pkl', 'rb') as f:  
    X_train_vec, X_val_vec, X_test_vec = pickle.load(f)

In [None]:
# Save the concatenated vectors of the full dataset (train, val, test - in that order).
with open('/content/drive/MyDrive/2021_NLU/data/full_data/full_vec_concat.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([X_train_vec_concat, X_val_vec_concat, X_test_vec_concat], f)

In [None]:
# Reload the concatenated vectors. This must read full_vec_concat.pkl: reading
# full_vec.pkl (as this cell did before) silently put the averaged vectors
# into the *_concat names.
with open('/content/drive/MyDrive/2021_NLU/data/full_data/full_vec_concat.pkl', 'rb') as f:
    X_train_vec_concat, X_val_vec_concat, X_test_vec_concat = pickle.load(f)

## (ONLY) SINGLE EMOJI - Download, Preprocess, Create Vectors

In [None]:
# Train / validation / test splits of the single-emoji dataset.
df_sing_train = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/single_emoji/emoji_nsp_dataset_single_emoji_train.csv')
df_sing_val = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/single_emoji/emoji_nsp_dataset_single_emoji_valid.csv')
df_sing_test = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/single_emoji/emoji_nsp_dataset_single_emoji_test.csv')

In [None]:
df_sing_train.head()

Unnamed: 0,index,tweets,emoji_sentence,follows?
0,67113,Craving Black Cake,,0
1,35900,"y’all i was kidding, pls don’t attack me",😭,1
2,1045,omg bye so true,😭,1
3,11764,Nooo I’m at the end of the og Futurama eps,😢,1
4,1642,[USER] [USER] [USER] [USER] Not worth wasting ...,😍,1


In [None]:
# Report the split sizes, drop rows with missing values and exact duplicate rows,
# then report the sizes again.
for frame in (df_sing_train, df_sing_val, df_sing_test):
  print(len(frame))

df_sing_train, df_sing_val, df_sing_test = (
  frame.dropna().drop_duplicates() for frame in (df_sing_train, df_sing_val, df_sing_test)
)

for frame in (df_sing_train, df_sing_val, df_sing_test):
  print(len(frame))

15015
2153
4341
15015
2153
4341


In [None]:
# sample_tokenized, sample_normal = clean_text(sample)
# Clean and tokenize the tweets of the single-emoji splits
# (*_tokenized: token lists, *_normal: tokens re-joined with single spaces).
x_train_sing_tokenized, x_train_sing_normal = clean_text(df_sing_train['tweets'].values)
x_val_sing_tokenized, x_val_sing_normal = clean_text(df_sing_val['tweets'].values)
x_test_sing_tokenized, x_test_sing_normal = clean_text(df_sing_test['tweets'].values)


       ##### Lowercasing Done! Time Taken -  0.011325836181640625

       ##### Punctuation removed! Time Taken -  0.07462406158447266

       ##### Whitespace removed! Time Taken -  0.020409584045410156

       ##### Tokenization Done using NLTK! Time Taken -  1.7219054698944092

       ##### Lowercasing Done! Time Taken -  0.0008127689361572266

       ##### Punctuation removed! Time Taken -  0.009940624237060547

       ##### Whitespace removed! Time Taken -  0.0026712417602539062

       ##### Tokenization Done using NLTK! Time Taken -  0.24480819702148438

       ##### Lowercasing Done! Time Taken -  0.0017445087432861328

       ##### Punctuation removed! Time Taken -  0.023852109909057617

       ##### Whitespace removed! Time Taken -  0.006357908248901367

       ##### Tokenization Done using NLTK! Time Taken -  0.4971282482147217


In [None]:
# Targets: the 'follows?' column of each single-emoji split, as numpy arrays.
y_sing_train = df_sing_train['follows?'].values
y_sing_val = df_sing_val['follows?'].values
y_sing_test = df_sing_test['follows?'].values

In [None]:
# Single-emoji train split: keep only examples whose tweet produced at least one
# token and record the positions of the ones that were dropped.
y_train_sing_2 = []
x_train_sing_tokenized_2 = []
x_train_sing_normal_2 = []
empty_indices_sing_train = []

for i, tokens in enumerate(x_train_sing_tokenized):
  if len(tokens) == 0:
    empty_indices_sing_train.append(i)
    continue
  x_train_sing_tokenized_2.append(tokens)
  x_train_sing_normal_2.append(x_train_sing_normal[i])
  y_train_sing_2.append(y_sing_train[i])

In [None]:
# Single-emoji validation split: keep only examples whose tweet produced at least
# one token and record the positions of the ones that were dropped.
y_val_sing_2 = []
x_val_sing_tokenized_2 = []
x_val_sing_normal_2 = []
empty_indices_sing_val = []

for i, tokens in enumerate(x_val_sing_tokenized):
  if len(tokens) == 0:
    empty_indices_sing_val.append(i)
    continue
  x_val_sing_tokenized_2.append(tokens)
  x_val_sing_normal_2.append(x_val_sing_normal[i])
  y_val_sing_2.append(y_sing_val[i])

In [None]:
# Single-emoji test split: keep only examples whose tweet produced at least one
# token and record the positions of the ones that were dropped.
y_test_sing_2 = []
x_test_sing_tokenized_2 = []
x_test_sing_normal_2 = []
empty_indices_sing_test = []

for i, tokens in enumerate(x_test_sing_tokenized):
  if len(tokens) == 0:
    empty_indices_sing_test.append(i)
    continue
  x_test_sing_tokenized_2.append(tokens)
  x_test_sing_normal_2.append(x_test_sing_normal[i])
  y_test_sing_2.append(y_sing_test[i])

In [None]:
# Per single-emoji split: the empty-tweet positions, how many there are, and the
# corpus sizes before and after filtering.
for empties, tokenized, tokenized_2, normal_2, labels_2 in (
  (empty_indices_sing_train, x_train_sing_tokenized, x_train_sing_tokenized_2, x_train_sing_normal_2, y_train_sing_2),
  (empty_indices_sing_val, x_val_sing_tokenized, x_val_sing_tokenized_2, x_val_sing_normal_2, y_val_sing_2),
  (empty_indices_sing_test, x_test_sing_tokenized, x_test_sing_tokenized_2, x_test_sing_normal_2, y_test_sing_2),
):
  print(empties)
  print(len(empties))
  print(len(tokenized))
  print(len(tokenized_2))
  print(len(normal_2))
  print(len(labels_2))

[]
0
15015
15015
15015
15015
[]
0
2153
2153
2153
2153
[]
0
4341
4341
4341
4341


In [None]:
# Persist every preprocessing artifact of the single-emoji dataset in a single pickle.
# The order of this list must match the unpacking order used when the file is loaded.
with open('/content/drive/MyDrive/2021_NLU/data/single_emoji/single_emoji.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([x_train_sing_tokenized, x_train_sing_normal, x_val_sing_tokenized, x_val_sing_normal, x_test_sing_tokenized, x_test_sing_normal, empty_indices_sing_train, x_train_sing_tokenized_2, \
                 x_train_sing_normal_2, empty_indices_sing_val, x_val_sing_tokenized_2, x_val_sing_normal_2, y_sing_train, y_sing_val, y_train_sing_2, y_val_sing_2, y_sing_test, y_test_sing_2,\
                 empty_indices_sing_test, x_test_sing_normal_2, x_test_sing_tokenized_2 ], f)

In [None]:
# Restore the single-emoji artifacts; the unpacking order mirrors the list that was pickled.
with open('/content/drive/MyDrive/2021_NLU/data/single_emoji/single_emoji.pkl', 'rb') as f:  
    x_train_sing_tokenized, x_train_sing_normal, x_val_sing_tokenized, x_val_sing_normal, x_test_sing_tokenized, x_test_sing_normal, empty_indices_sing_train, x_train_sing_tokenized_2, \
                 x_train_sing_normal_2, empty_indices_sing_val, x_val_sing_tokenized_2, x_val_sing_normal_2, y_sing_train, y_sing_val, y_train_sing_2, y_val_sing_2, y_sing_test, y_test_sing_2,\
                 empty_indices_sing_test, x_test_sing_normal_2, x_test_sing_tokenized_2 = pickle.load(f)

In [None]:
len(x_train_sing_tokenized_2)

15015

In [None]:
# Mean-pooled word2vec sentence vectors for the single-emoji splits,
# built from the *_2 token lists (after tweets with no tokens were removed).
X_train_sing_w2vec = convert_word2vec(word2vec, x_train_sing_tokenized_2, strategy='mean')
X_val_sing_w2vec = convert_word2vec(word2vec, x_val_sing_tokenized_2, strategy='mean')
X_test_sing_w2vec = convert_word2vec(word2vec, x_test_sing_tokenized_2, strategy='mean')

In [None]:
# Split each emoji string into a list of its characters (Unicode code points).
# NOTE(review): list() splits by code point, so an emoji made of several code
# points becomes several entries - confirm this matches the emoji2vec vocabulary.
#
# The tweet-side vectors and labels are built from the *_2 lists, which skip the
# positions recorded in empty_indices_sing_*. The same positions are removed here
# so the emoji vectors stay row-aligned with the word2vec vectors and labels.
df_sing_train['emoji_sentence_list'] = df_sing_train['emoji_sentence'].apply(list)
emoji_corpus_sing_train = np.delete(df_sing_train['emoji_sentence_list'].values, empty_indices_sing_train)

df_sing_val['emoji_sentence_list'] = df_sing_val['emoji_sentence'].apply(list)
emoji_corpus_sing_val = np.delete(df_sing_val['emoji_sentence_list'].values, empty_indices_sing_val)

df_sing_test['emoji_sentence_list'] = df_sing_test['emoji_sentence'].apply(list)
emoji_corpus_sing_test = np.delete(df_sing_test['emoji_sentence_list'].values, empty_indices_sing_test)

In [None]:
# Mean-pooled emoji2vec vectors, one per emoji sequence in each single-emoji split.
X_train_sing_e2vec = convert_emoji2vec(e2v, emoji_corpus_sing_train, strategy='mean')
X_val_sing_e2vec = convert_emoji2vec(e2v, emoji_corpus_sing_val, strategy='mean')
X_test_sing_e2vec = convert_emoji2vec(e2v, emoji_corpus_sing_test, strategy='mean')

In [None]:
# Number of word2vec sentence vectors per single-emoji split and the length of one vector.
for vectors in (X_train_sing_w2vec, X_val_sing_w2vec, X_test_sing_w2vec):
  print(len(vectors))
  print(len(vectors[4]))

15015
300
2153
300
4341
300


In [None]:
# Number of emoji2vec vectors per single-emoji split and the length of one vector.
for vectors in (X_train_sing_e2vec, X_val_sing_e2vec, X_test_sing_e2vec):
  print(len(vectors))
  print(len(vectors[4]))

15015
300
2153
300
4341
300


#### Averaged Vectors

In [None]:
# Element-wise average of the tweet vector and the emoji vector of every example.
# Multiplying by 0.5 gives exactly the same floats as dividing by 2.
X_train_sing_vec = 0.5 * (np.array(X_train_sing_w2vec) + np.array(X_train_sing_e2vec))
X_val_sing_vec = 0.5 * (np.array(X_val_sing_w2vec) + np.array(X_val_sing_e2vec))
X_test_sing_vec = 0.5 * (np.array(X_test_sing_w2vec) + np.array(X_test_sing_e2vec))

In [None]:
# Save the averaged vectors of the single-emoji dataset (train, val, test - in that order).
with open('/content/drive/MyDrive/2021_NLU/data/single_emoji/single_emoji_vec.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([X_train_sing_vec, X_val_sing_vec, X_test_sing_vec], f)

In [None]:
# Reload the averaged vectors of the single-emoji dataset.
with open('/content/drive/MyDrive/2021_NLU/data/single_emoji/single_emoji_vec.pkl', 'rb') as f:  
    X_train_sing_vec, X_val_sing_vec, X_test_sing_vec = pickle.load(f)

#### Concatenated Vectors

In [None]:
# Join the tweet vector and the emoji vector of every example side by side (axis=1).
X_train_sing_vec_concat = np.concatenate((X_train_sing_w2vec, X_train_sing_e2vec), axis=1) 
X_val_sing_vec_concat  = np.concatenate((X_val_sing_w2vec, X_val_sing_e2vec), axis=1) 
X_test_sing_vec_concat  = np.concatenate((X_test_sing_w2vec, X_test_sing_e2vec), axis=1) 

In [None]:
# Save the concatenated vectors of the single-emoji dataset (train, val, test - in that order).
with open('/content/drive/MyDrive/2021_NLU/data/single_emoji/single_emoji_vec_concat.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([X_train_sing_vec_concat, X_val_sing_vec_concat, X_test_sing_vec_concat], f)

In [None]:
# Reload the concatenated vectors of the single-emoji dataset.
with open('/content/drive/MyDrive/2021_NLU/data/single_emoji/single_emoji_vec_concat.pkl', 'rb') as f:  
    X_train_sing_vec_concat, X_val_sing_vec_concat, X_test_sing_vec_concat = pickle.load(f)

## (ONLY) MULTI EMOJIS WITH REPEATS - Download, Preprocess, Create Vectors

In [None]:
# Train / validation / test splits of the multi-emoji (with repeats) dataset.
df_mul_repeat_train = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/multi_emoji/emoji_nsp_dataset_multi_emoji_train.csv')
df_mul_repeat_val = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/multi_emoji/emoji_nsp_dataset_multi_emoji_valid.csv')
df_mul_repeat_test = pd.read_csv('/content/drive/MyDrive/2021_NLU/data/multi_emoji/emoji_nsp_dataset_multi_emoji_test.csv')

In [None]:
df_mul_repeat_train.head()

Unnamed: 0,index,tweets,emoji_sentence,follows?
0,6704,good morning [USER] you woke up so early anyw...,😶😊💚,1
1,10651,MVP 4x Scoring champ 3x All NBA FIRST TEAM 3...,✅✅✅✅,1
2,6345,[USER] Your tears baby boy ...Sleep sad,🌚🌚👌,1
3,5268,15 php ---&gt; Follow [USER] ---&gt; RT &amp; ...,💗📌,1
4,5897,this emoji looks like sia,😭🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙🍙,1


In [None]:
# Report the split sizes, drop rows with missing values and exact duplicate rows,
# then report the sizes again.
for frame in (df_mul_repeat_train, df_mul_repeat_val, df_mul_repeat_test):
  print(len(frame))

df_mul_repeat_train, df_mul_repeat_val, df_mul_repeat_test = (
  frame.dropna().drop_duplicates()
  for frame in (df_mul_repeat_train, df_mul_repeat_val, df_mul_repeat_test)
)

for frame in (df_mul_repeat_train, df_mul_repeat_val, df_mul_repeat_test):
  print(len(frame))

16801
2401
4800
16801
2401
4800


In [None]:
# sample_tokenized, sample_normal = clean_text(sample)
# Clean and tokenize the tweets of the multi-emoji splits
# (*_tokenized: token lists, *_normal: tokens re-joined with single spaces).
x_train_mul_repeat_tokenized, x_train_mul_repeat_normal = clean_text(df_mul_repeat_train['tweets'].values)
x_val_mul_repeat_tokenized, x_val_mul_repeat_normal = clean_text(df_mul_repeat_val['tweets'].values)
x_test_mul_repeat_tokenized, x_test_mul_repeat_normal = clean_text(df_mul_repeat_test['tweets'].values)


       ##### Lowercasing Done! Time Taken -  0.010810375213623047

       ##### Punctuation removed! Time Taken -  0.1181480884552002

       ##### Whitespace removed! Time Taken -  0.03280997276306152

       ##### Tokenization Done using NLTK! Time Taken -  2.2395598888397217

       ##### Lowercasing Done! Time Taken -  0.0016427040100097656

       ##### Punctuation removed! Time Taken -  0.01767277717590332

       ##### Whitespace removed! Time Taken -  0.004660844802856445

       ##### Tokenization Done using NLTK! Time Taken -  0.321491003036499

       ##### Lowercasing Done! Time Taken -  0.0034494400024414062

       ##### Punctuation removed! Time Taken -  0.03859567642211914

       ##### Whitespace removed! Time Taken -  0.009351968765258789

       ##### Tokenization Done using NLTK! Time Taken -  0.6670784950256348


In [None]:
# Targets: the 'follows?' column of each multi-emoji split, as numpy arrays.
y_mul_repeat_train = df_mul_repeat_train['follows?'].values
y_mul_repeat_val = df_mul_repeat_val['follows?'].values
y_mul_repeat_test = df_mul_repeat_test['follows?'].values

In [None]:
df_mul_repeat_test.shape

(4789, 3)

In [None]:
df_mul_repeat_test.head()

Unnamed: 0,index,tweets,emoji_sentence,follows?
0,20092,[USER] CONGRATS! This is...SO CUTE...//clenche...,💎🙌🚀,0
1,12536,Hair Appt Booked Just Need To Go Shopping For...,✊🏽,0
2,394,[USER] Lmao so true 3 for y'all 1 for usHoping...,💙✨,1
3,8301,[USER] Happy happy birthday!!! . Your day soun...,🎉🎉🎉🤯,1
4,1226,My coworker hates going to restaurants with me...,🍮💜,1


In [None]:
# The tokenized test tweets and the test labels should have the same length.
for seq in (x_test_mul_repeat_tokenized, y_mul_repeat_test):
  print(len(seq))

4789
4789


In [None]:
# Multi-emoji train split: keep only examples whose tweet produced at least one
# token and record the positions of the ones that were dropped.
y_train_mul_repeat_2 = []
x_train_mul_repeat_tokenized_2 = []
x_train_mul_repeat_normal_2 = []
empty_indices_mul_repeat_train = []

for i, tokens in enumerate(x_train_mul_repeat_tokenized):
  if len(tokens) == 0:
    empty_indices_mul_repeat_train.append(i)
    continue
  x_train_mul_repeat_tokenized_2.append(tokens)
  x_train_mul_repeat_normal_2.append(x_train_mul_repeat_normal[i])
  y_train_mul_repeat_2.append(y_mul_repeat_train[i])

In [None]:
# Multi-emoji validation split: keep only examples whose tweet produced at least
# one token and record the positions of the ones that were dropped.
y_val_mul_repeat_2 = []
x_val_mul_repeat_tokenized_2 = []
x_val_mul_repeat_normal_2 = []
empty_indices_mul_repeat_val = []

for i, tokens in enumerate(x_val_mul_repeat_tokenized):
  if len(tokens) == 0:
    empty_indices_mul_repeat_val.append(i)
    continue
  x_val_mul_repeat_tokenized_2.append(tokens)
  x_val_mul_repeat_normal_2.append(x_val_mul_repeat_normal[i])
  y_val_mul_repeat_2.append(y_mul_repeat_val[i])

In [None]:
# Test split: keep only tweets that still contain tokens after cleaning and remember
# the positions of the ones that ended up empty.
empty_indices_mul_repeat_test = []   # positions whose token list is empty
x_test_mul_repeat_tokenized_2 = []   # surviving token lists
x_test_mul_repeat_normal_2 = []      # surviving cleaned strings
y_test_mul_repeat_2 = []             # labels of the surviving rows

for row, tokens in enumerate(x_test_mul_repeat_tokenized):
  if len(tokens) != 0:
    x_test_mul_repeat_tokenized_2.append(tokens)
    x_test_mul_repeat_normal_2.append(x_test_mul_repeat_normal[row])
    y_test_mul_repeat_2.append(y_mul_repeat_test[row])
    continue
  empty_indices_mul_repeat_test.append(row)

In [None]:
# For each split print, one value per line: the dropped positions, how many were dropped,
# the size before filtering, and the sizes of the three filtered lists (tokens, text, labels).
print(
    empty_indices_mul_repeat_train,
    len(empty_indices_mul_repeat_train),
    len(x_train_mul_repeat_tokenized),
    len(x_train_mul_repeat_tokenized_2),
    len(x_train_mul_repeat_normal_2),
    len(y_train_mul_repeat_2),
    sep='\n',
)

print(
    empty_indices_mul_repeat_val,
    len(empty_indices_mul_repeat_val),
    len(x_val_mul_repeat_tokenized),
    len(x_val_mul_repeat_tokenized_2),
    len(x_val_mul_repeat_normal_2),
    len(y_val_mul_repeat_2),
    sep='\n',
)

print(
    empty_indices_mul_repeat_test,
    len(empty_indices_mul_repeat_test),
    len(x_test_mul_repeat_tokenized),
    len(x_test_mul_repeat_tokenized_2),
    len(x_test_mul_repeat_normal_2),
    len(y_test_mul_repeat_2),
    sep='\n',
)

[]
0
16801
16801
16801
16801
[]
0
2401
2401
2401
2401
[]
0
4800
4800
4800
4800


In [None]:
# Checkpoint every preprocessing artefact of the multi-emoji (with repeats) data set in a
# single pickle: raw and filtered token lists, cleaned strings, labels and the positions of
# empty rows for train/val/test.
# The objects are stored as one flat list of 21 items; the cell that reloads this file
# unpacks them positionally, so the order below must stay identical to the order there.
with open('/content/drive/MyDrive/2021_NLU/data/multi_emoji/mul_repeat_emojis.pkl', 'wb') as f:  # binary mode is required by pickle
    pickle.dump([x_train_mul_repeat_tokenized, x_train_mul_repeat_normal, x_val_mul_repeat_tokenized, x_val_mul_repeat_normal, x_test_mul_repeat_tokenized, x_test_mul_repeat_normal, empty_indices_mul_repeat_train, x_train_mul_repeat_tokenized_2, \
                 x_train_mul_repeat_normal_2, empty_indices_mul_repeat_val, x_val_mul_repeat_tokenized_2, x_val_mul_repeat_normal_2, y_mul_repeat_train, y_mul_repeat_val, y_train_mul_repeat_2, y_val_mul_repeat_2, y_mul_repeat_test, y_test_mul_repeat_2,\
                 empty_indices_mul_repeat_test, x_test_mul_repeat_normal_2, x_test_mul_repeat_tokenized_2], f)

In [None]:
# Restore the preprocessing checkpoint so the cleaning cells need not be re-run.
# The 21 names are unpacked positionally and must stay in the same order as the list
# written by the matching pickle.dump cell.
# NOTE(review): pickle.load can execute arbitrary code contained in the file — only load
# checkpoints that were written by this notebook.
with open('/content/drive/MyDrive/2021_NLU/data/multi_emoji/mul_repeat_emojis.pkl', 'rb') as f:  
    x_train_mul_repeat_tokenized, x_train_mul_repeat_normal, x_val_mul_repeat_tokenized, x_val_mul_repeat_normal, x_test_mul_repeat_tokenized, x_test_mul_repeat_normal, empty_indices_mul_repeat_train, x_train_mul_repeat_tokenized_2, \
                 x_train_mul_repeat_normal_2, empty_indices_mul_repeat_val, x_val_mul_repeat_tokenized_2, x_val_mul_repeat_normal_2, y_mul_repeat_train, y_mul_repeat_val, y_train_mul_repeat_2, y_val_mul_repeat_2, y_mul_repeat_test, y_test_mul_repeat_2,\
                 empty_indices_mul_repeat_test, x_test_mul_repeat_normal_2, x_test_mul_repeat_tokenized_2 = pickle.load(f)

In [None]:
# Embed the filtered token lists of each split with word2vec using the 'mean' strategy
# (the recorded output below shows 300 values per tweet).
X_train_mul_repeat_w2vec, X_val_mul_repeat_w2vec, X_test_mul_repeat_w2vec = (
    convert_word2vec(word2vec, token_lists, strategy='mean')
    for token_lists in (
        x_train_mul_repeat_tokenized_2,
        x_val_mul_repeat_tokenized_2,
        x_test_mul_repeat_tokenized_2,
    )
)

In [None]:
# Build the emoji corpus: every emoji string becomes a list of single characters so that
# each element can be embedded separately by convert_emoji2vec.
# NOTE(review): list(str) splits on Unicode code points, so sequences such as skin-tone
# variants turn into several elements — confirm convert_emoji2vec tolerates that.
#
# The word2vec features and labels are built from the *_2 lists, i.e. with the rows in
# empty_indices_mul_repeat_* removed. The same positions are dropped here, otherwise the
# emoji vectors would stop lining up row-for-row with the tweets and labels as soon as one
# tweet tokenises to nothing. With no empty rows this is a no-op.
# NOTE(review): assumes the token lists were produced from these dataframes in row order,
# so that the stored positions refer to the same rows — confirm against the clean_text cell.
df_mul_repeat_train['emoji_sentence_list'] = df_mul_repeat_train['emoji_sentence'].apply(lambda x: list(x))
emoji_corpus_mul_repeat_train = df_mul_repeat_train['emoji_sentence_list'].values
emoji_corpus_mul_repeat_train = emoji_corpus_mul_repeat_train[
    ~np.isin(np.arange(len(emoji_corpus_mul_repeat_train)), empty_indices_mul_repeat_train)
]
assert len(emoji_corpus_mul_repeat_train) == len(x_train_mul_repeat_tokenized_2), \
    "train: emoji corpus and tokenised tweets are misaligned"

df_mul_repeat_val['emoji_sentence_list'] = df_mul_repeat_val['emoji_sentence'].apply(lambda x: list(x))
emoji_corpus_mul_repeat_val = df_mul_repeat_val['emoji_sentence_list'].values
emoji_corpus_mul_repeat_val = emoji_corpus_mul_repeat_val[
    ~np.isin(np.arange(len(emoji_corpus_mul_repeat_val)), empty_indices_mul_repeat_val)
]
assert len(emoji_corpus_mul_repeat_val) == len(x_val_mul_repeat_tokenized_2), \
    "val: emoji corpus and tokenised tweets are misaligned"

df_mul_repeat_test['emoji_sentence_list'] = df_mul_repeat_test['emoji_sentence'].apply(lambda x: list(x))
emoji_corpus_mul_repeat_test = df_mul_repeat_test['emoji_sentence_list'].values
emoji_corpus_mul_repeat_test = emoji_corpus_mul_repeat_test[
    ~np.isin(np.arange(len(emoji_corpus_mul_repeat_test)), empty_indices_mul_repeat_test)
]
assert len(emoji_corpus_mul_repeat_test) == len(x_test_mul_repeat_tokenized_2), \
    "test: emoji corpus and tokenised tweets are misaligned"

In [None]:
# Embed the emoji sequence of every row with emoji2vec using the 'mean' strategy
# (the recorded output below shows 300 values per row).
X_train_mul_repeat_e2vec, X_val_mul_repeat_e2vec, X_test_mul_repeat_e2vec = (
    convert_emoji2vec(e2v, emoji_lists, strategy='mean')
    for emoji_lists in (
        emoji_corpus_mul_repeat_train,
        emoji_corpus_mul_repeat_val,
        emoji_corpus_mul_repeat_test,
    )
)

In [None]:
# Number of word2vec rows per split, each followed by the length of one sample vector (row 4).
print(len(X_train_mul_repeat_w2vec), len(X_train_mul_repeat_w2vec[4]), sep='\n')

print(len(X_val_mul_repeat_w2vec), len(X_val_mul_repeat_w2vec[4]), sep='\n')

print(len(X_test_mul_repeat_w2vec), len(X_test_mul_repeat_w2vec[4]), sep='\n')

16801
300
2401
300
4800
300


In [None]:
# Number of emoji2vec rows per split, each followed by the length of one sample vector (row 4).
# These counts must match the word2vec counts for the features to be combined row by row.
print(len(X_train_mul_repeat_e2vec), len(X_train_mul_repeat_e2vec[4]), sep='\n')

print(len(X_val_mul_repeat_e2vec), len(X_val_mul_repeat_e2vec[4]), sep='\n')

print(len(X_test_mul_repeat_e2vec), len(X_test_mul_repeat_e2vec[4]), sep='\n')

16801
300
2401
300
4800
300


#### Averaged Vectors

In [None]:
# Combine text and emoji features by taking the element-wise mean of the word2vec and
# emoji2vec vectors of each row.
X_train_mul_repeat_vec = np.add(np.asarray(X_train_mul_repeat_w2vec), np.asarray(X_train_mul_repeat_e2vec)) / 2
X_val_mul_repeat_vec = np.add(np.asarray(X_val_mul_repeat_w2vec), np.asarray(X_val_mul_repeat_e2vec)) / 2
X_test_mul_repeat_vec = np.add(np.asarray(X_test_mul_repeat_w2vec), np.asarray(X_test_mul_repeat_e2vec)) / 2

In [None]:
# Checkpoint the averaged feature matrices as one list: [train, val, test].
with open('/content/drive/MyDrive/2021_NLU/data/multi_emoji/mul_repeat_emojis_vec.pkl', 'wb') as out_file:
    pickle.dump(
        [X_train_mul_repeat_vec, X_val_mul_repeat_vec, X_test_mul_repeat_vec],
        out_file,
    )

In [None]:
# Reload the averaged feature matrices; the pickle holds them in train, val, test order.
with open('/content/drive/MyDrive/2021_NLU/data/multi_emoji/mul_repeat_emojis_vec.pkl', 'rb') as in_file:
    (X_train_mul_repeat_vec,
     X_val_mul_repeat_vec,
     X_test_mul_repeat_vec) = pickle.load(in_file)

#### Concatenated Vectors

In [None]:
# Alternative feature set: place the emoji2vec vector next to the word2vec vector of the
# same row (column-wise concatenation) instead of averaging the two.
X_train_mul_repeat_vec_concat, X_val_mul_repeat_vec_concat, X_test_mul_repeat_vec_concat = (
    np.concatenate((text_vecs, emoji_vecs), axis=1)
    for text_vecs, emoji_vecs in (
        (X_train_mul_repeat_w2vec, X_train_mul_repeat_e2vec),
        (X_val_mul_repeat_w2vec, X_val_mul_repeat_e2vec),
        (X_test_mul_repeat_w2vec, X_test_mul_repeat_e2vec),
    )
)

In [None]:
# Saving the objects:
with open('/content/drive/MyDrive/2021_NLU/data/multi_emoji/mul_repeat_emojis_vec.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([X_train_mul_repeat_vec_concat, X_val_mul_repeat_vec_concat, X_test_mul_repeat_vec_concat], f)

In [None]:
with open('/content/drive/MyDrive/2021_NLU/data/multi_emoji/mul_repeat_emojis_vec.pkl', 'rb') as f:  
    X_train_mul_repeat_vec_concat, X_val_mul_repeat_vec_concat, X_test_mul_repeat_vec_concat = pickle.load(f)

## FULL DATA (SINGLE AND MULTI) WITH NO REPEATS - Download, Preprocess, Create Vectors

In [None]:
# Load the train / validation / test CSVs of the "no repeats" data set from Drive.
df_mul_train, df_mul_val, df_mul_test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/2021_NLU/data/no_repeats/emoji_nsp_dataset_no_repeats_train.csv',
        '/content/drive/MyDrive/2021_NLU/data/no_repeats/emoji_nsp_dataset_no_repeats_valid.csv',
        '/content/drive/MyDrive/2021_NLU/data/no_repeats/emoji_nsp_dataset_no_repeats_test.csv',
    ),
)

In [None]:
# Preview the first rows of the training split (tweet text, emoji sequence, `follows?` label).
df_mul_train.head()

Unnamed: 0,index,tweets,emoji_sentence,follows?
0,50553,The dababy memes make no sense and that’s why ...,😭,1
1,74541,a year ago today i would be holding my breath ...,🙏,0
2,50992,I told my mama about how the music industry is...,💯,1
3,95343,[USER] [USER] Thankyou guys,💀,0
4,60555,You want new SUBS? Like ️ Retweet Follow me R...,😬,0


In [None]:
# Row counts before cleaning, one per line (train, val, test).
print(len(df_mul_train), len(df_mul_val), len(df_mul_test), sep='\n')

# Discard rows with missing values, then exact duplicate rows, in every split.
df_mul_train, df_mul_val, df_mul_test = (
    frame.dropna().drop_duplicates()
    for frame in (df_mul_train, df_mul_val, df_mul_test)
)

# Row counts after cleaning; identical numbers mean nothing was removed.
print(len(df_mul_train), len(df_mul_val), len(df_mul_test), sep='\n')

15475
2229
4433
15475
2229
4433


In [None]:
# Clean the tweets of each split. clean_text yields a pair per call, unpacked here as
# (tokenised tweets, cleaned text); its log output reports lowercasing, punctuation and
# whitespace removal and NLTK tokenisation.
(
    (x_train_mul_tokenized, x_train_mul_normal),
    (x_val_mul_tokenized, x_val_mul_normal),
    (x_test_mul_tokenized, x_test_mul_normal),
) = (
    clean_text(frame['tweets'].values)
    for frame in (df_mul_train, df_mul_val, df_mul_test)
)


       ##### Lowercasing Done! Time Taken -  0.014504194259643555

       ##### Punctuation removed! Time Taken -  0.08419013023376465

       ##### Whitespace removed! Time Taken -  0.024480104446411133

       ##### Tokenization Done using NLTK! Time Taken -  1.8592925071716309

       ##### Lowercasing Done! Time Taken -  0.000911712646484375

       ##### Punctuation removed! Time Taken -  0.012573957443237305

       ##### Whitespace removed! Time Taken -  0.0032439231872558594

       ##### Tokenization Done using NLTK! Time Taken -  0.26091432571411133

       ##### Lowercasing Done! Time Taken -  0.0018279552459716797

       ##### Punctuation removed! Time Taken -  0.024820804595947266

       ##### Whitespace removed! Time Taken -  0.008264541625976562

       ##### Tokenization Done using NLTK! Time Taken -  0.5202980041503906


In [None]:
# Pull the `follows?` label column out of each split as a NumPy array.
y_mul_train, y_mul_val, y_mul_test = (
    split['follows?'].values
    for split in (df_mul_train, df_mul_val, df_mul_test)
)

In [None]:
# Train split: keep only tweets that still contain tokens after cleaning and remember
# the positions of the ones that ended up empty.
empty_indices_mul_train = []   # positions whose token list is empty
x_train_mul_tokenized_2 = []   # surviving token lists
x_train_mul_normal_2 = []      # surviving cleaned strings
y_train_mul_2 = []             # labels of the surviving rows

for row, tokens in enumerate(x_train_mul_tokenized):
  if len(tokens) != 0:
    x_train_mul_tokenized_2.append(tokens)
    x_train_mul_normal_2.append(x_train_mul_normal[row])
    y_train_mul_2.append(y_mul_train[row])
    continue
  empty_indices_mul_train.append(row)

In [None]:
# Validation split: keep only tweets that still contain tokens after cleaning and
# remember the positions of the ones that ended up empty.
empty_indices_mul_val = []   # positions whose token list is empty
x_val_mul_tokenized_2 = []   # surviving token lists
x_val_mul_normal_2 = []      # surviving cleaned strings
y_val_mul_2 = []             # labels of the surviving rows

for row, tokens in enumerate(x_val_mul_tokenized):
  if len(tokens) != 0:
    x_val_mul_tokenized_2.append(tokens)
    x_val_mul_normal_2.append(x_val_mul_normal[row])
    y_val_mul_2.append(y_mul_val[row])
    continue
  empty_indices_mul_val.append(row)

In [None]:
# Test split: keep only tweets that still contain tokens after cleaning and remember
# the positions of the ones that ended up empty.
empty_indices_mul_test = []   # positions whose token list is empty
x_test_mul_tokenized_2 = []   # surviving token lists
x_test_mul_normal_2 = []      # surviving cleaned strings
y_test_mul_2 = []             # labels of the surviving rows

for row, tokens in enumerate(x_test_mul_tokenized):
  if len(tokens) != 0:
    x_test_mul_tokenized_2.append(tokens)
    x_test_mul_normal_2.append(x_test_mul_normal[row])
    y_test_mul_2.append(y_mul_test[row])
    continue
  empty_indices_mul_test.append(row)

In [None]:
# For each split print, one value per line: the dropped positions, how many were dropped,
# the size before filtering, and the sizes of the three filtered lists (tokens, text, labels).
print(
    empty_indices_mul_train,
    len(empty_indices_mul_train),
    len(x_train_mul_tokenized),
    len(x_train_mul_tokenized_2),
    len(x_train_mul_normal_2),
    len(y_train_mul_2),
    sep='\n',
)

print(
    empty_indices_mul_val,
    len(empty_indices_mul_val),
    len(x_val_mul_tokenized),
    len(x_val_mul_tokenized_2),
    len(x_val_mul_normal_2),
    len(y_val_mul_2),
    sep='\n',
)

print(
    empty_indices_mul_test,
    len(empty_indices_mul_test),
    len(x_test_mul_tokenized),
    len(x_test_mul_tokenized_2),
    len(x_test_mul_normal_2),
    len(y_test_mul_2),
    sep='\n',
)

[]
0
15475
15475
15475
15475
[]
0
2229
2229
2229
2229
[]
0
4433
4433
4433
4433


In [None]:
# Checkpoint every preprocessing artefact of the "no repeats" data set in a single pickle:
# raw and filtered token lists, cleaned strings, labels and the positions of empty rows
# for train/val/test.
# The objects are stored as one flat list of 21 items; the cell that reloads this file
# unpacks them positionally, so the order below must stay identical to the order there.
with open('/content/drive/MyDrive/2021_NLU/data/no_repeats/mul_emojis.pkl', 'wb') as f:  # binary mode is required by pickle
    pickle.dump([x_train_mul_tokenized, x_train_mul_normal, x_val_mul_tokenized, x_val_mul_normal, x_test_mul_tokenized, x_test_mul_normal, empty_indices_mul_train, x_train_mul_tokenized_2, \
                 x_train_mul_normal_2, empty_indices_mul_val, x_val_mul_tokenized_2, x_val_mul_normal_2, y_mul_train, y_mul_val, y_train_mul_2, y_val_mul_2, y_mul_test, y_test_mul_2,\
                 empty_indices_mul_test, x_test_mul_normal_2, x_test_mul_tokenized_2], f)

In [None]:
# Restore the preprocessing checkpoint so the cleaning cells need not be re-run.
# The 21 names are unpacked positionally and must stay in the same order as the list
# written by the matching pickle.dump cell.
# NOTE(review): pickle.load can execute arbitrary code contained in the file — only load
# checkpoints that were written by this notebook.
with open('/content/drive/MyDrive/2021_NLU/data/no_repeats/mul_emojis.pkl', 'rb') as f:  
    x_train_mul_tokenized, x_train_mul_normal, x_val_mul_tokenized, x_val_mul_normal, x_test_mul_tokenized, x_test_mul_normal, empty_indices_mul_train, x_train_mul_tokenized_2, \
                 x_train_mul_normal_2, empty_indices_mul_val, x_val_mul_tokenized_2, x_val_mul_normal_2, y_mul_train, y_mul_val, y_train_mul_2, y_val_mul_2, y_mul_test, y_test_mul_2,\
                 empty_indices_mul_test, x_test_mul_normal_2, x_test_mul_tokenized_2 = pickle.load(f)

In [None]:
# Embed the filtered token lists of each split with word2vec using the 'mean' strategy
# (the recorded output below shows 300 values per tweet).
X_train_mul_w2vec, X_val_mul_w2vec, X_test_mul_w2vec = (
    convert_word2vec(word2vec, token_lists, strategy='mean')
    for token_lists in (
        x_train_mul_tokenized_2,
        x_val_mul_tokenized_2,
        x_test_mul_tokenized_2,
    )
)

In [None]:
# Build the emoji corpus: every emoji string becomes a list of single characters so that
# each element can be embedded separately by convert_emoji2vec.
# NOTE(review): list(str) splits on Unicode code points, so sequences such as skin-tone
# variants turn into several elements — confirm convert_emoji2vec tolerates that.
#
# The word2vec features and labels are built from the *_2 lists, i.e. with the rows in
# empty_indices_mul_* removed. The same positions are dropped here, otherwise the emoji
# vectors would stop lining up row-for-row with the tweets and labels as soon as one tweet
# tokenises to nothing. With no empty rows this is a no-op.
df_mul_train['emoji_sentence_list'] = df_mul_train['emoji_sentence'].apply(lambda x: list(x))
emoji_corpus_mul_train = df_mul_train['emoji_sentence_list'].values
emoji_corpus_mul_train = emoji_corpus_mul_train[
    ~np.isin(np.arange(len(emoji_corpus_mul_train)), empty_indices_mul_train)
]
assert len(emoji_corpus_mul_train) == len(x_train_mul_tokenized_2), \
    "train: emoji corpus and tokenised tweets are misaligned"

df_mul_val['emoji_sentence_list'] = df_mul_val['emoji_sentence'].apply(lambda x: list(x))
emoji_corpus_mul_val = df_mul_val['emoji_sentence_list'].values
emoji_corpus_mul_val = emoji_corpus_mul_val[
    ~np.isin(np.arange(len(emoji_corpus_mul_val)), empty_indices_mul_val)
]
assert len(emoji_corpus_mul_val) == len(x_val_mul_tokenized_2), \
    "val: emoji corpus and tokenised tweets are misaligned"

df_mul_test['emoji_sentence_list'] = df_mul_test['emoji_sentence'].apply(lambda x: list(x))
emoji_corpus_mul_test = df_mul_test['emoji_sentence_list'].values
emoji_corpus_mul_test = emoji_corpus_mul_test[
    ~np.isin(np.arange(len(emoji_corpus_mul_test)), empty_indices_mul_test)
]
assert len(emoji_corpus_mul_test) == len(x_test_mul_tokenized_2), \
    "test: emoji corpus and tokenised tweets are misaligned"

In [None]:
# Embed the emoji sequence of every row with emoji2vec using the 'mean' strategy
# (the recorded output below shows 300 values per row).
X_train_mul_e2vec, X_val_mul_e2vec, X_test_mul_e2vec = (
    convert_emoji2vec(e2v, emoji_lists, strategy='mean')
    for emoji_lists in (
        emoji_corpus_mul_train,
        emoji_corpus_mul_val,
        emoji_corpus_mul_test,
    )
)

In [None]:
# Number of word2vec rows per split, each followed by the length of one sample vector (row 4).
print(len(X_train_mul_w2vec), len(X_train_mul_w2vec[4]), sep='\n')

print(len(X_val_mul_w2vec), len(X_val_mul_w2vec[4]), sep='\n')

print(len(X_test_mul_w2vec), len(X_test_mul_w2vec[4]), sep='\n')

15475
300
2229
300
4433
300


In [None]:
# Number of emoji2vec rows per split, each followed by the length of one sample vector (row 4).
# These counts must match the word2vec counts for the features to be combined row by row.
print(len(X_train_mul_e2vec), len(X_train_mul_e2vec[4]), sep='\n')

print(len(X_val_mul_e2vec), len(X_val_mul_e2vec[4]), sep='\n')

print(len(X_test_mul_e2vec), len(X_test_mul_e2vec[4]), sep='\n')

15475
300
2229
300
4433
300


#### Averaged Vectors

In [None]:
# Combine text and emoji features by taking the element-wise mean of the word2vec and
# emoji2vec vectors of each row.
X_train_mul_vec = np.add(np.asarray(X_train_mul_w2vec), np.asarray(X_train_mul_e2vec)) / 2
X_val_mul_vec = np.add(np.asarray(X_val_mul_w2vec), np.asarray(X_val_mul_e2vec)) / 2
X_test_mul_vec = np.add(np.asarray(X_test_mul_w2vec), np.asarray(X_test_mul_e2vec)) / 2

In [None]:
# Checkpoint the averaged feature matrices as one list: [train, val, test].
with open('/content/drive/MyDrive/2021_NLU/data/no_repeats/mul_emojis_vec.pkl', 'wb') as out_file:
    pickle.dump(
        [X_train_mul_vec, X_val_mul_vec, X_test_mul_vec],
        out_file,
    )

In [None]:
# Reload the averaged feature matrices; the pickle holds them in train, val, test order.
with open('/content/drive/MyDrive/2021_NLU/data/no_repeats/mul_emojis_vec.pkl', 'rb') as in_file:
    (X_train_mul_vec,
     X_val_mul_vec,
     X_test_mul_vec) = pickle.load(in_file)

#### Concatenated Vectors

In [None]:
# Alternative feature set: place the emoji2vec vector next to the word2vec vector of the
# same row (column-wise concatenation) instead of averaging the two.
X_train_mul_vec_concat, X_val_mul_vec_concat, X_test_mul_vec_concat = (
    np.concatenate((text_vecs, emoji_vecs), axis=1)
    for text_vecs, emoji_vecs in (
        (X_train_mul_w2vec, X_train_mul_e2vec),
        (X_val_mul_w2vec, X_val_mul_e2vec),
        (X_test_mul_w2vec, X_test_mul_e2vec),
    )
)

In [None]:
# Saving the objects:
with open('/content/drive/MyDrive/2021_NLU/data/no_repeats/mul_emojis_vec.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([X_train_mul_vec_concat, X_val_mul_vec_concat, X_test_mul_vec_concat], f)

In [None]:
with open('/content/drive/MyDrive/2021_NLU/data/no_repeats/mul_emojis_vec.pkl', 'rb') as f:  
    X_train_mul_vec_concat, X_val_mul_vec_concat, X_test_mul_vec_concat = pickle.load(f)