In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [3]:
# Read the three pre-processed corpora, one DataFrame per CSV file.
df_short, df_medium, df_dank = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/processed/processed_short.csv",
        "data/processed/processed_medium.csv",
        "data/processed/processed_dank.csv",
    )
)

In [4]:
# Stack the three corpora into one frame with a fresh 0..n-1 index.
df_joint = pd.concat([df_short, df_medium, df_dank], ignore_index=True)
# Shuffle the rows. The fixed seed makes the shuffled order identical on every
# "Restart & Run All", so anything derived from df_train is reproducible.
df_train = df_joint.sample(frac=1, random_state=42).reset_index(drop=True)
df_train

Unnamed: 0,text,label
0,thinl role like sachin standing others hit,0.0
1,think waste rr,0.0
2,movie blaxploitation absolutely plot pimp sto...,1.0
3,saw movie indian friends christmas day quick ...,0.0
4,april good covid vaccin china expect readi cli...,0.0
...,...,...
15559,want come online,0.0
15560,irloth employ stay home work construct irl,0.0
15561,france around march love go film festivals kn...,1.0
15562,movie great stars earlier years ingor stevens...,1.0


In [5]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

# Notebook-wide text vectorizer, adapted later inside the embedding helpers.
vectorizer = TextVectorization(
    max_tokens=20000,
    output_sequence_length=200,
)

In [6]:
def glove_100d(df, glove_path='data/raw/glove.6B.100d.txt', embedding_dim=100):
    """Build a GloVe embedding matrix for the vocabulary of ``df['text']``.

    Side effect: calls ``adapt`` on the notebook-level ``vectorizer`` with the
    text column, so that shared object's vocabulary is changed by this call.

    Args:
        df: DataFrame with a ``text`` column.
        glove_path: Text file with one ``word c1 c2 ...`` entry per line.
            Defaults to the GloVe 6B 100-dimensional file.
        embedding_dim: Number of coefficients per word in ``glove_path``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocabulary) + 2, embedding_dim)``.
        Row ``i`` holds the vector of vocabulary token ``i``; tokens without a
        pre-trained vector, and the two spare trailing rows, stay all-zero.

    Raises:
        ValueError: From the NumPy row assignment, if a matched vector does
            not have ``embedding_dim`` components.
    """
    text = np.asarray(df['text'])
    vectorizer.adapt(text)
    voc = vectorizer.get_vocabulary()

    # Parse the pre-trained vectors: the first field is the word, the rest of
    # the line is its space-separated coefficients.
    embeddings_index = {}
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    print("Found %s word vectors." % len(embeddings_index))

    # Two rows more than the vocabulary size; the extra rows are never written
    # and therefore remain zero.
    num_tokens = len(voc) + 2
    print('num_tokes' + str(num_tokens))
    hits = 0
    misses = 0

    # Prepare embedding matrix; the row index is the token's vocabulary index.
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for i, word in enumerate(voc):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Words not found in the embedding index keep their all-zero row.
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

    return embedding_matrix

In [7]:
# Build the GloVe embedding matrix for the shuffled, combined corpus.
universal = glove_100d(df_train)


Found 400000 word vectors.
num_tokes20002
Converted 18809 words (1191 misses)


In [8]:
# Quick check of what glove_100d returned.
type(universal)

numpy.ndarray

In [9]:
# np.save appends ".npy" when the file name lacks it, so this writes
# embed_matrix.npy relative to the current working directory.
np.save('embed_matrix', universal)

# Generating and saving vectors

In [15]:
def generate_vectors(df, name=None):
    """Train skip-gram Word2Vec vectors on ``df['text']`` and save them to disk.

    Args:
        df: DataFrame with a ``text`` column of whitespace-separated tokens.
            A ``list`` column holding the token lists is added to it in place
            (kept so existing callers see the same frame as before).
        name: Base name of the output file. When omitted, the name of a global
            variable bound to ``df`` is used (e.g. ``df_short``).

    Side effects:
        Writes ``data/<name>.wordvectors``.

    Raises:
        IndexError: If ``name`` is omitted and no global variable refers to
            ``df``.
    """
    if name is None:
        bound_names = [key for key, value in list(globals().items()) if value is df]
        # IPython also binds displayed results to `_`, `_4`, ...; the stable
        # sort moves such aliases behind user-chosen variable names.
        bound_names.sort(key=lambda key: key.startswith('_'))
        if not bound_names:
            raise IndexError(
                "no global variable is bound to this DataFrame; pass name= explicitly"
            )
        name = bound_names[0]

    df['list'] = df["text"].str.split()
    # Rows with missing text split to NaN, which is not an iterable sentence.
    documents = df["list"].dropna().tolist()
    skipgram = Word2Vec(sentences=documents, vector_size=100, window=5, sg=1)
    skipgram.wv.save("data/" + str(name) + ".wordvectors")

In [16]:
# Train and save vectors for the short corpus; the output file name is
# derived from the variable name.
generate_vectors(df_short)

In [17]:
# Train and save vectors for the medium corpus; the output file name is
# derived from the variable name.
generate_vectors(df_medium)

In [18]:
# Train and save vectors for the dank corpus; the output file name is
# derived from the variable name.
generate_vectors(df_dank)

# How to load them:

In [19]:
# mmap='r' asks gensim to memory-map separately stored arrays read-only
# rather than reading them fully into memory.
wv = KeyedVectors.load("data/df_dank.wordvectors", mmap='r')

# Legacy example: building a DIY embedding layer

In [20]:
def create_embedding_layer(df, name=None):
    """Build a frozen Keras ``Embedding`` layer from saved Word2Vec vectors.

    Side effect: calls ``adapt`` on the notebook-level ``vectorizer`` with the
    text column, so that shared object's vocabulary is changed by this call.

    Args:
        df: DataFrame with a ``text`` column.
        name: Base name of the vectors file, ``data/<name>.wordvectors``.
            When omitted, the name of a global variable bound to ``df`` is
            used (e.g. ``df_dank``).

    Returns:
        A non-trainable ``Embedding`` layer with ``len(vocabulary) + 2`` rows,
        initialised from the loaded vectors. Vocabulary tokens that have no
        saved vector, and the two spare trailing rows, are all-zero.

    Raises:
        IndexError: If ``name`` is omitted and no global variable refers to
            ``df``.
    """
    if name is None:
        bound_names = [key for key, value in list(globals().items()) if value is df]
        # IPython also binds displayed results to `_`, `_4`, ...; the stable
        # sort moves such aliases behind user-chosen variable names.
        bound_names.sort(key=lambda key: key.startswith('_'))
        if not bound_names:
            raise IndexError(
                "no global variable is bound to this DataFrame; pass name= explicitly"
            )
        name = bound_names[0]

    text = np.asarray(df['text'])
    vectorizer.adapt(text)
    voc = vectorizer.get_vocabulary()

    vectors_path = "data/" + str(name) + ".wordvectors"
    print(vectors_path)
    wv = KeyedVectors.load(vectors_path, mmap='r')

    num_tokens = len(voc) + 2
    # Take the width from the loaded vectors instead of assuming 100.
    embedding_dim = wv.vector_size
    hits = 0
    misses = 0

    # Fill the matrix straight from the keyed vectors. Missing words are
    # counted as misses and keep their zero row (previously they were
    # pre-filled with zero vectors, so the miss count was always 0).
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for i, word in enumerate(voc):
        if wv.has_index_for(word):
            embedding_matrix[i] = wv[word]
            hits += 1
        else:
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

    embedding_layer = Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    return embedding_layer