In [1]:
# Toggle for running on Google Colab. When True, Google Drive is mounted at
# /content/gdrive; the same flag also gates the later cell that copies and
# decompresses the GloVe file into the working directory.
do_mount=False
if do_mount:
  # Imported lazily so the notebook still runs where google.colab is absent.
  from google.colab import drive
  drive.mount('/content/gdrive')

In [2]:
# Name of the training CSV (read from `basepath`; only its 'text' column is used).
TRAIN_INPUT = 'twitgen_train_201906011956.csv'
# Name of the output pickle holding the vocabulary-filtered GloVe vectors
# (written under `basepath`).
EMBED_FILE_NAME = 'glove_train_embeddings.pkl.gz'

In [3]:
# Directory holding the input data and receiving the output pickle.
basepath = '../data/'
# Gzipped GloVe Twitter vectors; the decompressed name is this minus ".gz".
glovefile = 'glove.twitter.27B.200d.txt.gz'
# Full location of the gzipped GloVe file.
glovepath = ''.join((basepath, glovefile))

In [4]:
if do_mount:
  # Get the embedding initialization file: copy the gzipped GloVe vectors from
  # `glovepath` into the working directory, then decompress them in place so
  # the file named `glovefile` minus ".gz" exists for the loading cell.
  # NOTE(review): the loading cell opens that decompressed file regardless of
  # `do_mount`, so when this is skipped the file must already be present in
  # the working directory -- confirm.
  !cp '$glovepath' .
  !gunzip $glovefile
  # List the working directory to check the result.
  !ls -l

In [5]:
import pandas as pd
import numpy as np
import string
import re

In [6]:
def clean_text(text):
    """Normalize a tweet into a lowercase, space-separated token string.

    The text is lowercased, whitespace is collapsed, characters outside a
    small allowed set are blanked, common English contractions are expanded,
    and the remaining punctuation is either dropped or padded with spaces so
    that it becomes its own token.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Tokens are separated by single spaces; a single
        leading or trailing space may remain.
    """
    # NOTE: the previous version called ``text.translate(string.punctuation)``
    # here. Passing a plain string as the translation table does not strip
    # punctuation; it maps characters with ordinals 0-31 onto punctuation
    # (e.g. '\n' -> '+'), which turned line breaks into spurious '+' tokens.
    # Punctuation must also stay in place for the contraction rules below,
    # so the call is simply dropped.

    ## Convert words to lower case and collapse all whitespace runs
    text = " ".join(text.lower().split())

    ## Blank out every character that is not explicitly allowed.
    # Inside the class, `+-=` is a range ('+' through '='), so ':', ';' and
    # '<' are kept as well as the characters listed individually.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    ## Expand contractions (must run before apostrophes are removed)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    ## Drop separators; pad symbols that should survive as their own token
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)

    ## Numeric shorthand: digits followed by "k" become thousands ("5k" -> "5000")
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)

    ## Re-join abbreviations that the punctuation rules above split apart
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # `\0` is the regex escape for a NUL character, so this matches NUL + "s".
    # NULs were already blanked by the first substitution, so it never fires;
    # kept unchanged so the token stream matches the existing pipeline.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)

    ## Collapse the whitespace runs introduced by the substitutions
    text = re.sub(r"\s{2,}", " ", text)

    return text

In [7]:
def get_vocab(corpus):
    """Return the set of distinct tokens found in `corpus` after cleaning.

    Each element of `corpus` is passed through `clean_text` (via the
    corpus's `.map`) and split on whitespace; the union of all resulting
    tokens is returned.
    """
    vocabulary = set()
    for cleaned_tweet in corpus.map(clean_text):
        vocabulary.update(cleaned_tweet.split())
    return vocabulary

In [8]:
# Only the tweet text is needed to build the vocabulary.
train_csv_path = basepath + TRAIN_INPUT
df = pd.read_csv(train_csv_path, usecols=['text'])

In [9]:
# Distinct cleaned tokens across all training tweets.
vocab = get_vocab(df['text'])

In [10]:
# Load the GloVe vectors for the words that occur in `vocab` (not the whole
# embedding file). Reads the decompressed text file, i.e. `glovefile` without
# its ".gz" suffix, from the working directory.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse; the
# encoding is explicit so decoding does not depend on the platform default.
with open(glovefile[:-3], encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Each line is "<word> <v1> <v2> ...".
        word = values[0]
        if word in vocab:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 25458 word vectors.


In [11]:
# One row per word (the index), one column per embedding dimension.
embeddings_frame = pd.DataFrame.from_dict(embeddings_index, orient='index')
embeddings_out_path = basepath + EMBED_FILE_NAME
embeddings_frame.to_pickle(embeddings_out_path)