In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [2]:
def text_preprocession(Corpus):
    TEXT = Corpus.iloc[:,1]
    # Step - a : Remove blank rows if any.
    TEXT.dropna(inplace=True)
    # Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
    TEXT = [entry.lower() for entry in TEXT]
    # Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
    TEXT = [word_tokenize(entry) for entry in TEXT]
    CorpusList = []
    # for entry in enumerate(TEXT):
    for entry in TEXT:
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = ''
        for word in entry:
            if word not in stopwords.words('english'):
                Final_words = Final_words + word + ' '
        CorpusList.append(Final_words)
    return CorpusList

In [3]:
dataset = pd.read_csv('data/Bai_news_headlines.csv')
news_list = text_preprocession(dataset)

In [4]:
news_list[0]

'precious-gold regains footing equities , libya ; cenbanks eyed '

In [5]:
MAX_NUM_WORDS = 2000
MAX_SEQUENCE_LENGTH = 50
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(news_list)
sequences = tokenizer.texts_to_sequences(news_list)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

new_data = []
for l in data:
    l = list(l)
    while True:
        l.remove(0)
        if 0 not in l:
            break
    new_data.append(l)

Found 8856 unique tokens.


In [6]:
with open('word_embedding_data.txt','w',encoding='utf-8') as f:
    for i in range(len(news_list)):
        f.write(news_list[i]+'\n')
        
with open('word_embedding_doc_term_mat.txt','w',encoding='utf-8') as f:
    for i in range(len(new_data)):
        l = str(new_data[i]).strip('[')
        r = l.strip(']')
        f.write(r+'\n')


with open('word_embedding_vocab.txt','w',encoding='utf-8') as f:
    for key in word_index:       
        f.write(f"{key} {word_index[key]}\n")

In [17]:
d1={"1":1,"2":2,"3":3}
d2={"2":2,"3":1}
{**d1,**d2}

{'1': 1, '2': 2, '3': 1}

In [18]:
embeddings_index = {}
f = open('data/glove.6B.50d.txt',encoding = 'utf-8')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

In [28]:
embeddings_index["the"]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [25]:
word_index["the"]

4023

In [44]:
print('create vocab')
vocab = {}
fp = open("word_embedding_data.txt", 'r',encoding='utf-8')
for line in fp:
    arr = re.split('\s', line[:-1])
    for wd in arr:
        try:
            vocab[wd] += 1
        except:
            vocab[wd] = 1
fp.close()

create vocab


In [48]:
vocab_arr = [[wd, vocab[wd]] for wd in vocab if vocab[wd] >3]
vocab_arr = sorted(vocab_arr, key=lambda k: k[1])[::-1]
vocab_arr = vocab_arr[:2000]
vocab_arr = sorted(vocab_arr)

In [50]:
fout = open("vocab.txt", 'w')
for itm in vocab_arr:
    itm[1] = str(itm[1])
    fout.write(' '.join(itm)+'\n')
fout.close()

In [51]:
vocab_arr[0]

['', '28220']

In [53]:
# vocabulary to id
vocab2id = {itm[1][0]: itm[0] for itm in enumerate(vocab_arr)}
print('create document term matrix')
data_arr = []
fp = open("word_embedding_data.txt", 'r',encoding='utf-8')
fout = open("wedoc_term_mat.txt", 'w')
for line in fp:
    arr = re.split('\s', line[:-1])
    arr = [str(vocab2id[wd]) for wd in arr if wd in vocab2id]
    sen = ' '.join(arr)
    fout.write(sen+'\n')
fp.close()
fout.close()


create document term matrix
