## Word embeddings with Keras: learning embeddings from scratch, fine-tuning pre-trained embeddings, and looking up embeddings

### Import the libraries and define the arguments

In [5]:
# import the necessary libraries
from keras.layers.core import Dense, Activation, Dropout, SpatialDropout1D 
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

import nltk
import collections
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)

# Path to the training corpus that the vocabulary cell reads line by line.
# NOTE(review): absolute, machine-specific path — must be adjusted to run anywhere else.
INPUT_FILE = "/Users/tkmacl9/Desktop/FastAIDLCourse/umich_sentiment_dataset/train.txt"
# Maximum number of most-frequent words kept when the vocabulary is built.
VOCAB_SIZE = 5000
# Presumably the dimensionality of each word-embedding vector — usage not visible here; confirm.
EMBEDDING_SIZE = 100
# Presumably the number of Conv1D filters — usage not visible here; confirm.
NUM_FILTERS=256
# Presumably the convolution window size in words — usage not visible here; confirm.
NUM_WORDS = 3
# Presumably the mini-batch size used for training — usage not visible here; confirm.
BATCH_SIZE = 64
# Presumably the number of training epochs — usage not visible here; confirm.
NUM_EPOCHS = 20

### Learn Embeddings from Scratch

In [28]:
# Create our vocabulary of the most frequent words in the corpus.
# Each line of INPUT_FILE is split on tabs and the second field is used as the sentence.
# NOTE(review): the first field is presumably the sentiment label — confirm against the dataset.
counter = collections.Counter()
maxlen = 0  # token count of the longest sentence seen so far
# `with` guarantees the file handle is released even if tokenization raises part-way through.
# NOTE(review): no encoding is passed, so the platform default applies — confirm it matches the file.
with open(INPUT_FILE, 'r') as fin:
    for line in fin:
        fields = line.strip().split("\t")
        if len(fields) < 2:
            # Blank or tab-less line has no sentence field: skip it instead of raising IndexError.
            continue
        words = [x.lower() for x in nltk.word_tokenize(fields[1])]
        maxlen = max(maxlen, len(words))
        # Adds one to the count of every token, in order of appearance.
        counter.update(words)

# Create the word indexes and the reverse indexes for the words.
# Word ids start at 1 (most frequent word first), so id 0 is never assigned to a word.
# defaultdict(int) makes a lookup of an unseen word return 0 (and insert that key as a side effect).
word2index = collections.defaultdict(int)
for wid, (word, _count) in enumerate(counter.most_common(VOCAB_SIZE), start=1):
    word2index[word] = wid
# One more than the number of indexed words, accounting for the unassigned id 0.
vocab_size = len(word2index) + 1
index2word = {v:k for k,v in word2index.items()}
print(word2index)
print("")
print(index2word)

defaultdict(<class 'int'>, {'disliked': 541, 'they': 319, 'page': 648, '``': 35, 'groaning': 274, 'before': 367, 'mindless': 970, 'goes': 971, 'crap': 449, 'tournament..': 972, 'homosexuality': 227, 'was': 18, 'so': 32, 'aniwae': 973, 'impressive': 974, 'otp': 975, 'education': 649, 'chinese': 650, 'playing': 535, 'â€': 651, 'chick': 976, 'during': 539, 'hermione': 977, 'possibly': 978, 'correct': 1591, 'all': 310, 'adore': 979, 'few': 448, 'first': 114, "was'harry": 981, 'find': 983, 'problem': 654, 'half': 484, 'around': 183, 'weeeellllllll': 1088, 'im': 450, 'talking': 398, 'vigor': 228, 'criticized': 987, 'touching': 988, 'industry': 681, 'suck': 51, 'must': 451, 'well': 127, 'jackson': 1049, 'erm': 536, 'didnt': 655, '2': 117, 'visually': 993, 'literary': 994, 'anyways': 537, 'mang': 995, 'xd': 996, 'give': 452, 'german..': 998, 'reads': 999, 'bolsters': 1000, 'iii-sucks': 1001, 'lit': 538, 'exhausted': 1002, 'lines': 1003, 'diana': 1005, 'future': 656, 'heard': 133, 'indoctrinate

### Now pad the sentences to the maximum sentence length and create the dataset for the train/test split