### This notebook is the first part of building a recommendation system with a Keras `Sequential` neural network.
The idea is to generate unique sentences to practice based on the user's error history. 

In [None]:
!pip install keras

In [None]:
!pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Drawing the embeddings
import matplotlib.pyplot as plt

# Deep learning:
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [None]:
# We load custom functions from a python file
from extra.utility import text_preprocessing, create_unique_word_dict,euclidean,find_similar


In [None]:
# Load the sentences CSV and keep its 'text' column as a plain Python list
texts = list(pd.read_csv('data/sentences_test.csv')['text'])
texts

In [None]:
# Number of neighbouring positions, on each side of a word, used as context
window = 2

# Accumulators filled while scanning the sentences:
# word_lists collects the [word, context_word] pairs, all_text every token
word_lists, all_text = [], []

- We go through all the sentences in the dataset and preprocess each one (tokenize and remove stop words).
- We then create pairs of words that occur within a window of 2 words of each other.
  - For two neighbouring words `a` and `b`, both pairs are created: `a,b` and `b,a`.

In [None]:
for raw_sentence in texts:

    # Clean the sentence into a sequence of tokens
    tokens = text_preprocessing(raw_sentence)

    # Keep every token for building the vocabulary later
    all_text.extend(tokens)

    # Pair each token with its neighbours up to `window` positions away
    n_tokens = len(tokens)
    for position, focus in enumerate(tokens):
        for offset in range(1, window + 1):
            # Neighbour `offset` positions ahead, if it exists
            ahead = position + offset
            if ahead < n_tokens:
                word_lists.append([focus, tokens[ahead]])
            # Neighbour `offset` positions behind, if it exists
            behind = position - offset
            if behind >= 0:
                word_lists.append([focus, tokens[behind]])


In [None]:
# Build the vocabulary lookup from every preprocessed token gathered in all_text.
# NOTE(review): later cells treat the result as a dict (they call .get() and
# .keys() on it and use the values as row/column indices) — presumably it maps
# word -> integer index; confirm against extra/utility.py.
unique_word_dict = create_unique_word_dict(all_text)

In [None]:
# All the unique words, in the dictionary's key order
words = list(unique_word_dict)

# Number of features (one per unique word)
n_words = len(words)

In [None]:
# Creating the X and Y matrices using one hot encoding.
# Both matrices are allocated once (one row per word pair, one column per
# vocabulary word) instead of growing Python lists of per-row arrays.
X = np.zeros((len(word_lists), n_words))
Y = np.zeros((len(word_lists), n_words))

# tqdm wraps the list itself (not the enumerate object) so that it can read
# the length and render a progress bar with a total.
for i, (main_word, context_word) in enumerate(tqdm(word_lists)):
    # Getting the indices. Plain indexing raises KeyError for an unknown word;
    # .get() would return None, and NumPy treats a None index as np.newaxis,
    # so the assignments below would silently set the whole row to 1.
    main_word_index = unique_word_dict[main_word]
    context_word_index = unique_word_dict[context_word]

    # One hot encoding the main word
    X[i, main_word_index] = 1

    # One hot encoding the context word
    Y[i, context_word_index] = 1

In [None]:
# Turn both one-hot collections into NumPy arrays
X, Y = np.asarray(X), np.asarray(Y)


In [None]:
# Sanity check: show the dimensions of the one-hot target matrix
# (one row per word pair, one column per vocabulary word).
Y.shape

In [None]:
# Defining the size of the embedding: the number of units in the first Dense
# layer of the model, i.e. the length of each learned word vector.
embed_size = 2

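We build a small neural network with a single hidden layer. The `input dimension = total number of unique words`, which represents our vocabulary; the hidden layer has `embed_size` units; and the `output dimension = size of vocabulary`. After training, the weights of the hidden layer give an embedding of the same dimension (`embed_size`) for each word.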

In [None]:
# Defining the neural network: a linear projection down to `embed_size`
# units followed by a softmax over as many units as Y has columns.
model = Sequential()

# layers
# First layer: one weight row per input feature; these rows are what is
# later read back as the word embeddings.
model.add(Dense(units = embed_size, activation = 'linear', input_dim = X.shape[1]))
# Output layer: one unit per column of Y (the context word to predict).
model.add(Dense(units = Y.shape[1], activation = 'softmax'))


# Compiling the ANN
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

In [None]:

# Training hyper-parameters for fitting the network weights
BATCH_SIZE = 256
N_EPOCHS = 1000

# Optimizing the network weights
model.fit(x=X, y=Y, batch_size=BATCH_SIZE, epochs=N_EPOCHS)


In [None]:
# The first array returned by get_weights() belongs to the first layer added
# to the model; it is indexed by word index when building the embeddings
layer_params = model.get_weights()
weights = layer_params[0]

In [None]:
# Map every unique word to its numeric vector: the row of `weights`
# selected by the word's index in unique_word_dict
embedding_dict = {
    token: weights[unique_word_dict.get(token)]
    for token in words
}

In [None]:
# Scatter every word at its first two embedding coordinates and label the point
fig, ax = plt.subplots(figsize=(10, 10))
for token in unique_word_dict:
    point = embedding_dict.get(token)
    x_pos, y_pos = point[0], point[1]
    ax.scatter(x_pos, y_pos)
    ax.annotate(token, (x_pos, y_pos))


#### Visualization in 3D

We can also visualize the embeddings in 3 dimensions. To do so, change `embed_size` to `3` in the cell that defines it (just before the model is built), re-run all the cells from there, and then run the code cell below. The 3D plot needs embeddings with at least three dimensions.

In [None]:
# Plotting the embeddings in 3D. This needs at least three embedding
# dimensions, so the cell is skipped (instead of raising IndexError on
# coord[2]) when the embeddings are shorter than that.
embedding_dim = len(next(iter(embedding_dict.values()), ()))
if embedding_dim >= 3:
    # Size the figure that actually holds the 3D axes; calling plt.figure()
    # again after add_subplot would open a second, empty figure.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(projection='3d')
    for word in list(unique_word_dict.keys()):
        coord = embedding_dict.get(word)
        ax.scatter(coord[0], coord[1], coord[2])
        ax.text(coord[0], coord[1], coord[2], word)
else:
    print(
        f"Skipping the 3D plot: embeddings have {embedding_dim} dimension(s); "
        "set embed_size = 3 and re-run the notebook from that cell."
    )

In [None]:
# Call the project helper `euclidean` on the first row of X and row 34 of Y
# (presumably it returns the Euclidean distance between the two vectors).
# NOTE(review): X and Y hold the one-hot training rows, not the learned
# embeddings, and the index 34 is hard-coded — confirm this is the intended check.
euclidean(X[0],Y[34])

In [None]:
# Query the project helper `find_similar` for the word 'abandon' with top_n = 5.
# NOTE(review): presumably returns the 5 nearest words in embedding space and
# expects 'abandon' to be a key of embedding_dict — confirm in extra/utility.py.
find_similar('abandon', embedding_dict, top_n = 5)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2D inputs, so each embedding vector is reshaped
# into a single-row matrix before the comparison
success_row = embedding_dict['success'].reshape(1, -1)
bad_row = embedding_dict['bad'].reshape(1, -1)
cosine_similarity(success_row, bad_row)


# Part 2. Embedding for A level words and sentences

In [None]:
# Load the A-level sentences CSV and keep its 'Sentences' column as a list
texts_a = list(pd.read_csv('data/sentences_a.csv')['Sentences'])
texts_a

In [None]:
# Load the A-level unique-words CSV and keep its 'words' column as a list
words_a = list(pd.read_csv('data/unique_words_a.csv')['words'])
words_a

In [None]:
# Sentences keyed by their position in texts_a
sentences_a_dict = dict(enumerate(texts_a))

In [None]:
# Number of neighbouring positions, on each side of a word, used as context
window = 2

# Accumulators filled while scanning the A-level sentences:
# word_lists_a collects the [word, context_word] pairs, all_sentences every token
word_lists_a, all_sentences = [], []

In [None]:
for raw_sentence in texts_a:

    # Clean the sentence into a sequence of tokens
    tokens = text_preprocessing(raw_sentence)

    # Keep every token in the running list of all tokens
    all_sentences.extend(tokens)

    # Pair each token with its neighbours up to `window` positions away
    n_tokens = len(tokens)
    for position, focus in enumerate(tokens):
        for offset in range(1, window + 1):
            # Neighbour `offset` positions ahead, if it exists
            ahead = position + offset
            if ahead < n_tokens:
                word_lists_a.append([focus, tokens[ahead]])
            # Neighbour `offset` positions behind, if it exists
            behind = position - offset
            if behind >= 0:
                word_lists_a.append([focus, tokens[behind]])


In [None]:
# Display the collected [word, context_word] pairs for inspection.
# NOTE(review): this renders the entire list; consider showing only a slice
# if the A-level dataset is large.
word_lists_a

In [None]:
# Words keyed by their position in words_a
words_a_dict = dict(enumerate(words_a))