<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>GloVe-Yelp-Comments-Classification</H1></u></center>

In [None]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Build the session config in one expression: allow_growth is enabled on the
# GPU options and device-placement logging is switched on. The resulting
# session is registered with Keras through set_session().
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True),
    log_device_placement=True,
)
sess = tf.Session(config=config)
set_session(sess)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import string
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.manifold import TSNE
%matplotlib inline

## Load the data:

In [None]:
# Read the Yelp review CSV (path is relative to the notebook) and preview
# the first rows.
df = pd.read_csv('../data/yelp.csv')
df.head()

In [None]:
# Drop every row that has a missing value in any column, then keep only the
# review text and its star rating.
df = df.dropna()[['text', 'stars']]
df.head()

In [None]:
# Binary target: 1 when the (integer-cast) star rating is above 3, else 0.
labels = df['stars'].map(lambda stars: int(int(stars) > 3))
print(labels[10:20])

In [None]:
# The stop-word set and the stemmer do not depend on the input text, so they
# are built once here instead of on every clean_text() call.
_STOP_WORDS = set(stopwords.words("english"))
_STEMMER = SnowballStemmer('english')


def clean_text(text):
    """Normalize one review string before tokenization.

    Steps, in order: lower-case and split on whitespace; drop English stop
    words and tokens shorter than 3 characters; replace unsupported
    characters with spaces; expand common contractions; space out or remove
    punctuation; apply a few ad-hoc rewrites; stem every remaining token
    with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # The original started with `text.translate(string.punctuation)`. With a
    # plain str as the table, translate() only remaps code points 0-31
    # (tab -> '*', newline -> '+', carriage return -> '.') and leaves every
    # other character untouched, so it removed no punctuation and glued
    # words together across line breaks. The call is dropped: punctuation is
    # handled by the substitutions below, which need the apostrophes intact
    # to expand contractions.

    ## Convert words to lower case and split them
    tokens = text.lower().split()

    ## Remove stop words and very short tokens
    tokens = [w for w in tokens if w not in _STOP_WORDS and len(w) >= 3]

    text = " ".join(tokens)

    # Clean the text
    # NOTE: inside the character class `+-=` is a range ('+' through '='),
    # so ':', ';' and '<' are kept as well as '+', '-' and '='.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    # Expand contractions while the apostrophes are still present.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Remove separators and put spaces around the symbols that are kept.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)

    # Ad-hoc rewrites.
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # The original pattern r"\0s" is a NUL character followed by "s" and
    # could never match at this point; the intent is "90s" -> "90".
    text = re.sub(r"0s\b", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Stem each remaining token and rebuild the string.
    return " ".join(_STEMMER.stem(word) for word in text.split())

In [None]:
# Clean every review in place of the raw text; clean_text takes a single
# string, so it can be passed to map() directly.
df['text'] = df['text'].map(clean_text)

In [None]:
# Preview the first ten rows after cleaning.
df.head(10)

In [None]:
# Sequence length passed to pad_sequences and to the Embedding layer.
maxlen = 50
# Embedding width; matches the 100-dimensional GloVe file loaded below.
embed_dim = 100
# Vocabulary cap: Tokenizer num_words and the row count of the embedding matrix.
max_words = 20000

In [None]:
# Keras tokenizer limited to max_words via num_words.
tokenizer = Tokenizer(num_words=max_words)

In [None]:
# Build the tokenizer vocabulary from the cleaned review texts.
tokenizer.fit_on_texts(df['text'])

In [None]:
# Encode each cleaned review with the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(df['text'])

In [None]:
# Turn the sequences into a fixed-width array with pad_sequences
# (maxlen=maxlen, padding='post') and show the first row.
data = pad_sequences(sequences, maxlen=maxlen, padding='post')
data[0]

In [None]:
# Number of entries in the tokenizer's word_index, plus one.
# NOTE(review): vocab_size is only displayed here; the visible cells below
# size the embedding with max_words instead.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
# Convert the labels to a NumPy array; they are indexed with the shuffled
# index array further down.
labels = np.asarray(labels)

In [None]:
# Sanity check: print the shapes of the feature array and the label array.
print('Shape of data:', data.shape)
print('Shape of label:', labels.shape)

## Creating datasets:

In [None]:
# Fraction of the rows held out for validation.
validation_split = .2

In [None]:
# One index per row of data; shuffled in the next cell.
indices = np.arange(data.shape[0])

In [None]:
# Shuffle with a fixed seed so the train/validation split is the same after
# every Restart & Run All. A dedicated RandomState is used so NumPy's global
# random state is left untouched.
np.random.RandomState(42).shuffle(indices)

In [None]:
# Reorder features and labels with the same shuffled indices so rows stay
# aligned with their labels.
data = data[indices]
labels = labels[indices]

In [None]:
# Number of rows reserved for validation (fraction of the row count, truncated by int()).
val_samples = int(validation_split * data.shape[0])

In [None]:
# Index of the first validation row. Slicing from an explicit split point
# gives the same result as the negative slices for val_samples >= 1, and
# avoids the `[:-0]` pitfall: with val_samples == 0 the negative slices
# produced an empty training set and put every row in the validation set.
split_at = data.shape[0] - val_samples
X_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

## Load the GloVe embeddings

In [None]:
dir = '../data/GloVe/glove.6B'

In [None]:
embed_index = dict()
f = open(os.path.join(dir, 'glove.6B.100d.txt'), encoding="utf8")

In [None]:
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embed_index[word] = coefs
f.close()

In [None]:
# Report how many GloVe tokens were loaded.
print('%s Word vectors' % len(embed_index))

## Create a weight matrix:

In [None]:
# Row i of the matrix holds the GloVe vector of the token with tokenizer
# index i. Rows stay all-zero for indices with no GloVe entry; indices at or
# above max_words are skipped.
embed_matrix = np.zeros((max_words, embed_dim))

for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    vector = embed_index.get(word)
    if vector is None:
        continue
    embed_matrix[i] = vector

## Creating the model:

In [None]:
# Empty Sequential model; layers are added in the following cells.
model = Sequential()

In [None]:
# Embedding layer initialised from the GloVe weight matrix: max_words rows of
# embed_dim values, for inputs of length maxlen.
glove_layer = Embedding(input_dim=max_words,
                        output_dim=embed_dim,
                        weights=[embed_matrix],
                        input_length=maxlen)
model.add(glove_layer)

In [None]:
# Flatten the embedding output before the dense layers.
model.add(Flatten())

In [None]:
# Dropout with rate 0.5 on the flattened embeddings.
model.add(Dropout(0.5))

In [None]:
# Hidden dense layer: 32 units, ReLU activation.
model.add(Dense(32, activation='relu'))

In [None]:
# Second dropout (rate 0.5), applied after the hidden dense layer.
model.add(Dropout(0.5))

In [None]:
# Single sigmoid output unit for the binary label.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# EarlyStopping callback watching val_loss (mode='min') with patience=5.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

In [None]:
# Checkpoint callback writing to ../data/yelp_comments.hdf; it monitors
# val_loss (mode='min') with save_best_only enabled. The same path is
# reloaded after training.
save_best = ModelCheckpoint(
    '../data/yelp_comments.hdf',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

In [None]:
%%time
# Train for up to 20 epochs in batches of 128, validating on the held-out
# split; the early-stopping and checkpoint callbacks defined above are attached.
model.fit(X_train, y_train,
          epochs=20,
          validation_data=(x_val, y_val),
          batch_size=128,
          verbose=1,
          callbacks=[early_stopping, save_best])

## Making predictions:

In [None]:
# Reload weights from the file written by the ModelCheckpoint callback (same path).
model.load_weights(filepath = '../data/yelp_comments.hdf')

In [None]:
# Model outputs for the validation rows.
# NOTE(review): pred is not used again in the visible cells.
pred = model.predict(x_val)

## Word embeddings visualization:

In [None]:
# First weight array of the model's first layer (the Embedding layer added
# first above).
glove_embds = model.layers[0].get_weights()[0]

In [None]:
# words[i] must name the token stored in embedding row i. Tokenizer indices
# start at 1 (row 0 of the embedding is the padding index), so a list built
# by simply appending the word_index keys is shifted by one position and
# every plotted point gets its neighbour's label. Place each token at its
# own index instead, with a placeholder for row 0.
words = ['<pad>'] * (len(tokenizer.word_index) + 1)
for word, i in tokenizer.word_index.items():
    words[i] = word

## Visualizing words:

In [None]:
def plot_words(data, start, stop, step):
    """Scatter-plot a slice of 2-D points with plotly, labelled by word.

    Rows ``start:stop:step`` of ``data`` are plotted: column 0 on the x axis
    and column 1 on the y axis. Hover text comes from the same slice of the
    notebook-level ``words`` list.

    Parameters
    ----------
    data : 2-D array indexed as ``data[rows, column]``.
    start, stop, step : slice bounds applied to both ``data`` and ``words``.
    """
    window = slice(start, stop, step)

    trace = go.Scatter(
        x=data[window, 0],
        y=data[window, 1],
        mode='markers',
        text=words[window],
    )
    layout = {
        'title': 't-SNE_factor1 vs t-SNE_factor2',
        'yaxis': {'title': 't-SNE_factor2'},
        'xaxis': {'title': 't-SNE_factor1'},
        'hovermode': 'closest',
    }
    py.iplot({'data': [trace], 'layout': layout})

In [None]:
%%time
glove_tsne_embds = TSNE(n_components=2).fit_transform(glove_embds)
plot_words(glove_tsne_embds, 0, 100, 1)

## Reference:

GloVe: Global Vectors for Word Representation (Stanford NLP) — https://nlp.stanford.edu/projects/glove/