In [5]:
#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

#other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

#plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

In [6]:
# Load the Yelp review dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path; the notebook cannot run on another machine without editing it.
df = pd.read_csv('/home/ashish/test/yelp.csv')

In [7]:
# Preview the first rows of the raw data to see which columns were loaded.
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [8]:
# Keep only the rating and the review text; every id/metadata column is discarded.
df = df.drop(
    labels=[
        'review_id', 'user_id', 'business_id', 'date',
        'useful', 'funny', 'cool', 'type',
    ],
    axis='columns',
)

In [9]:
# Confirm that only the remaining columns are left after the drop.
df.head()

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...


In [10]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis=0, how='any')

In [11]:
# Preview the frame again after removing rows with missing values.
df.head()

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...


In [12]:
# Summary statistics of the numeric columns (here: the star rating).
df.describe()

Unnamed: 0,stars
count,10000.0
mean,3.7775
std,1.214636
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [13]:
# Binary sentiment target: 1 when the integer star rating is above 3, otherwise 0.
labels = df['stars'].map(lambda stars: int(int(stars) > 3))

In [14]:
# Print the binary labels (pandas abbreviates the middle of a long Series).
print(labels)

0       1
1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      0
16      0
17      1
18      0
19      1
20      0
21      1
22      1
23      0
24      1
25      1
26      1
27      1
28      1
29      1
       ..
9970    1
9971    1
9972    1
9973    1
9974    0
9975    1
9976    0
9977    1
9978    1
9979    1
9980    1
9981    1
9982    1
9983    0
9984    0
9985    1
9986    1
9987    0
9988    1
9989    1
9990    1
9991    1
9992    1
9993    1
9994    1
9995    0
9996    1
9997    1
9998    0
9999    1
Name: stars, Length: 10000, dtype: int64


In [15]:
# Ordered (pattern, replacement) rewrite rules applied by clean_text.
# The order is significant: contractions are expanded before the bare
# apostrophe rule runs, and whitespace is collapsed last.
_CLEAN_TEXT_RULES = (
    # Inside the character class "+-=" is a range ('+' through '='), so digits
    # and the characters , - . / : ; < are kept in addition to the listed ones.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is the escape for a NUL character, so this rule looks
    # for NUL followed by "s". NUL has already been replaced by the first rule,
    # so it never matches. Presumably "0s" was intended - kept unchanged
    # pending confirmation.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)

# Holds the lazily built (stop-word set, stemmer) pair shared by all calls.
_CLEAN_TEXT_RESOURCES = []


def _clean_text_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES.append(
            (set(stopwords.words("english")), SnowballStemmer('english'))
        )
    return _CLEAN_TEXT_RESOURCES[0]


def clean_text(text):
    """Normalise one review for tokenisation.

    Steps, in order:
      1. lower-case and split on whitespace;
      2. drop English stop words and tokens shorter than three characters;
      3. apply the regex rewrite rules in ``_CLEAN_TEXT_RULES`` (replace
         unsupported characters, expand contractions, space out operators,
         collapse whitespace);
      4. stem every remaining token with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated stemmed tokens.

    Notes
    -----
    There is deliberately no ``str.translate`` step. Passing
    ``string.punctuation`` directly as the table does not delete punctuation:
    the string is indexed by code point, so characters 0-31 were *replaced*
    (newline became "+", tab "*", carriage return ".") and all other
    characters were left alone. That glued words across line breaks together
    before the stop-word filter ran. Punctuation is handled by the regex
    rules instead.

    NOTE(review): stop words are compared against raw whitespace tokens, so a
    stop word with punctuation attached (e.g. "is,") is not removed.
    """
    stops, stemmer = _clean_text_resources()

    ## Convert words to lower case and split them on any whitespace
    tokens = text.lower().split()

    ## Remove stop words and very short tokens
    tokens = [w for w in tokens if w not in stops and len(w) >= 3]
    text = " ".join(tokens)

    # Clean the text
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    # Stem what is left and return it as a single string
    return " ".join(stemmer.stem(word) for word in text.split())

In [16]:
# Replace every review with its cleaned, stemmed form.
df['text'] = df['text'].map(clean_text)

In [17]:
# Inspect the first ten cleaned reviews to check the text normalisation.
df.head(10)

Unnamed: 0,stars,text
0,5,wife took birthday breakfast excel weather per...
1,5,idea peopl give bad review place goe show you ...
2,4,love gyro plate rice good also dig candi select
3,5,rosi dakota love chaparr dog park ! ! ! conven...
4,5,general manag scott petello good egg ! ! ! det...
5,4,quiessenc is simpli put beauti full window ear...
6,5,drop drive here ate back next day more food go...
7,4,luckili travel far make connect flight this th...
8,4,definit come happi hour ! price amaz sake bomb...
9,5,nobuo show uniqu talent everyth menu care craf...


In [18]:
# Size of the vocabulary seen by the models (Tokenizer indices below this value are kept).
vocabulary_size = 20000

# Learn the word -> integer index mapping from the cleaned reviews.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(texts=df['text'])

# Encode each review as a list of word indices, then force a fixed length of 50.
# 'pre' is the library default for both options: short reviews are left-padded
# with 0 and long reviews keep only their last 50 tokens.
sequences = tokenizer.texts_to_sequences(texts=df['text'])
data = pad_sequences(sequences=sequences, maxlen=50, padding='pre', truncating='pre')

In [19]:
# Sanity check: one row per review, one column per padded token position.
print(data.shape)

(10000, 50)


In [20]:
# LSTM model: trainable embedding -> single LSTM layer -> sigmoid unit for the binary label.
model_lstm = Sequential()
# The input dimension comes from vocabulary_size (as in create_conv_model and
# model_glove) so it cannot drift from the Tokenizer's num_words.
model_lstm.add(Embedding(vocabulary_size, 100, input_length=50))
# Dropout is applied to both the inputs and the recurrent state of the LSTM.
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train the LSTM model for 3 epochs, holding out 30% of the samples for validation.
model_lstm.fit(
    x=data,
    y=np.asarray(labels),
    validation_split=0.3,
    epochs=3,
)

Train on 7000 samples, validate on 3000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb722add748>

In [24]:
 def create_conv_model():
    """Build and compile the Conv1D + LSTM binary classifier.

    Layer stack: trainable 100-d embedding over ``vocabulary_size`` indices
    (sequence length 50), dropout 0.2, 64-filter width-5 ReLU convolution,
    max-pooling with pool size 4, a 100-unit LSTM, and a single sigmoid unit.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    """
    network = Sequential([
        Embedding(vocabulary_size, 100, input_length=50),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(100),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [25]:
# Build a fresh Conv1D + LSTM model and train it with the same settings as the
# plain LSTM model (3 epochs, 30% of the samples held out for validation).
model_conv = create_conv_model()
model_conv.fit(
    x=data,
    y=np.asarray(labels),
    validation_split=0.3,
    epochs=3,
)

Train on 7000 samples, validate on 3000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb710fe3c18>

In [26]:
# Wrap the padded index sequences and the labels in DataFrames so they can be saved together.
# NOTE(review): neither frame is given column names, so both use default integer
# labels and the label column will share the name 0 with the first token column.
df_save = pd.DataFrame(data)
df_label = pd.DataFrame(np.array(labels))

In [27]:
# Put the label column to the right of the padded token-index columns.
result = pd.concat(objs=[df_save, df_label], axis='columns')

In [28]:
# Write the sequences plus label to a CSV in the current working directory, without the row index.
result.to_csv('train_dense_word_vectors.csv', index=False)

In [29]:
# Parse the pre-trained GloVe file into a {word: 100-d float32 vector} lookup.
# NOTE(review): absolute, machine-specific path.
embeddings_index = dict()
# The with-block closes the file even if a line fails to parse, and the explicit
# encoding keeps decoding independent of the platform's default locale.
with open('/home/ashish/test/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [30]:
# Create a weight matrix for words in the training docs: row i holds the GloVe
# vector of the word whose Tokenizer index is i. Words missing from GloVe keep
# an all-zero row.
# NOTE(review): the corpus was stemmed before tokenising, so stems that are not
# real words (e.g. "peopl") presumably miss in GloVe - confirm coverage.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    # Skip (rather than stop at) out-of-range indices so the result does not
    # depend on the dictionary iterating in ascending index order.
    if index >= vocabulary_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [31]:
# Conv1D + LSTM classifier whose embedding layer is initialised from
# embedding_matrix and frozen (trainable=False), so only the layers above it learn.
model_glove = Sequential([
    Embedding(
        vocabulary_size,
        100,
        input_length=50,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model_glove.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [32]:
# Train the GloVe-initialised model for 3 epochs.
# NOTE(review): 40% of the samples are held out here versus 30% for the other
# two models, so their validation scores are not computed on the same split.
model_glove.fit(
    x=data,
    y=np.asarray(labels),
    validation_split=0.4,
    epochs=3,
)

Train on 6000 samples, validate on 4000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb710098588>

In [33]:
# First weight array of the first layer (the Embedding layer) of the LSTM model.
lstm_embds = model_lstm.layers[0].get_weights()[0]

In [34]:
# First weight array of the first layer (the Embedding layer) of the Conv1D + LSTM model.
conv_embds = model_conv.layers[0].get_weights()[0]

In [35]:
# First weight array of the first layer (the frozen Embedding layer) of the GloVe model.
# NOTE(review): spelled "emds", unlike lstm_embds / conv_embds; a later cell uses this exact name.
glove_emds = model_glove.layers[0].get_weights()[0]

In [36]:
# Hover labels for the embedding plots; word_list[i] must name embedding row i.
# Tokenizer indices start at 1 and row 0 of an embedding matrix is the padding
# slot, so a placeholder is put at position 0 and the words follow in ascending
# index order. Without the placeholder every label is shifted by one row.
word_list = ['<PAD>'] + sorted(tokenizer.word_index, key=tokenizer.word_index.get)

In [37]:
def plot_words(data, start, stop, step):
    """Show an interactive scatter plot of 2-D points with word hover labels.

    Parameters
    ----------
    data : 2-D array-like
        Column 0 is used as the x coordinate and column 1 as the y coordinate.
    start, stop, step : int
        Slice bounds selecting which rows of ``data`` are drawn.

    The hover text is taken from the module-level ``word_list`` using the same
    slice, so ``word_list[i]`` has to describe row ``i`` of ``data``.
    """
    picked = slice(start, stop, step)
    markers = go.Scatter(
        x=data[picked, 0],
        y=data[picked, 1],
        mode='markers',
        text=word_list[picked],
    )
    axes_layout = {
        'title': 't-SNE 1 vs t-SNE 2',
        'yaxis': {'title': 't-SNE 2'},
        'xaxis': {'title': 't-SNE 1'},
        'hovermode': 'closest',
    }
    py.iplot({'data': [markers], 'layout': axes_layout})
In [38]:
# How many leading embedding rows the plots below display.
number_of_words = 2000
# Project the LSTM model's embeddings to 2-D with t-SNE.
# NOTE(review): every embedding row is projected although only the first
# number_of_words rows are plotted, and no random_state is set, so the layout
# differs from run to run.
lstm_tsne_embds = TSNE(n_components=2).fit_transform(lstm_embds)

In [39]:
# Plot the first number_of_words rows of the LSTM model's t-SNE projection.
plot_words(lstm_tsne_embds, 0, number_of_words, 1) 

In [40]:
# Project the Conv1D + LSTM model's embeddings to 2-D with t-SNE (no random_state is set).
conv_tsne_embds = TSNE(n_components=2).fit_transform(conv_embds)

In [41]:
# Plot the first number_of_words rows of the Conv1D + LSTM model's t-SNE projection.
plot_words(conv_tsne_embds, 0, number_of_words, 1)

In [42]:
# Project the GloVe model's frozen embedding weights to 2-D with t-SNE (no random_state is set).
glove_tsne_embds = TSNE(n_components=2).fit_transform(glove_emds)

In [60]:
# Plot the first number_of_words rows of the GloVe model's t-SNE projection.
# NOTE(review): the recorded output is a NameError and this cell's execution
# count is out of sequence, so it was apparently run in a kernel where the cell
# defining glove_tsne_embds had not been executed - re-run the notebook top to bottom.
plot_words(glove_tsne_embds, 0, number_of_words, 1)

NameError: name 'glove_tsne_embds' is not defined

In [57]:
# Dependencies for the Word2Vec section.
# NOTE(review): nltk is already imported in the first cell; these imports would
# normally live there too.
from gensim.models import Word2Vec
import nltk
# Fetch the 'punkt' tokenizer data (needs network access when it is not already installed).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ashish/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split every cleaned review into a list of word tokens for Word2Vec.
# Series.map feeds each review string straight to the tokenizer, avoiding the
# per-row Series construction of DataFrame.apply(axis=1) when only one column is read.
df['tokenized'] = df['text'].map(nltk.word_tokenize)

In [None]:
# Check the new 'tokenized' column next to the cleaned text.
df.head()

In [None]:
# Train 100-dimensional Word2Vec embeddings on the tokenized reviews.
# NOTE(review): `size=` is the keyword used by gensim releases before 4.0 -
# confirm the installed version before upgrading.
model_w2v = Word2Vec(df['tokenized'], size=100)

In [None]:
# Stack the vector of every vocabulary word into one matrix, one row per word in
# vocabulary iteration order. The lookup goes through model_w2v.wv because
# indexing the Word2Vec model object itself is deprecated.
X = model_w2v.wv[model_w2v.wv.vocab]

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Reduce the Word2Vec vectors to 5 components with truncated SVD.
# NOTE(review): `result` was used earlier for the DataFrame written to CSV and
# is overwritten here; only components 0 and 1 are plotted later.
tsvd = TruncatedSVD(n_components=5, n_iter=10)
result = tsvd.fit_transform(X)

In [None]:
# Shape of the SVD-reduced matrix.
result.shape

In [None]:
# Hover labels: the Word2Vec vocabulary in iteration order. X (and therefore
# result) was built by iterating the same vocabulary, so label i belongs to row i.
words = list(model_w2v.wv.vocab)
tsvd_word_list = list(words)

# Scatter the first two SVD components of the first number_of_words words.
trace = go.Scatter(
    x = result[0:number_of_words, 0],
    y = result[0:number_of_words, 1],
    mode = 'markers',
    text= tsvd_word_list[0:number_of_words]
)

layout = dict(title= 'SVD 1 vs SVD 2',
              yaxis = dict(title='SVD 2'),
              xaxis = dict(title='SVD 1'),
              hovermode= 'closest')

# The figure must be bound to the same name that is passed to iplot; binding it
# to a different name left `fig` undefined at notebook level.
fig = dict(data = [trace], layout= layout)
py.iplot(fig)