In [11]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adwivedi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load the tab-separated reviews file; keep_default_na=False stops pandas from
# converting its default NA marker strings (e.g. empty fields) into NaN.
reviews_path = r"E:\work\Learning\Stanford\project\proposal\baseline\HamptonReviews.tsv"
df = pd.read_csv(
    reviews_path,
    header=0,
    delimiter='\t',
    encoding='utf-8',
    keep_default_na=False,
)

In [6]:
# Preview the first four rows of the loaded reviews as plain text.
print(df.head(4))

                                          ReviewText
0  Hotel is in the perfect spot at the perfect pr...
1  Excellent experience. Will come again and book...
2  heat in room did not work properly, tv remote ...
3  Even though we were having problems, i.e. Feat...


In [7]:
from nltk.tokenize import sent_tokenize
# Split every review into sentences and collect them directly in one flat list.
sentences = []
for review in df['ReviewText']:
    sentences.extend(sent_tokenize(review))

In [8]:
# Show the first five tokenized sentences.
sentences[:5]

['Hotel is in the perfect spot at the perfect price with the not so perfect view.',
 'Expedia rep said that it was facing the water and in a way it was but at a 45 degree angel past the parking lot and a few other obstructions.',
 'You have to stretch your neck at the furthest corner of the window to gain a peak.',
 'But it was still good though.',
 "The room sleeps like nobody's business."]

In [12]:
# Extract word vectors
# Each non-blank line is parsed as: first whitespace-separated field = word,
# remaining fields = its vector components (stored as float32).
word_embeddings = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(r"E:\work\Learning\Stanford\project\proposal\baseline\glove.6B\glove.6B.100d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [13]:
# Number of distinct words loaded into the embedding lookup.
len(word_embeddings)

400000

In [14]:
# remove punctuations, numbers and special characters
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every sentence uncleaned.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [15]:
# Download the NLTK 'stopwords' corpus (needed by stopwords.words below).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adwivedi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
from nltk.corpus import stopwords
# English stop words from NLTK's stopwords corpus; read as a global by remove_stopwords.
stop_words = stopwords.words('english')

In [17]:
# function to remove stopwords
def remove_stopwords(sen):
    """Return the tokens of `sen` that are not in the global `stop_words`,
    joined into a single space-separated string.

    `sen` is an iterable of tokens; an input with no surviving tokens
    yields the empty string.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [18]:
# remove stopwords from the sentences
# Each cleaned sentence is split on whitespace, filtered, and re-joined.
stopword_free = []
for text in clean_sentences:
    stopword_free.append(remove_stopwords(text.split()))
clean_sentences = stopword_free

In [19]:
# One 100-dimensional vector per cleaned sentence: the sum of its word
# embeddings (zeros for unknown words) divided by (word count + 0.001).
# Sentences that are empty after cleaning get an all-zero vector.
sentence_vectors = []
for sentence in clean_sentences:
    if len(sentence) == 0:
        vector = np.zeros((100,))
    else:
        tokens = sentence.split()
        total = sum([word_embeddings.get(token, np.zeros((100,))) for token in tokens])
        vector = total/(len(tokens)+0.001)
    sentence_vectors.append(vector)

In [20]:
# similarity matrix
# Square, zero-initialised, with one row and one column per sentence.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Pairwise cosine similarity between all sentence vectors, computed in a
# single vectorized call instead of one scikit-learn call per (i, j) pair.
# The guard keeps the empty-corpus case a no-op, as the original loops were.
if len(sentence_vectors) > 0:
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    # Self-similarity is excluded, matching the original `i != j` condition.
    np.fill_diagonal(sim_mat, 0.0)

In [23]:
import networkx as nx

# Build a graph from the similarity matrix and run PageRank on it;
# `scores` is later indexed by sentence position.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [24]:
# Pair every sentence with its PageRank score, then order the pairs
# from highest to lowest (ties fall back to comparing the sentence text).
ranked_sentences = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [27]:
# Print the ten highest-ranked sentences (fewer if the corpus is smaller).
# Slicing avoids the IndexError that indexing positions 0..9 raises when
# there are fewer than ten sentences.
for _score, sentence in ranked_sentences[:10]:
    print(sentence)

The continental breakfast was great, but they should consider expanding their hours or at least allowing guests to finish the food that is out at 10, because every morning the food was taken away promptly at 10:00 even though the area was packed with guests still trying to get breakfast.
The hotel offers free breakfast that was really good ( not the usual crappy free breakfast) The only thing I didn't like about the room was that the air conditioner was regulated a motion sensor.
Due to a mix-up on my part with the online travel service it was set up as a king rather than two queen beds, for two guys.
The fee for parking in the community garage was free with room key and even included a convenient room key drop upon final exit.
I immediately called at the time and Hampton said they would make a note that a room with two queens was needed.
clean, good area, very kind crew, easy to park
When i got there, they didn't have a 2-queen room so they put me up in another king room (and my buddy