In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# Toy corpus: six short review sentences that the rest of the notebook ranks.
sentence = [
    'great food and crunchy taste',
    'food was delicious and would recommend this to everyone',
    'great food was delicious and would recommend this to everyone',
    'great food and would recommend this to everyone',
    'food and crunchy taste',
    'food was delicious and crunchy taste',
]

In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2019-08-20 05:29:24--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-08-20 05:29:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-08-20 05:29:30--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-0

In [0]:
# Extract word vectors
# Each line is split on whitespace: the first field is the word, the remaining
# fields are parsed as its float32 coefficients.
word_embeddings = {}
# `with` closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [6]:
len(word_embeddings)

400000

In [0]:
# remove punctuations, numbers and special characters
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, in which case "[^a-zA-Z]" never matches and nothing is cleaned.
clean_sentences = pd.Series(sentence).str.replace("[^a-zA-Z]", " ", regex=True)

# make alphabets lowercase (the result is a plain Python list of strings)
clean_sentences = [s.lower() for s in clean_sentences]

In [8]:
# Download the NLTK stopwords corpus that nltk.corpus.stopwords reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from nltk.corpus import stopwords
# English stopword list; remove_stopwords() filters tokens against this global.
stop_words = stopwords.words('english')

In [0]:
# function to remove stopwords
def remove_stopwords(sen):
    """Drop stopwords from a tokenised sentence.

    Args:
        sen: iterable of word tokens.

    Returns:
        A single string made of the tokens that are not in the module-level
        ``stop_words`` collection, in their original order, joined by spaces.
        An empty string is returned when no token survives.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [0]:
# Split each cleaned sentence on whitespace and strip its stopwords.
clean_sentences = list(
    map(lambda text: remove_stopwords(text.split()), clean_sentences)
)

In [0]:
# Extract word vectors
# NOTE(review): this cell repeats the earlier "Extract word vectors" cell and
# re-parses the whole file; one of the two can be removed.
word_embeddings = {}
# Context manager so the handle is released even when a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word field; skip it instead of raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [0]:
# One 100-dimensional vector per cleaned sentence: the sum of its word vectors
# divided by (word count + 0.001). Words missing from word_embeddings count as
# zero vectors; an empty sentence maps to the zero vector.
sentence_vectors = []
for text in clean_sentences:
    if not text:
        vec = np.zeros((100,))
    else:
        tokens = text.split()
        token_vecs = [word_embeddings.get(tok, np.zeros((100,))) for tok in tokens]
        vec = sum(token_vecs) / (len(tokens) + 0.001)
    sentence_vectors.append(vec)

In [0]:
# similarity matrix: one row and one column per input sentence, initialised to zero
sim_mat = np.zeros(shape=(len(sentence), len(sentence)))

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
# Fill every off-diagonal entry with the cosine similarity of the two sentence
# vectors; the diagonal keeps its initial value.
n_sentences = len(sentence)
for row in range(n_sentences):
    for col in range(n_sentences):
        if row == col:
            continue
        left = sentence_vectors[row].reshape(1, 100)
        right = sentence_vectors[col].reshape(1, 100)
        sim_mat[row][col] = cosine_similarity(left, right)[0, 0]

In [0]:
import networkx as nx

# Build a graph from the pairwise similarity matrix and run PageRank on it.
nx_graph = nx.from_numpy_array(sim_mat)
# `scores` is looked up by sentence position (scores[i]) when ranking below.
scores = nx.pagerank(nx_graph)

In [0]:
# Pair every sentence with its PageRank score, then order the pairs from the
# highest score to the lowest (ties fall back to comparing the sentence text).
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentence)]
ranked_sentences.sort(reverse=True)

In [41]:
# Sanity check: similarity of sentences 0 and 1 computed directly, displayed next to the full matrix.
cosine_similarity(sentence_vectors[0].reshape(1,100), sentence_vectors[1].reshape(1,100))[0,0], sim_mat

(0.8138783, array([[0.        , 0.8138783 , 0.84460819, 0.76167661, 0.94806373,
         0.9319489 ],
        [0.8138783 , 0.        , 0.98153341, 0.95419735, 0.7524004 ,
         0.73831457],
        [0.84460819, 0.98153341, 0.        , 0.97895247, 0.72885811,
         0.7157914 ],
        [0.76167661, 0.95419735, 0.97895247, 0.        , 0.62252635,
         0.58228189],
        [0.94806373, 0.7524004 , 0.72885811, 0.62252635, 0.        ,
         0.982256  ],
        [0.9319489 , 0.73831457, 0.7157914 , 0.58228189, 0.982256  ,
         0.        ]]))

In [45]:
# Display the (score, sentence) pairs, highest score first.
ranked_sentences

[(0.1733333707241193, 'great food and crunchy taste'),
 (0.17152750303169678,
  'great food was delicious and would recommend this to everyone'),
 (0.17119901942264934,
  'food was delicious and would recommend this to everyone'),
 (0.16390921893112884, 'food and crunchy taste'),
 (0.16094013443409155, 'food was delicious and crunchy taste'),
 (0.159090753456314, 'great food and would recommend this to everyone')]

In [44]:
# Print the top-ranked sentences as the summary (at most 6). Slicing is used
# instead of indexing over range(6) so that a corpus with fewer than 6 ranked
# sentences prints what is available rather than raising IndexError.
for entry in ranked_sentences[:6]:
  print(entry[1])

great food and crunchy taste
great food was delicious and would recommend this to everyone
food was delicious and would recommend this to everyone
food and crunchy taste
food was delicious and crunchy taste
great food and would recommend this to everyone
