In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to C:\Users\Alaa
[nltk_data]     Elsherif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
df="In general terms, AI refers to computational tools that are able to substitute for human intelligence in the performance of certain tasks. This technology is currently advancing at a breakneck pace, much like the exponential growth experienced by database technology in the late twentieth century. Databases have grown to become the core infrastructure that drives enterprise-level software. Similarly, most of the new value added from software over the coming decades is expected to be driven, at least in part, by AI. Within the last decade, databases have evolved significantly in order to handle the new phenomenon dubbed “big data.” This refers to the unprecedented size and global scale of modern data sets, largely gathered from the computer systems that have come to mediate nearly every aspect of daily life. For instance, YouTube receives over 400 hours of video content each minute (Brouwer 2015)."

In [37]:
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(df)
print(sentences)
#sentences = [y for x in sentences for y in x] # flatten list

['In general terms, AI refers to computational tools that are able to substitute for human intelligence in the performance of certain tasks.', 'This technology is currently advancing at a breakneck pace, much like the exponential growth experienced by database technology in the late twentieth century.', 'Databases have grown to become the core infrastructure that drives enterprise-level software.', 'Similarly, most of the new value added from software over the coming decades is expected to be driven, at least in part, by AI.', 'Within the last decade, databases have evolved significantly in order to handle the new phenomenon dubbed “big data.” This refers to the unprecedented size and global scale of modern data sets, largely gathered from the computer systems that have come to mediate nearly every aspect of daily life.', 'For instance, YouTube receives over 400 hours of video content each minute (Brouwer 2015).']


In [38]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [39]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [40]:
len(word_embeddings)

400000

In [41]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [42]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Alaa
[nltk_data]     Elsherif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [44]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [45]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [46]:
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [47]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [48]:
from sklearn.metrics.pairwise import cosine_similarity


In [49]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [50]:
print(sim_mat[1][2])

0.8317020535469055


In [51]:
import networkx as nx
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [52]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [54]:
# Extract top 2 sentences as the summary
for i in range(2):
    print(ranked_sentences[i][1])

Within the last decade, databases have evolved significantly in order to handle the new phenomenon dubbed “big data.” This refers to the unprecedented size and global scale of modern data sets, largely gathered from the computer systems that have come to mediate nearly every aspect of daily life.
Similarly, most of the new value added from software over the coming decades is expected to be driven, at least in part, by AI.
