In [1]:
from nltk.tokenize.punkt import PunktSentenceTokenizer
# Sample passage used as the summarisation input for the rest of the notebook.
# The literal is hard-wrapped, so it carries a newline at the end of every
# source line, including one after the final sentence.
# NOTE(review): "observer-excellent" looks like two words joined by a lost
# dash; confirm against the source text before changing it, because the
# string content affects tokenisation.
document = """To Sherlock Holmes she is always the woman. I have
seldom heard him mention her under any other name. In his eyes she
eclipses and predominates the whole of her sex. It was not that he
felt any emotion akin to love for Irene Adler. All emotions, and that
one particularly, were abhorrent to his cold, precise but admirably
balanced mind. He was, I take it, the most perfect reasoning and
observing machine that the world has seen, but as a lover he would
have placed himself in a false position. He never spoke of the softer
passions, save with a gibe and a sneer. They were admirable things for
the observer-excellent for drawing the veil from men’s motives and
actions. But for the trained reasoner to admit such intrusions into
his own delicate and finely adjusted temperament was to introduce a
distracting factor which might throw a doubt upon all his mental
results. Grit in a sensitive instrument, or a crack in one of his own
high-power lenses, would not be more disturbing than a strong emotion
in a nature such as his. And yet there was but one woman to him, and
that woman was the late Irene Adler, of dubious and questionable
memory.
"""

In [5]:
# Collapse the hard-wrapped passage onto one line: surrounding whitespace is
# stripped and every remaining newline becomes a single space.
document = document.strip().replace('\n', ' ')

# Split the flattened text into sentences with a Punkt tokenizer constructed
# without any training text.
sentence_tokenizer = PunktSentenceTokenizer()
sentences = sentence_tokenizer.tokenize(document)
sentences

['To Sherlock Holmes she is always the woman.',
 'I have seldom heard him mention her under any other name.',
 'In his eyes she eclipses and predominates the whole of her sex.',
 'It was not that he felt any emotion akin to love for Irene Adler.',
 'All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.',
 'He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position.',
 'He never spoke of the softer passions, save with a gibe and a sneer.',
 'They were admirable things for the observer-excellent for drawing the veil from men’s motives and actions.',
 'But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results.',
 'Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Sanity check on a single sentence: fit a fresh bag-of-words vectorizer on
# just the first sentence and display its counts as a dense row.
c = CountVectorizer()
bow_array = c.fit_transform([sentences[0]])  # wrapped in a list: one document
bow_array.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [7]:
# Fit a new vectorizer on every sentence; the resulting sparse matrix has one
# row per sentence and one column per vocabulary term.
c = CountVectorizer()  # rebinds `c`, replacing the single-sentence vectorizer
bow_matrix = c.fit_transform(sentences)
bow_matrix

<11x127 sparse matrix of type '<class 'numpy.int64'>'
	with 183 stored elements in Compressed Sparse Row format>

In [11]:
# Now we have a matrix where the rows are sentences and the columns are words.
# We need to transform this into a graph relating the sentences to each other.
# To do this, we'll first normalize our matrix using scikit-learn's TfidfTransformer.
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts with TF-IDF using the transformer's default
# settings. The result keeps the same shape and sparsity pattern as
# `bow_matrix`, but holds floats instead of integer counts.
normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
normalized_matrix

<11x127 sparse matrix of type '<class 'numpy.float64'>'
	with 183 stored elements in Compressed Sparse Row format>

In [13]:
# Sentence-to-sentence similarity: entry (i, j) is the dot product of the
# TF-IDF rows of sentences i and j.
# `@` is used instead of `*` because `*` only means matrix multiplication for
# scipy.sparse *matrix* objects; for sparse *array* objects it is element-wise
# and would fail on these (11x127 vs 127x11) shapes. `@` is a matrix product
# for both container kinds, so the result is unchanged here.
similarity_graph = normalized_matrix @ normalized_matrix.T
similarity_graph.toarray()  # dense 11x11 view, for inspection only

array([[ 1.        ,  0.        ,  0.13737879,  0.04767903,  0.04305016,
         0.04345599,  0.03330044,  0.05261648,  0.07798958,  0.        ,
         0.20047419],
       [ 0.        ,  1.        ,  0.0842143 ,  0.07819597,  0.        ,
         0.05171612,  0.        ,  0.        ,  0.        ,  0.        ,
         0.05807146],
       [ 0.13737879,  0.0842143 ,  1.        ,  0.        ,  0.07004069,
         0.09648614,  0.1069042 ,  0.06701793,  0.09437203,  0.20474295,
         0.1197599 ],
       [ 0.04767903,  0.07819597,  0.        ,  1.        ,  0.07558987,
         0.18678911,  0.05853972,  0.09249592,  0.10892262,  0.09110741,
         0.24159019],
       [ 0.04305016,  0.        ,  0.07004069,  0.07558987,  1.        ,
         0.07055583,  0.02370685,  0.07272032,  0.17253418,  0.08262451,
         0.17789849],
       [ 0.04345599,  0.05171612,  0.09648614,  0.18678911,  0.07055583,
         1.        ,  0.12952649,  0.06859301,  0.06837492,  0.13015945,
         0.154

In [14]:
# With a graph of sentences, we can use PageRank to score them.
# To do this, we'll use the pagerank function from NetworkX.
import networkx as nx

# `from_scipy_sparse_matrix` was removed in NetworkX 3.0; its replacement,
# `from_scipy_sparse_array`, is available from 2.7 onwards. Prefer the new
# name and fall back to the old one so the cell runs on either side of the
# change.
_sparse_to_graph = getattr(nx, "from_scipy_sparse_array", None)
if _sparse_to_graph is None:
    _sparse_to_graph = nx.from_scipy_sparse_matrix

# NOTE(review): the diagonal of `similarity_graph` is 1.0, so presumably every
# sentence ends up with a self-loop in the graph; confirm that is intended.
nx_graph = _sparse_to_graph(similarity_graph)

# Maps each sentence index (row number in `similarity_graph`) to its score.
scores = nx.pagerank(nx_graph)
scores


{0: 0.083850085970473526,
 1: 0.075281441703240984,
 2: 0.098385602908782135,
 3: 0.097442706690059666,
 4: 0.089265664661431293,
 5: 0.098256154950726426,
 6: 0.082611226442511093,
 7: 0.082718453245989129,
 8: 0.094336171635092891,
 9: 0.086368227893767843,
 10: 0.11148426389792518}

In [15]:
# Pair every sentence with its PageRank score, then order the pairs from
# highest to lowest score (equal scores fall back to comparing sentence text).
ranked = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
ranked.sort(reverse=True)

# Text of the top-ranked sentence.
ranked[0][1]

'And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory.'

In [16]:
# Show the five highest-scoring (score, sentence) pairs.
ranked[:5]

[(0.11148426389792518,
  'And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory.'),
 (0.098385602908782135,
  'In his eyes she eclipses and predominates the whole of her sex.'),
 (0.098256154950726426,
  'He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position.'),
 (0.097442706690059666,
  'It was not that he felt any emotion akin to love for Irene Adler.'),
 (0.094336171635092891,
  'But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results.')]