Use spacy to segment sentences
ceshine committed Apr 25, 2019
1 parent 9ceefca commit 9da9172
Showing 5 changed files with 54 additions and 28 deletions.
1 change: 1 addition & 0 deletions Dockerfile.cpu
@@ -3,6 +3,7 @@ COPY . /src
WORKDIR /src
RUN pip install -U pip && rm -rf ~/.cache/pip
RUN pip install -r requirements.txt && rm -rf ~/.cache/pip
RUN python -m spacy download en_core_web_sm

EXPOSE 8000

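The added RUN line bakes the spaCy English model into the image at build time. A minimal sketch (not part of this commit) for sanity-checking that the downloaded model loads and segments sentences inside the container:

# sanity-check sketch: the model name matches the one downloaded above
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is one sentence. This is another one.")
print([sent.text for sent in doc.sents])  # expect two sentences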
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,4 +1,5 @@
https://github.com/ceshine/textrank/archive/20190425.zip
spacy==2.0.18
opencc-python-reimplemented
starlette==0.11.4
jinja2
15 changes: 8 additions & 7 deletions summa_score_sentences.py
@@ -1,7 +1,8 @@
"""Using similarity function from the original TextRank algorithm."""
from langdetect import detect
from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank
from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
# from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from text_cleaning_en import clean_text_by_sentences as en_clean_text_by_sentences
from summa.commons import build_graph as _build_graph
from summa.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from summa.summarizer import _set_graph_edge_weights, _add_scores_to_sentences
@@ -31,8 +32,10 @@ def summarize(text, additional_stopwords=None):
for i, paragraph in enumerate(paragraphs):
# Gets a list of processed sentences.
if paragraph:
tmp = _clean_text_by_sentences(
paragraph, "english", additional_stopwords)
tmp = en_clean_text_by_sentences(
paragraph, additional_stopwords)
# tmp = _clean_text_by_sentences(
# paragraph, "english")
for sent in tmp:
sent.paragraph = i
sentences += tmp
@@ -47,10 +50,9 @@ def summarize(text, additional_stopwords=None):
else:
return ["Language not suppored! (supported languages: en, zh, ja)"], None, lang

# print([sentence.token for sentence in sentences if sentence.token])
# Creates the graph and calculates the similarity coefficient for every pair of nodes.
graph = _build_graph(
[sentence.token for sentence in sentences if sentence.token])
[sentence.token for sentence in sentences if sentence.token and len(sentence.token) > 2])
_set_graph_edge_weights(graph)

# Remove all nodes with all edges weights equal to zero.
@@ -68,7 +70,6 @@ def summarize(text, additional_stopwords=None):

# Sorts the sentences
sentences.sort(key=lambda s: s.score, reverse=True)

return sentences, graph, lang


@@ -82,4 +83,4 @@ def summarize(text, additional_stopwords=None):
Cohen, has undertaken perhaps the most surprising and risky legal strategy.""")
assert lang == "en"
for row in res:
print(row)
print(f"{row.score} {row.text}")
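For context, a hedged usage sketch of the updated summa_score_sentences.summarize; the input file name and the 0.3 ratio are illustrative, not part of the repository:

# usage sketch: keep the top-scoring third of the sentences (illustrative ratio)
from summa_score_sentences import summarize

text = open("article.txt").read()        # hypothetical input file
sentences, graph, lang = summarize(text)
assert lang == "en"
top_n = max(1, int(len(sentences) * 0.3))
for sent in sentences[:top_n]:           # already sorted by score, descending
    print(f"{sent.score:.4f} {sent.text}")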
46 changes: 25 additions & 21 deletions summa_score_sentences_use.py
@@ -8,7 +8,7 @@
import tensorflow_hub as hub
from langdetect import detect
from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank
from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from text_cleaning_en import clean_text_by_sentences as _clean_text_by_sentences
from summa.commons import build_graph as _build_graph
from summa.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from summa.summarizer import _set_graph_edge_weights, _add_scores_to_sentences
@@ -60,32 +60,35 @@ def cosine_similarity(similarity_matrix, id_1, id_2):

def attach_setence_embeddings(sentences, model_name, batch_size=32):
model = get_model(model_name)
# remove extremely short sentences
sentence = [x for x in sentences if len(x.text) > 5]
sentence_embeddings = []
# don't use extremely short sentences
sentences_subset = [x for x in sentences if len(x.text) > 5]
sentence_embeddings_tmp = []
with tf.Session() as session:
session.run([tf.global_variables_initializer(),
tf.tables_initializer()])
for i in range(0, len(sentences), batch_size):
sentence_embeddings.append(
for i in range(0, len(sentences_subset), batch_size):
sentence_embeddings_tmp.append(
session.run(
model["sentence_emb"],
feed_dict={
model["sentence_input"]: [
x.text for x in sentences[i:(i+batch_size)]
x.text for x in sentences_subset[i:(i+batch_size)]
]
}
)
)
sentence_embeddings = np.concatenate(sentence_embeddings, axis=0)
# A rather hacky way to attach embeddings and replace token property
for i, sentence in enumerate(sentences):
sentence.embeddings = sentence_embeddings[i]
sentence.token = i
sentence_embeddings_subset = np.concatenate(
sentence_embeddings_tmp, axis=0)
sentence_embeddings = np.zeros(
(len(sentences), sentence_embeddings_subset.shape[1]), dtype="float32")
# A rather hacky way to attach embeddings
for i, sentence in enumerate(sentences_subset):
sentence_embeddings[sentence.token, :] = (
sentence_embeddings_subset[i, :])
similarities = sentence_embeddings @ sentence_embeddings.T
# print(similarities[np.tril_indices(similarities.shape[0], k=-1)])
# print(np.where(np.tril(similarities, k=-1) > 0.95))
return sentences, similarities
return similarities


def summarize(text, model_name="large", additional_stopwords=None):
@@ -99,9 +102,11 @@ def summarize(text, model_name="large", additional_stopwords=None):
# Gets a list of processed sentences.
if paragraph:
tmp = _clean_text_by_sentences(
paragraph, "english", additional_stopwords)
for sent in tmp:
paragraph, additional_stopwords)
for j, sent in enumerate(tmp):
sent.paragraph = i
# Hacky way to overwrite token
sent.token = len(sentences) + j
sentences += tmp
elif lang == "zh" or lang == "ko": # zh-Hant sometimes got misclassified into ko
raise NotImplementedError("Not supported yet: zh.")
@@ -112,9 +117,9 @@ def summarize(text, model_name="large", additional_stopwords=None):

# print([sentence.token for sentence in sentences if sentence.token])
# Creates the graph and calculates the similarity coefficient for every pair of nodes.
sentences, similarities = attach_setence_embeddings(
similarities = attach_setence_embeddings(
sentences, batch_size=32, model_name=model_name)
graph = _build_graph(list(range(len(sentences))))
graph = _build_graph([x.token for x in sentences])
_set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

# Remove all nodes with all edges weights equal to zero.
@@ -136,11 +141,10 @@ def summarize(text, model_name="large", additional_stopwords=None):


if __name__ == "__main__":
res, _, lang = summarize("""Of all of President Trump’s former associates who have come under scrutiny in the special counsel’s Russia investigation, his former personal lawyer, Michael D. Cohen, has undertaken perhaps the most surprising and risky legal strategy.
res, _, lang = summarize("""
By Wednesday morning, he was trying to marry those two thoughts into a single message — both embracing the report and trashing it. “The Mueller Report, despite being written by Angry Democrats and Trump Haters, and with unlimited money behind it ($35,000,000), didn’t lay a glove on me,” he wrote. “I DID NOTHING WRONG.”
Mr. Cohen has twice pleaded guilty in federal court in Manhattan to a litany of crimes, and he has volunteered information to the special counsel and other agencies investigating Mr. Trump and his inner circle. He did all this without first obtaining a traditional, ironclad deal under which the government would commit to seeking leniency on Mr. Cohen’s behalf when he is sentenced on Dec. 12.
Mr. Cohen has concluded that his life has been utterly destroyed by his relationship with Mr. Trump and his own actions, and to begin anew he needed to speed up the legal process by quickly confessing his crimes and serving any sentence he receives, according to his friends and associates, and analysis of documents in the case.""")
In subsequent tweets, he tried again to claim victory amid his victimhood, casting the investigation as a contest in which he prevailed. In terms rarely used regarding a criminal investigation, he asserted that “We waited for Mueller and WON” and denounced “the Witch Hunt, which I have already won.”""")
assert lang == "en"
for row in res:
print(f"{row.score} {row.text}")
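The reworked attach_setence_embeddings keeps every sentence addressable by overwriting its token field with a global index and leaving all-zero embedding rows for sentences that were too short to encode; those rows yield zero-weight edges and are later dropped along with the other unreachable nodes. A small numpy sketch (made-up shapes, independent of the Universal Sentence Encoder) of that bookkeeping:

# sketch of the embedding bookkeeping (made-up shapes, not the real model)
import numpy as np

num_sentences, dim = 6, 4
kept = [0, 1, 3, 5]                       # indices of sentences long enough to embed
subset_emb = np.random.rand(len(kept), dim).astype("float32")

embeddings = np.zeros((num_sentences, dim), dtype="float32")
embeddings[kept, :] = subset_emb          # skipped sentences stay all-zero

similarities = embeddings @ embeddings.T  # pairwise dot-product similarities
print(similarities.shape)                 # (6, 6); zero rows give zero-weight edges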
19 changes: 19 additions & 0 deletions text_cleaning_en.py
@@ -0,0 +1,19 @@
import spacy

from summa.preprocessing.textcleaner import init_textcleanner, filter_words, merge_syntactic_units

NLP = spacy.load("en_core_web_sm")


def split_sentences(text):
doc = NLP(text)
return [sent.text.strip() for sent in doc.sents]


def clean_text_by_sentences(text, additional_stopwords=None):
"""Tokenizes a given text into sentences, applying filters and lemmatizing them.
Returns a SyntacticUnit list. """
init_textcleanner("english", additional_stopwords)
original_sentences = split_sentences(text)
filtered_sentences = filter_words(original_sentences)
return merge_syntactic_units(original_sentences, filtered_sentences)
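A short usage sketch of the new module (assuming the pinned summa fork from requirements.txt is installed; the sample text is arbitrary):

# usage sketch for text_cleaning_en (sample text is arbitrary)
from text_cleaning_en import split_sentences, clean_text_by_sentences

text = "Dr. Smith arrived at 9 a.m. She left before noon."
print(split_sentences(text))            # sentence boundaries come from spaCy's parser

for unit in clean_text_by_sentences(text):
    print(unit.text, "->", unit.token)  # original sentence and its filtered tokens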
