Use spacy to segment sentences
ceshine committed Apr 25, 2019
1 parent 9ceefca commit 9da9172
Showing 5 changed files with 54 additions and 28 deletions.
1 change: 1 addition & 0 deletions Dockerfile.cpu
@@ -3,6 +3,7 @@ COPY . /src
WORKDIR /src
RUN pip install -U pip && rm -rf ~/.cache/pip
RUN pip install -r requirements.txt && rm -rf ~/.cache/pip
RUN python -m spacy download en_core_web_sm

EXPOSE 8000

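The added RUN line bakes the spaCy English model into the image at build time. A minimal sketch (not part of this commit) for sanity-checking that the downloaded model loads and segments sentences inside the container:

# sanity-check sketch: the model name matches the one downloaded above
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is one sentence. This is another one.")
print([sent.text for sent in doc.sents])  # expect two sentences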
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,4 +1,5 @@
https://github.com/ceshine/textrank/archive/20190425.zip
spacy==2.0.18
opencc-python-reimplemented
starlette==0.11.4
jinja2
15 changes: 8 additions & 7 deletions summa_score_sentences.py
@@ -1,7 +1,8 @@
"""Using similarity function from the original TextRank algorithm."""
from langdetect import detect
from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank
from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
# from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from text_cleaning_en import clean_text_by_sentences as en_clean_text_by_sentences
from summa.commons import build_graph as _build_graph
from summa.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from summa.summarizer import _set_graph_edge_weights, _add_scores_to_sentences
@@ -31,8 +32,10 @@ def summarize(text, additional_stopwords=None):
for i, paragraph in enumerate(paragraphs):
# Gets a list of processed sentences.
if paragraph:
tmp = _clean_text_by_sentences(
paragraph, "english", additional_stopwords)
tmp = en_clean_text_by_sentences(
paragraph, additional_stopwords)
# tmp = _clean_text_by_sentences(
# paragraph, "english")
for sent in tmp:
sent.paragraph = i
sentences += tmp
@@ -47,10 +50,9 @@ def summarize(text, additional_stopwords=None):
else:
return ["Language not suppored! (supported languages: en, zh, ja)"], None, lang

# print([sentence.token for sentence in sentences if sentence.token])
# Creates the graph and calculates the similarity coefficient for every pair of nodes.
graph = _build_graph(
[sentence.token for sentence in sentences if sentence.token])
[sentence.token for sentence in sentences if sentence.token and len(sentence.token) > 2])
_set_graph_edge_weights(graph)

# Remove all nodes with all edges weights equal to zero.
@@ -68,7 +70,6 @@ def summarize(text, additional_stopwords=None):

# Sorts the sentences
sentences.sort(key=lambda s: s.score, reverse=True)

return sentences, graph, lang


@@ -82,4 +83,4 @@ def summarize(text, additional_stopwords=None):
Cohen, has undertaken perhaps the most surprising and risky legal strategy.""")
assert lang == "en"
for row in res:
print(row)
print(f"{row.score} {row.text}")
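For context, a hedged usage sketch of the updated summa_score_sentences.summarize; the input file name and the 0.3 ratio are illustrative, not part of the repository:

# usage sketch: keep the top-scoring third of the sentences (illustrative ratio)
from summa_score_sentences import summarize

text = open("article.txt").read()        # hypothetical input file
sentences, graph, lang = summarize(text)
assert lang == "en"
top_n = max(1, int(len(sentences) * 0.3))
for sent in sentences[:top_n]:           # already sorted by score, descending
    print(f"{sent.score:.4f} {sent.text}")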
46 changes: 25 additions & 21 deletions summa_score_sentences_use.py
@@ -8,7 +8,7 @@
import tensorflow_hub as hub
from langdetect import detect
from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank
from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from text_cleaning_en import clean_text_by_sentences as _clean_text_by_sentences
from summa.commons import build_graph as _build_graph
from summa.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from summa.summarizer import _set_graph_edge_weights, _add_scores_to_sentences
@@ -60,32 +60,35 @@ def cosine_similarity(similarity_matrix, id_1, id_2):

def attach_setence_embeddings(sentences, model_name, batch_size=32):
model = get_model(model_name)
# remove extremely short sentences
sentence = [x for x in sentences if len(x.text) > 5]
sentence_embeddings = []
# don't use extremely short sentences
sentences_subset = [x for x in sentences if len(x.text) > 5]
sentence_embeddings_tmp = []
with tf.Session() as session:
session.run([tf.global_variables_initializer(),
tf.tables_initializer()])
for i in range(0, len(sentences), batch_size):
sentence_embeddings.append(
for i in range(0, len(sentences_subset), batch_size):
sentence_embeddings_tmp.append(
session.run(
model["sentence_emb"],
feed_dict={
model["sentence_input"]: [
x.text for x in sentences[i:(i+batch_size)]
x.text for x in sentences_subset[i:(i+batch_size)]
]
}
)
)
sentence_embeddings = np.concatenate(sentence_embeddings, axis=0)
# A rather hacky way to attach embeddings and replace token property
for i, sentence in enumerate(sentences):
sentence.embeddings = sentence_embeddings[i]
sentence.token = i
sentence_embeddings_subset = np.concatenate(
sentence_embeddings_tmp, axis=0)
sentence_embeddings = np.zeros(
(len(sentences), sentence_embeddings_subset.shape[1]), dtype="float32")
# A rather hacky way to attach embeddings
for i, sentence in enumerate(sentences_subset):
sentence_embeddings[sentence.token, :] = (
sentence_embeddings_subset[i, :])
similarities = sentence_embeddings @ sentence_embeddings.T
# print(similarities[np.tril_indices(similarities.shape[0], k=-1)])
# print(np.where(np.tril(similarities, k=-1) > 0.95))
return sentences, similarities
return similarities


def summarize(text, model_name="large", additional_stopwords=None):
@@ -99,9 +102,11 @@ def summarize(text, model_name="large", additional_stopwords=None):
# Gets a list of processed sentences.
if paragraph:
tmp = _clean_text_by_sentences(
paragraph, "english", additional_stopwords)
for sent in tmp:
paragraph, additional_stopwords)
for j, sent in enumerate(tmp):
sent.paragraph = i
# Hacky way to overwrite token
sent.token = len(sentences) + j
sentences += tmp
elif lang == "zh" or lang == "ko": # zh-Hant sometimes got misclassified into ko
raise NotImplementedError("Not supported yet: zh.")
@@ -112,9 +117,9 @@ def summarize(text, model_name="large", additional_stopwords=None):

# print([sentence.token for sentence in sentences if sentence.token])
# Creates the graph and calculates the similarity coefficient for every pair of nodes.
sentences, similarities = attach_setence_embeddings(
similarities = attach_setence_embeddings(
sentences, batch_size=32, model_name=model_name)
graph = _build_graph(list(range(len(sentences))))
graph = _build_graph([x.token for x in sentences])
_set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

# Remove all nodes with all edges weights equal to zero.
@@ -136,11 +141,10 @@ def summarize(text, model_name="large", additional_stopwords=None):


if __name__ == "__main__":
res, _, lang = summarize("""Of all of President Trump’s former associates who have come under scrutiny in the special counsel’s Russia investigation, his former personal lawyer, Michael D. Cohen, has undertaken perhaps the most surprising and risky legal strategy.
res, _, lang = summarize("""
By Wednesday morning, he was trying to marry those two thoughts into a single message — both embracing the report and trashing it. “The Mueller Report, despite being written by Angry Democrats and Trump Haters, and with unlimited money behind it ($35,000,000), didn’t lay a glove on me,” he wrote. “I DID NOTHING WRONG.”
Mr. Cohen has twice pleaded guilty in federal court in Manhattan to a litany of crimes, and he has volunteered information to the special counsel and other agencies investigating Mr. Trump and his inner circle. He did all this without first obtaining a traditional, ironclad deal under which the government would commit to seeking leniency on Mr. Cohen’s behalf when he is sentenced on Dec. 12.
Mr. Cohen has concluded that his life has been utterly destroyed by his relationship with Mr. Trump and his own actions, and to begin anew he needed to speed up the legal process by quickly confessing his crimes and serving any sentence he receives, according to his friends and associates, and analysis of documents in the case.""")
In subsequent tweets, he tried again to claim victory amid his victimhood, casting the investigation as a contest in which he prevailed. In terms rarely used regarding a criminal investigation, he asserted that “We waited for Mueller and WON” and denounced “the Witch Hunt, which I have already won.”""")
assert lang == "en"
for row in res:
print(f"{row.score} {row.text}")
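The reworked attach_setence_embeddings keeps every sentence addressable by overwriting its token field with a global index and leaving all-zero embedding rows for sentences that were too short to encode; those rows yield zero-weight edges and are later dropped along with the other unreachable nodes. A small numpy sketch (made-up shapes, independent of the Universal Sentence Encoder) of that bookkeeping:

# sketch of the embedding bookkeeping (made-up shapes, not the real model)
import numpy as np

num_sentences, dim = 6, 4
kept = [0, 1, 3, 5]                       # indices of sentences long enough to embed
subset_emb = np.random.rand(len(kept), dim).astype("float32")

embeddings = np.zeros((num_sentences, dim), dtype="float32")
embeddings[kept, :] = subset_emb          # skipped sentences stay all-zero

similarities = embeddings @ embeddings.T  # pairwise dot-product similarities
print(similarities.shape)                 # (6, 6); zero rows give zero-weight edges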
19 changes: 19 additions & 0 deletions text_cleaning_en.py
@@ -0,0 +1,19 @@
import spacy

from summa.preprocessing.textcleaner import init_textcleanner, filter_words, merge_syntactic_units

NLP = spacy.load("en_core_web_sm")


def split_sentences(text):
doc = NLP(text)
return [sent.text.strip() for sent in doc.sents]


def clean_text_by_sentences(text, additional_stopwords=None):
"""Tokenizes a given text into sentences, applying filters and lemmatizing them.
Returns a SyntacticUnit list. """
init_textcleanner("english", additional_stopwords)
original_sentences = split_sentences(text)
filtered_sentences = filter_words(original_sentences)
return merge_syntactic_units(original_sentences, filtered_sentences)
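A short usage sketch of the new module (assuming the pinned summa fork from requirements.txt is installed; the sample text is arbitrary):

# usage sketch for text_cleaning_en (sample text is arbitrary)
from text_cleaning_en import split_sentences, clean_text_by_sentences

text = "Dr. Smith arrived at 9 a.m. She left before noon."
print(split_sentences(text))            # sentence boundaries come from spaCy's parser

for unit in clean_text_by_sentences(text):
    print(unit.text, "->", unit.token)  # original sentence and its filtered tokens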
