### Machine-learning-based automatic summarization of scientific publication texts.

#### Including:
- Text Preprocessing
- KeyWords extraction
- Syntactic Parsing. Part of speech tagging
- Word Embedding (text vectors). 
- Word2Vec Model Visualisation
- TF – IDF matrices 
- Cosine similarity search between documents
- Five different algorithms for construction of summaries: 
    improved "TextRank" from Gensim, usual TextRank, LSA, Kullback–Leibler, LexRank

In [1]:
# Word2Vec model (word embeddings).
from gensim.models import Word2Vec

# This summarizer is based on the improved "TextRank" algorithm, and uses "BM25 ranking function".
from gensim.summarization import summarize, keywords

# Four different algorithms: usual TextRank, LSA, Kullback–Leibler, LexRank.
from sumy.summarizers.text_rank import TextRankSummarizer as TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.summarizers.kl import KLSummarizer as KL
from sumy.summarizers.lex_rank import LexRankSummarizer as LRS

# Used for text preprocessing.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# ROUGE - set of metrics used for evaluating automatic summarization.
import sumy.evaluation.rouge as Rogue

# Used to compute cosine similarity between documents (articles).
from sklearn.metrics.pairwise import cosine_similarity

# Term Frequency – Inverse Document Frequency (TF – IDF).
from sklearn.feature_extraction.text import TfidfVectorizer

# t-distributed Stochastic Neighbor Embedding.
from sklearn.manifold import TSNE

# Used for text preprocessing and part of speech tagging.
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# Download NLTK corpora.
nltk.download("stopwords")
nltk.download('wordnet')

import os
import string

# Used for data representation.
import pandas as pd
import numpy as np

# Used for visualisation.
import matplotlib.pyplot as plt
import pylab as pyl

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kvoronaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kvoronaya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Directory that holds the test corpus (the .txt article files listed below).
# Keeps a trailing slash so it can be prefixed directly to a file name.
PATH_TEST_SET = 'G:/vkr_docs_to_start/shizic_articles_test_set/test_set_vkr/'

In [3]:
import io

# Maps a document's position in `test_set` to the file it was read from.
file_name_mapping = {}
# Raw article texts, one entry per .txt file, in os.listdir() order.
test_set = []
count = 0

for file_name in os.listdir(PATH_TEST_SET):
    if not file_name.endswith(".txt"):
        continue
    # io.open decodes while reading (undecodable non-ASCII bytes are dropped)
    # and works the same way on Python 2 and Python 3; the `with` block
    # guarantees the handle is closed.
    with io.open(os.path.join(PATH_TEST_SET, file_name), 'r',
                 encoding='ascii', errors='ignore') as f:
        # A document is expected to be a single line; reading the whole file
        # as one entry keeps `test_set` indices aligned with
        # `file_name_mapping` even if a file contains extra line breaks.
        test_set.append(f.read())
    file_name_mapping[count] = file_name
    count += 1

print('***** File Name Mapping *****')
for k, v in file_name_mapping.items():
    print('{}: "{}"'.format(k, v))

***** File Name Mapping *****
0: "Use of Acetaminophen (Paracetamol) During Pregnancy .txt"
1: "Care for Adolescents with Depression in Primary Care Settings.txt"
2: "Short-term Suicide Risk After Psychiatric Hospital Discharge.txt"
3: "Association of Hormonal Contraception With Depression.txt"
4: "Exaggerated Acquisition and Resistance to Extinction of Avoidance Behavior in Treated Heroin-Dependent Men.txt"
5: "Efficacy, Acceptability, and Tolerability of Antipsychotics in Treatment-Resistant Schizophrenia.txt"
6: "Cigarette Smoking and the Onset and Persistence of Panic Attacks During Mid-Adulthood in the United States.txt"
7: "Behavioral Interventions for Antipsychotic Medication Associated Obesity.txt"
8: "Treatment Preferences of Psychotherapy Patients with Chronic PTSD.txt"
9: "Efficacy of Topiramate in the Treatment of Crack Cocaine Dependence.txt"


#### -------------------------- Text Preprocessing ---------------------------------------------------------------------------------------------------------------------

In [4]:
def text_preprocessing(test_set):
    """Tokenize, remove English stop words from, and lemmatize each document.

    Args:
        test_set: iterable of raw document strings.

    Returns:
        A list with one string per input document: the remaining tokens,
        lemmatized with WordNet and joined by single spaces.
    """
    # Load the stop-word list once and keep it in a set: the original reloaded
    # the list for every token and searched it linearly.
    # NOTE: matching is case-sensitive, so capitalized stop words
    # (e.g. "In", "With") are kept, exactly as before.
    english_stop_words = set(stopwords.words('english'))

    # One lemmatizer instance is enough for the whole corpus.
    wordnet_lemmatizer = WordNetLemmatizer()

    prepared_test_set = []
    for document in test_set:
        # tokenization - process of converting a text into tokens
        tokens = word_tokenize(document)

        # remove stop-words, then reduce every remaining word to its lemma
        lemmatization_words = [
            wordnet_lemmatizer.lemmatize(word)
            for word in tokens
            if word not in english_stop_words
        ]

        prepared_test_set.append(' '.join(lemmatization_words))

    return prepared_test_set

# Preprocessed counterpart of `test_set` (same order, one string per document).
prepared_test_set = text_preprocessing(test_set)

#### -------------------------- KeyWords extraction ----------------------------------------------------------------------------

In [5]:
# Number of keywords requested from gensim for every document.
number_keywords = 10

# Document index -> (keyword, score) pairs extracted from the prepared text.
keywords_test_set = {
    index: keywords(doc, words=number_keywords, scores=True, lemmatize=True)
    for index, doc in enumerate(prepared_test_set)
}

# One row per document; this list is what the DataFrame is built from.
dataframe = list(keywords_test_set.values())

In [7]:
# One column per requested keyword. The column count follows
# `number_keywords` instead of a second hard-coded 10, so changing the
# requested amount no longer breaks the table construction.
KEYWORDS = pd.DataFrame(dataframe, columns=['KeyWord & Score'] * number_keywords)
KEYWORDS

Unnamed: 0,KeyWord & Score,KeyWord & Score.1,KeyWord & Score.2,KeyWord & Score.3,KeyWord & Score.4,KeyWord & Score.5,KeyWord & Score.6,KeyWord & Score.7,KeyWord & Score.8,KeyWord & Score.9
0,"(study, [0.318763258672])","(adhd, [0.288811247658])","(pregnancy, [0.271242939913])","(risk, [0.264390659747])","(acetaminophen, [0.224141572013])","(attention, [0.170819185708])","(use, [0.165406795656])","(offspring, [0.139080819912])","(infection, [0.137813464181])","(outcome, [0.136368161902])"
1,"(costs, [0.3320741284])","(care, [0.276727789845])","(depression, [0.25152355147])","(effective, [0.246144203219])","(adolescents, [0.245083075426])","(health, [0.224768702332])","(intervention, [0.20406019158])","(group, [0.173564242552])","(qalys, [0.14849737834])","(collaborative, [0.146138270538])"
2,"(disorder, [0.400835511967])","(suicide, [0.328866517918])","(inpatient, [0.244975362963])","(cohort, [0.22359309293])","(adult, [0.215556334549])","(year, [0.211287295335])","(discharge, [0.177299101596])","(diagnosis, [0.155841064908])","(hospital, [0.153777114842])","(death, [0.143028922545])"
3,"(user, [0.267216429183])","(contraceptive, [0.257075094941])","(use, [0.245066868162])","(woman, [0.212146079792])","(associated, [0.177051729335])","(psychiatric, [0.169170011267])","(diagnosis, [0.168346994999])","(hormonal, [0.163656483025])","(january, [0.144412153096])","(year, [0.141149162559])"
4,"(avoidance, [0.33637819377])","(opioid, [0.260168854614])","(behavioral, [0.250658044231])","(task, [0.183852159204])","(hiding, [0.171616781522])","(different, [0.170747799566])","(dependence, [0.163464685072])","(abnormal, [0.146285660347])","(aversive event, [0.14605990018])",
5,"(clozapine, [0.299238648487])","(antipsychotic, [0.270985047024])","(effective, [0.249449074991])","(treatment, [0.234353855335])","(trials, [0.195890011285])","(schizophrenia, [0.193089950974])","(evidence, [0.155805473829])","(rcts, [0.155587895934])","(randomized, [0.147880608834])","(change, [0.146491841048])"
6,"(smoking, [0.325133307891])","(year, [0.241450876136])","(attacks, [0.185775123445])","(onset, [0.184524269285])","(risk, [0.183135987694])","(data, [0.175294298524])","(wave, [0.167876710833])","(panic, [0.158807946924])","(based, [0.15588890665])","(united, [0.155520409927])"
7,"(control, [0.234662902125])","(smi, [0.224329601417])","(medication, [0.166859435553])","(class, [0.166206461617])","(group, [0.166202166359])","(treatment, [0.164366642275])","(effective, [0.16394636219])","(interventions, [0.161796517003])","(knowledge, [0.158982657108])","(counseling, [0.157518926393])"
8,"(preferences, [0.422235830408])","(patients, [0.370151609122])","(treatment, [0.331900856392])","(outcome, [0.251996545759])","(psychotherapy, [0.232550057254])","(ptsd, [0.196338779899])","(depressed, [0.146501862787])","(clinical, [0.138754563027])","(research, [0.131335451141])","(chronic, [0.123320049096])"
9,"(cocaine, [0.371003971018])","(group, [0.313540014415])","(topiramate, [0.283979155483])","(subject, [0.251905802251])","(studied, [0.248619372311])","(use, [0.216307526629])","(treatment, [0.203472372092])","(placebo, [0.140252119317])","(week, [0.13265122596])","(control, [0.129466380513])"


#### -------------------------- Syntactic Parsing. Part of speech tagging. ---------------------------------------------------------------------------------------

Penn Treebank Part-of-Speech Tagset (the tags produced by NLTK's default `pos_tag`):

CC: conjunction, coordinating

CD: numeral, cardinal

DT: determiner

IN: preposition or conjunction, subordinating

JJ: adjective or numeral, ordinal

JJR: adjective, comparative

JJS: adjective, superlative

LS: list item marker

MD: modal auxiliary

NN: noun, common, singular or mass

NNP: noun, proper, singular

NNS: noun, common, plural

PDT: pre-determiner

POS: genitive marker

PRP: pronoun, personal

RB: adverb

RBR: adverb, comparative

RBS: adverb, superlative

RP: particle

UH: interjection

VB: verb, base form

VBD: verb, past tense

VBG: verb, present participle or gerund

VBN: verb, past participle

VBP: verb, present tense, not 3rd person singular

VBZ: verb, present tense, 3rd person singular

WDT: WH-determiner

WP: WH-pronoun

WRB: Wh-adverb

In [8]:
# Tag every prepared document: tokenize it again and hand the tokens to the
# NLTK tagger; the result is one list of (token, tag) pairs per document.
partofspeech_tagging_all = [
    pos_tag(word_tokenize(doc)) for doc in prepared_test_set
]

# for example, to print tagged text, for '3' article
partofspeech_tagging_all[3]

[(u'Association', 'NNP'),
 (u'Hormonal', 'NNP'),
 (u'Contraception', 'NNP'),
 (u'With', 'IN'),
 (u'Depression', 'NNP'),
 (u'.', '.'),
 (u'Is', 'VBZ'),
 (u'use', 'JJ'),
 (u'hormonal', 'JJ'),
 (u'contraception', 'NN'),
 (u'associated', 'VBN'),
 (u'treatment', 'NN'),
 (u'depression', 'NN'),
 (u'?', '.'),
 (u'In', 'IN'),
 (u'nationwide', 'JJ'),
 (u'prospective', 'JJ'),
 (u'cohort', 'NN'),
 (u'study', 'VBD'),
 (u'1', 'CD'),
 (u'million', 'CD'),
 (u'woman', 'NN'),
 (u'living', 'VBG'),
 (u'Denmark', 'NNP'),
 (u',', ','),
 (u'increased', 'VBD'),
 (u'risk', 'NN'),
 (u'first', 'RB'),
 (u'use', 'NN'),
 (u'antidepressant', 'JJ'),
 (u'first', 'JJ'),
 (u'diagnosis', 'NN'),
 (u'depression', 'NN'),
 (u'found', 'VBD'),
 (u'among', 'IN'),
 (u'user', 'JJ'),
 (u'different', 'JJ'),
 (u'type', 'NN'),
 (u'hormonal', 'JJ'),
 (u'contraception', 'NN'),
 (u',', ','),
 (u'highest', 'JJS'),
 (u'rate', 'NN'),
 (u'among', 'IN'),
 (u'adolescent', 'JJ'),
 (u'.', '.'),
 (u'Health', 'NNP'),
 (u'care', 'NN'),
 (u'profess

#### -------------------------- Word Embedding (text vectors). Word2Vec Model. ---------------------

In [9]:
# t-SNE reducer reused by both visualisation cells: projects the input
# vectors onto 2 components, with the random seed pinned to 0.
tsne = TSNE(n_components=2, random_state=0)

In [10]:
# Word2Vec training input: for every document keep only the keyword text,
# i.e. the first element of each (keyword, score) pair in `dataframe`.
input2model = [
    [pair[0] for pair in doc_keywords]
    for doc_keywords in dataframe
]

In [11]:
# Fit Word2Vec on the per-document keyword lists; min_count=1 keeps words
# that occur only once.
model = Word2Vec(input2model, min_count=1, workers=4)
model_keys = model.vocab.keys()

# Stack the vector of every vocabulary word into one 2-D array whose rows
# follow the order of `model_keys`.
model_values = np.array([model[key] for key in model_keys])
model_values.shape



(84L, 100L)

In [12]:
# Project the word vectors onto two dimensions for plotting.
reduced_matrix = tsne.fit_transform(model_values)

# NOTE(review): 200x200 inches at 100 dpi is a 20000x20000 px canvas -
# confirm this size is intended.
pyl.figure(figsize=(200, 200), dpi=100)

# Symmetric limits around the origin, sized from the largest ABSOLUTE
# coordinate. Using only the maximum (as before) clipped every point whose
# negative coordinate was larger in magnitude than the positive maximum.
max_x, max_y = np.abs(reduced_matrix).max(axis=0)[:2]
pyl.xlim((-max_x, max_x))
pyl.ylim((-max_y, max_y))

pyl.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20)

# Rows of `reduced_matrix` follow the order of `model_keys`; pairing them
# with zip avoids indexing into the key collection.
for target_word, (x, y) in zip(model_keys, reduced_matrix):
    pyl.annotate(target_word, (x, y))

pyl.title('Word2Vec Model')
pyl.grid()
pyl.show()

  if self._edgecolors == str('face'):


#### -------------------------- TF – IDF matrices and Search cosine similarity between documents ---------------------

In [13]:
# remove morphological affixes from words, leaving only the word stem
# Module-level Porter stemmer; tokenize() reads it as a global.
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a list holding ``stemmer.stem(token)`` for each token, in order.

    Args:
        tokens: iterable of tokens.
        stemmer: any object that provides a ``stem(token)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split *text* into tokens, drop punctuation tokens and stem the rest.

    A token is dropped when it is found in ``string.punctuation``. Stemming
    uses the module-level ``stemmer`` through ``stem_tokens``.

    Returns:
        list of stemmed tokens.
    """
    words = [
        token
        for token in nltk.word_tokenize(text)
        if token not in string.punctuation
    ]
    return stem_tokens(words, stemmer)

# TF-IDF matrix for corpus 
# The vectorizer uses the custom tokenize() defined above and its built-in
# English stop-word list.
corpus_tfidf = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
# Fit on the prepared documents and transform them in one step.
corpus_representation = corpus_tfidf.fit_transform(prepared_test_set)

# Terms learned by the vectorizer (not read again in the visible code).
feature = corpus_tfidf.get_feature_names()

# Dense copy, used by the cosine-similarity and t-SNE cells.
corpus_representation_arr = corpus_representation.toarray()
    

In [14]:
def cosine_similarity_results(query_doc):
    """Compute the cosine similarity of *query_doc* to every corpus document.

    The query is vectorized with the already fitted ``corpus_tfidf`` and
    compared with each row of ``corpus_representation_arr``.

    Args:
        query_doc: text of the document to compare against the corpus.

    Returns:
        A list of floats, one per corpus document, in corpus order. The same
        list is also appended to the module-level ``cosine_sim_data`` list
        (the existing side effect), so that list must exist before the call.
    """
    tfidf_query_doc = corpus_tfidf.transform([query_doc]).toarray()

    # One vectorized call yields an (n_documents, 1) column of similarities;
    # it replaces the per-document loop, which relied on the Python-2-only
    # xrange() and reshaped the query again on every iteration.
    similarities = cosine_similarity(corpus_representation_arr, tfidf_query_doc)
    row_i = similarities[:, 0].tolist()

    cosine_sim_data.append(row_i)
    return row_i

    
# Must be created before the loop: cosine_similarity_results() appends one
# row of similarities to this module-level list on every call.
cosine_sim_data = list()
for doc in prepared_test_set:
    cosine_similarity_results(doc)


# Rebind the name to a DataFrame (documents x documents) for display.
cosine_sim_data = pd.DataFrame(cosine_sim_data)
cosine_sim_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.023949,0.087947,0.058019,0.059277,0.042445,0.069383,0.02736,0.034432,0.043167
1,0.023949,1.0,0.105139,0.180694,0.031528,0.105794,0.063669,0.218488,0.056997,0.074725
2,0.087947,0.105139,1.0,0.13295,0.026008,0.057755,0.074677,0.038655,0.048437,0.023775
3,0.058019,0.180694,0.13295,1.0,0.022881,0.056621,0.113921,0.034089,0.022842,0.050035
4,0.059277,0.031528,0.026008,0.022881,1.0,0.03964,0.037672,0.132686,0.055159,0.081207
5,0.042445,0.105794,0.057755,0.056621,0.03964,1.0,0.028964,0.160558,0.089039,0.08996
6,0.069383,0.063669,0.074677,0.113921,0.037672,0.028964,1.0,0.02661,0.007189,0.041659
7,0.02736,0.218488,0.038655,0.034089,0.132686,0.160558,0.02661,1.0,0.098097,0.12427
8,0.034432,0.056997,0.048437,0.022842,0.055159,0.089039,0.007189,0.098097,1.0,0.076625
9,0.043167,0.074725,0.023775,0.050035,0.081207,0.08996,0.041659,0.12427,0.076625,1.0


In [15]:
# Project the dense TF-IDF rows (one per document) onto two dimensions.
reduced_tfidf_matr = tsne.fit_transform(corpus_representation_arr)

# One label per document, in corpus order. range() replaces the
# Python-2-only xrange().
labels = ['doc.#{}'.format(i) for i in range(len(prepared_test_set))]

# NOTE(review): 200x200 inches at 100 dpi is a 20000x20000 px canvas -
# confirm this size is intended.
pyl.figure(figsize=(200, 200), dpi=100)

# Symmetric limits around the origin, sized from the largest ABSOLUTE
# coordinate. Using only the maximum (as before) clipped every point whose
# negative coordinate was larger in magnitude than the positive maximum.
max_x, max_y = np.abs(reduced_tfidf_matr).max(axis=0)[:2]
pyl.xlim((-max_x, max_x))
pyl.ylim((-max_y, max_y))

pyl.scatter(reduced_tfidf_matr[:, 0], reduced_tfidf_matr[:, 1], 20)

# Rows of the reduced matrix are in the same order as `labels`.
for target_label, (x, y) in zip(labels, reduced_tfidf_matr):
    pyl.annotate(target_label, (x, y))

pyl.grid()
pyl.show()

#### -------------------------- Summary by improved "TextRank" --------------------------------------------------

In [16]:
# Word budget passed to gensim for every summary.
summary_len_words = 100

# Summarize the raw (unprocessed) documents and print each result under a
# header carrying the document's index.
for count, doc in enumerate(test_set):
    summary = summarize(doc, word_count=summary_len_words)
    print('\n\n ****************** Summary for doc.#{} ********************* '.format(count))
    print(summary)





 ****************** Summary for doc.#0 ********************* 
Several small and large prospective studies have found an association between gestational acetaminophen exposure and attention-deficit/hyperactivity disorder (ADHD)-like behaviors, use of ADHD medication, and ADHD diagnoses in offspring during childhood; the only negative study was a small investigation that examined only one aspect of attention as an outcome.
However, since fever during pregnancy may itself be associated with adverse gestational outcomes, given the present level of uncertainty about the ADHD risk with acetaminophen, it is suggested that, until more data are available, the use of acetaminophen in pregnancy should not be denied in situations in which the need for the drug is clear.


 ****************** Summary for doc.#1 ********************* 
A randomized clinical trial conducted at 9 primary care clinics in Washington State suggests that collaborative care results in an increase of 0.04 quality-adjusted 

#### -------------------------- Summaries by algorithms: usual TextRank, LSA, Kullback–Leibler, LexRank ------------------------------------------

In [17]:
# to regulate the length of summary
SENTENCES_COUNT = 3

# NOTE(review): this rebinds the module-level `stemmer` that tokenize() reads
# (previously a Porter stemmer); re-running the TF-IDF cells after this one
# would use this object instead - confirm whether that is acceptable.
stemmer = Stemmer("english")

# Running sums of [ROUGE-1, ROUGE-2, ROUGE-3], accumulated over all files,
# one accumulator per algorithm.
tr = [0.0, 0.0, 0.0]
lsa = [0.0, 0.0, 0.0]
kl = [0.0, 0.0, 0.0]
lrs = [0.0, 0.0, 0.0]
metrics = [tr, lsa, kl, lrs]
summaries_mapping = {0: 'usual TextRank', 1: 'LSA', 2: 'Kullback–Leibler', 3: 'LexRank'}

# The stop-word list is the same for every file and summarizer: fetch it once.
english_stop_words = get_stop_words("english")

for file_name in os.listdir(PATH_TEST_SET):
    if not file_name.endswith(".txt"):
        continue

    print('\n \n ************ Summaries for file {}. ************'.format(file_name))
    parser = PlaintextParser.from_file(
        os.path.join(PATH_TEST_SET, file_name), Tokenizer("english"))

    # Same order as `metrics` and `summaries_mapping`.
    summarizers = [
        TextRankSummarizer(stemmer),
        Summarizer(stemmer),
        KL(stemmer),
        LRS(stemmer),
    ]
    for summarizer in summarizers:
        summarizer.stop_words = english_stop_words

    # One independent list per algorithm. `[[]] * 4` produced four references
    # to the SAME list, so each algorithm was scored on its own sentences plus
    # those of every algorithm that ran before it.
    summaries = [[] for _ in summarizers]

    for i, summarizer in enumerate(summarizers):
        print('\n --------- Summary by {} algorithm ---------'.format(summaries_mapping[i]))
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summaries[i].append(sentence)
            print(sentence)

        # Score the summary against the sentences of the source document.
        reference_sentences = parser.document.sentences
        metrics[i][0] += Rogue.rouge_1(summaries[i], reference_sentences)
        metrics[i][1] += Rogue.rouge_2(summaries[i], reference_sentences)
        metrics[i][2] += Rogue.rouge_n(summaries[i], reference_sentences, 3)


 
 ************ Summaries for file Use of Acetaminophen (Paracetamol) During Pregnancy .txt. ************

 --------- Summary by usual TextRank algorithm ---------
Several small and large prospective studies have found an association between gestational acetaminophen exposure and attention-deficit/hyperactivity disorder (ADHD)-like behaviors, use of ADHD medication, and ADHD diagnoses in offspring during childhood; the only negative study was a small investigation that examined only one aspect of attention as an outcome.
In the light of the finding of a single study that infection and fever during pregnancy by themselves do not raise the ADHD risk, it appears possible that the use of acetaminophen during pregnancy is itself responsible for the increased risk of ADHD.
However, since fever during pregnancy may itself be associated with adverse gestational outcomes, given the present level of uncertainty about the ADHD risk with acetaminophen, it is suggested that, until more data are av

In [18]:
# Legend entries, listed in the same order as the rows of `metrics`.
labels = ['TextRank', 'LSA', 'KL', 'LexRank']

# One line per algorithm; x positions 1, 2, 3 correspond to the three
# accumulated ROUGE values stored for it.
for label, metric in zip(labels, metrics):
    plt.plot([1, 2, 3], metric, linewidth=1, label=label)

plt.title('Summarization quality')
plt.xlabel('ROUGE metric')
plt.ylabel('Score')
plt.grid(True)
plt.axis('tight')
plt.legend()
plt.savefig('G:/vkr_docs_to_start/shizic_articles_test_set/summarization_ROUGE.pdf')