In [1]:
'''
Automatic summarization using Gensim module.
This summarizer is based on the improved "TextRank" algorithm,
and uses "BM25 ranking function".

To install:
conda install -c anaconda gensim=0.12.4

Extract articles from on of the trusted websites: 
http://www.psychiatrictimes.com/

'''

from gensim.summarization import summarize, keywords
import lxml.html as html

# using of NLTK for removing of stop words, stemming and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# download corpuses
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kvoronaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kvoronaya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
base_url = 'http://www.psychiatrictimes.com'

# interested only in schizophrenia and ADHD topics
schizophrenia_path = base_url + '/schizophrenia'
adhd_path = base_url + '/adhd'

In [7]:
def get_corpus_specific_topic(path):
    schizophrenia_corpus = {}
    adhd_corpus = {}
    
    main_page = html.parse(path)
    url_xpath = '//div[contains(@class, "pane-content-arguments-panel-pane-")]//div[contains(@class, "field-name-title")]//a/@href'
    # will extract articles which posted on the main page (so recent articles)
    articles_urls = main_page.getroot().xpath(url_xpath)
    
    articles = []
    titles = []
    authors = []
    publication_dates = []

    for url in articles_urls:
        page = html.parse(base_url + url)

        article_title = page.getroot().xpath('//div[contains(@class, "pane-page-title")]//h1/text()')
        titles.append(article_title)

        article_author = page.getroot().xpath('//div[@class="article-author"]//a/text()')
        authors.append(article_author)

        publication_date = page.getroot().xpath('//div[contains(@class, "article-info")]//div[@class="pane-content"]/text()')
        publication_dates.append(publication_date)

        full_article_text = page.getroot().xpath('//div[contains(@class, "field-name-body")]//p/text()')
        full_article_text = ''.join(full_article_text)
        articles.append(full_article_text)
    
    if 'schizophrenia' in path:
        schizophrenia_corpus['articles'] = articles
        schizophrenia_corpus['titles'] = titles
        schizophrenia_corpus['authors'] = authors
        schizophrenia_corpus['publication_dates'] = publication_dates
        return schizophrenia_corpus

    if 'adhd' in path:
        adhd_corpus['articles'] = articles
        adhd_corpus['titles'] = titles
        adhd_corpus['authors'] = authors
        adhd_corpus['publication_dates'] = publication_dates
        return adhd_corpus
    

In [8]:
schizophrenia_corpus = get_corpus_specific_topic(schizophrenia_path)

In [9]:
adhd_corpus = get_corpus_specific_topic(adhd_path)

In [14]:
print('Today received {} new articles for topic schizophrenia.'.format(len(schizophrenia_corpus['articles'])))

Today received 7 new articles for topic schizophrenia.


In [15]:
print('Today received {} new articles for topic ADHD.'.format(len(adhd_corpus['articles'])))

Today received 7 new articles for topic ADHD.


In [22]:
def print_article_schizophrenia_corpus(article_number):
    print('********** Example of one of the article: *********')
    print('The title is: {}'.format(schizophrenia_corpus['titles'][article_number][0]))
    print('The authors are: {}'.format(schizophrenia_corpus['authors'][article_number][0]))
    print('The date of publication is: {}'.format(schizophrenia_corpus['publication_dates'][article_number][0]))
    print(schizophrenia_corpus['articles'][article_number])
    
def print_article_adhd_corpus(article_number):
    print('********** Example of one of the article: *********')
    print('The title is: {}'.format(adhd_corpus['titles'][article_number][0]))
    print('The authors are: {}'.format(adhd_corpus['authors'][article_number][0]))
    print('The date of publication is: {}'.format(adhd_corpus['publication_dates'][article_number][0]))
    print(adhd_corpus['articles'][article_number])  

In [23]:
print_article_schizophrenia_corpus(0)

********** Example of one of the article: *********
The title is: Adjunctive Topiramate in People With Schizophrenia
The authors are: Brian Miller, MD, PhD, MPH
The date of publication is: 
    September 29, 2016

Many patients with schizophrenia experience residual symptoms despite currently available treatments that affect quality of life and overall function. Treatment with a variety of different agents—as adjuncts to antipsychotics—has either failed to show consistent, robust effects on psychopathology, or needs replication in larger studies.By contrast, several pharmacologic strategies, including adjunctive topiramate, have been successful in reducing antipsychotic-induced weight gain.Topiramate is approved by the US FDA as an anti-epileptic and anti-migraine treatment. In patients with epilepsy and obesity and/or type 2 diabetes mellitus, topiramate has been associated with weight loss and improved glucose homeostasis, potentially through appetite reduction.Previous quantitative 

In [24]:
print_article_adhd_corpus(2)

********** Example of one of the article: *********
The title is: Understanding the Link Between Lead Toxicity and ADHD
The authors are: Joel T. Nigg, PhD
The date of publication is: 
    September 30, 2016
Several neurotoxic chemicals can disrupt brain development, which contributes to neurodevelopmental and psychiatric disorders—including ADHD. Lead is among the most studied neurotoxicants relevant to mental disorders. Because lead is stable and inert, the total amount on earth never changes. Over the past 6000 years, people have mined about 300 million tons of lead; some 150 million tons are still dispersed in the environment in one form or another.

Most exposure in children in the US (about 70%) occurs through lead paint in older houses, schools, and other buildings; or in surrounding soil and dust, which has accumulated and bound lead over the decades from airborne pollution. Other sources of exposure include water (leaching from lead in pipes, as in the recent Flint water crisis

In [32]:
def pre_process(list_words_from_article):
    # remove stop-words
    filtered_article_words = [word for word in list_words_from_article if word not in stopwords.words('english')]

    wordnet_lemmatizer = WordNetLemmatizer()

    # lemmatization process
    lemmatization_words = []
    for word in filtered_article_words:
        lemm_word = wordnet_lemmatizer.lemmatize(word)
        lemmatization_words.append(lemm_word)
    
    return lemmatization_words


def pre_process_articles_chizophrenia_corpus(article_number):
    list_words_from_article = schizophrenia_corpus['articles'][article_number].split()
    return pre_process(list_words_from_article)


def pre_process_articles_adhd_corpus(article_number):
    list_words_from_article = adhd_corpus['articles'][article_number].split()
    return pre_process(list_words_from_article)


def get_keywords(modified_article, number_keywords):
    print (' ******* Extracted Keywords ******* ')
    extracted_keywords =  keywords(modified_article, words=number_keywords, scores=True, lemmatize=True)
    for keyword in extracted_keywords:
        print('{} - {}'.format(keyword[0], keyword[1]))
        

def get_summary_chizophrenia_corpus(article_number, summary_len, number_keywords):
    modified_article = ' '.join(pre_process_articles_chizophrenia_corpus(article_number))
    get_keywords(modified_article, number_keywords)
    print(' ******* Summary ******* ')
    summary = summarize(schizophrenia_corpus['articles'][article_number], word_count=summary_len)
    print(summary)
    
def get_summary_adhd_corpus(article_number, summary_len, number_keywords):
    modified_article = ' '.join(pre_process_articles_adhd_corpus(article_number))
    get_keywords(modified_article, number_keywords)
    print(' ******* Summary ******* ')
    summary = summarize(adhd_corpus['articles'][article_number], word_count=summary_len)
    print(summary)
    

In [33]:
get_summary_chizophrenia_corpus(0, 100, 5)

 ******* Extracted Keywords ******* 
topiramate - [ 0.39952913]
trials - [ 0.22944226]
patients - [ 0.21032806]
antipsychotic - [ 0.20812922]
effect - [ 0.20145513]
 ******* Summary ******* 
In a systematic search of PubMed/MEDLINE, the researchers looked for all published studies of antipsychotic augmentation with topiramate in patients with schizophrenia-spectrum disorders (both randomized, placebo-controlled trials or open-label trials with an untreated control group).The primary outcome was change in total score on either the Positive and Negative Syndrome Scale (PANSS) or the Brief Psychiatric Rating Scale (BPRS).
There was a trend for more paresthesias with topiramate use (relative risk = 2.0), but otherwise no difference in adverse effects reported in at least 3 trials.The authors found evidence that adjunctive topiramate was associated with significantly greater reductions in psychopathology (particularly in clozapine-treated patients) and body weight.Other than an increase in 

In [35]:
get_summary_adhd_corpus(2, 120, 7)

 ******* Extracted Keywords ******* 
leaded - [ 0.56953309]
exposure child - [ 0.19004248]
levels - [ 0.18172189]
effect - [ 0.15911143]
paint - [ 0.12044835]
health - [ 0.11600123]
 ******* Summary ******* 
By the 1920s lead’s harmful effect on child development was medically established, and several nations had begun to restrict or phase out lead paint.
Lead use was finally restricted in the US in the 1970s and phased out of gasoline and paint by 1986, which reduced the average lead level among children to about 1.0 µg/dL by the 2000s.
Children in nations that do not regulate lead have exposure and blood lead levels higher than in the US as well.
While many studies in the literature and the meta-analysis by Goodlad and colleagues assayed lead levels that were higher than are now common among the US population, several studies using varying methodology from 2005 to 2015 confirmed that blood lead level was associated with ADHD even at levels in the 0.5 to 3.0 µg/dL range, after control

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kvoronaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
# remove morphological affixes from words, leaving only the word stem
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

# TF-IDF matrix for corpus 
corpus_tfidf = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
corpus_representation = corpus_tfidf.fit_transform(schizophrenia_corpus['articles'][0])

feature = corpus_tfidf.get_feature_names()