In [34]:
import spacy
from collections import Counter
import pandas as pd
import numpy as np
from heapq import nlargest
from transformers import pipeline

# Extractive Summarization

In [2]:
text = '''Small data are often used in scientific and engineering research due to the presence of various constraints, such as time, cost, ethics, privacy, security, and technical limitations in data acquisition. However, big data have been the focus for the past decade, small data and their challenges have received little attention, even though they are technically more severe in machine learning (ML) and deep learning (DL) studies. Overall, the small data challenge is often compounded by issues, such as data diversity, imputation, noise, imbalance, and high-dimensionality. Fortunately, the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI), which enable data-driven scientific discovery, and many advanced ML and DL technologies developed for big data have inadvertently provided solutions for small data problems. As a result, significant progress has been made in ML and DL for small data challenges in the past decade. In this review, we summarize and analyze several emerging potential solutions to small data challenges in molecular science, including chemical and biological sciences. We review both basic machine learning algorithms, such as linear regression, logistic regression (LR), k-nearest neighbor (KNN), support vector machine (SVM), kernel learning (KL), random forest (RF), and gradient boosting trees (GBT), and more advanced techniques, including artificial neural network (ANN), convolutional neural network (CNN), U-Net, graph neural network (GNN), Generative Adversarial Network (GAN), long short-term memory (LSTM), autoencoder, transformer, transfer learning, active learning, graph-based semi-supervised learning, combining deep learning with traditional machine learning, and physical model-based data augmentation. We also briefly discuss the latest advances in these methods. Finally, we conclude the survey with a discussion of promising trends in small data challenges in molecular science.'''

In [3]:
len(text)

1983

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
doc = nlp(text)

In [9]:
tokens = [token.text.lower() for token in doc
         if not token.is_punct and
         not token.is_stop and
         not token == '\n']

In [10]:
tokens[:10]

['small',
 'data',
 'scientific',
 'engineering',
 'research',
 'presence',
 'constraints',
 'time',
 'cost',
 'ethics']

In [11]:
word_freq = Counter(tokens)

In [13]:
word_freq

Counter({'small': 7,
         'data': 14,
         'scientific': 2,
         'engineering': 1,
         'research': 1,
         'presence': 1,
         'constraints': 1,
         'time': 1,
         'cost': 1,
         'ethics': 1,
         'privacy': 1,
         'security': 1,
         'technical': 1,
         'limitations': 1,
         'acquisition': 1,
         'big': 3,
         'focus': 1,
         'past': 2,
         'decade': 2,
         'challenges': 4,
         'received': 1,
         'little': 1,
         'attention': 1,
         'technically': 1,
         'severe': 1,
         'machine': 4,
         'learning': 9,
         'ml': 4,
         'deep': 2,
         'dl': 4,
         'studies': 1,
         'overall': 1,
         'challenge': 1,
         'compounded': 1,
         'issues': 1,
         'diversity': 1,
         'imputation': 1,
         'noise': 1,
         'imbalance': 1,
         'high': 1,
         'dimensionality': 1,
         'fortunately': 1,
         'current'

In [14]:
max_freq = max(word_freq.values())

In [15]:
max_freq

14

In [16]:
for word in word_freq.keys():
    word_freq[word] = word_freq[word] / max_freq

In [17]:
word_freq

Counter({'small': 0.5,
         'data': 1.0,
         'scientific': 0.14285714285714285,
         'engineering': 0.07142857142857142,
         'research': 0.07142857142857142,
         'presence': 0.07142857142857142,
         'constraints': 0.07142857142857142,
         'time': 0.07142857142857142,
         'cost': 0.07142857142857142,
         'ethics': 0.07142857142857142,
         'privacy': 0.07142857142857142,
         'security': 0.07142857142857142,
         'technical': 0.07142857142857142,
         'limitations': 0.07142857142857142,
         'acquisition': 0.07142857142857142,
         'big': 0.21428571428571427,
         'focus': 0.07142857142857142,
         'past': 0.14285714285714285,
         'decade': 0.14285714285714285,
         'challenges': 0.2857142857142857,
         'received': 0.07142857142857142,
         'little': 0.07142857142857142,
         'attention': 0.07142857142857142,
         'technically': 0.07142857142857142,
         'severe': 0.07142857142857142

In [18]:
sent_token = [sent.text for sent in doc.sents]

In [19]:
sent_token

['Small data are often used in scientific and engineering research due to the presence of various constraints, such as time, cost, ethics, privacy, security, and technical limitations in data acquisition.',
 'However, big data have been the focus for the past decade, small data and their challenges have received little attention, even though they are technically more severe in machine learning (ML) and deep learning (DL) studies.',
 'Overall, the small data challenge is often compounded by issues, such as data diversity, imputation, noise, imbalance, and high-dimensionality.',
 'Fortunately, the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI), which enable data-driven scientific discovery, and many advanced ML and DL technologies developed for big data have inadvertently provided solutions for small data problems.',
 'As a result, significant progress has been made in ML and DL for small data challenges in the past decade

In [23]:
sent_score = {}
for sent in sent_token:
    for word in sent.split():
        if word in word_freq.keys():
            if sent not in sent_score.keys():
                sent_score[sent] = word_freq[word]
            else:
                sent_score[sent] += word_freq[word]

In [24]:
sent_score

{'Small data are often used in scientific and engineering research due to the presence of various constraints, such as time, cost, ethics, privacy, security, and technical limitations in data acquisition.': 2.5,
 'However, big data have been the focus for the past decade, small data and their challenges have received little attention, even though they are technically more severe in machine learning (ML) and deep learning (DL) studies.': 5.214285714285715,
 'Overall, the small data challenge is often compounded by issues, such as data diversity, imputation, noise, imbalance, and high-dimensionality.': 2.642857142857143,
 'Fortunately, the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI), which enable data-driven scientific discovery, and many advanced ML and DL technologies developed for big data have inadvertently provided solutions for small data problems.': 5.2857142857142865,
 'As a result, significant progress has been

In [27]:
num_sentences = 3
n = nlargest(num_sentences, sent_score, key = sent_score.get)

In [28]:
n

['We review both basic machine learning algorithms, such as linear regression, logistic regression (LR), k-nearest neighbor (KNN), support vector machine (SVM), kernel learning (KL), random forest (RF), and gradient boosting trees (GBT), and more advanced techniques, including artificial neural network (ANN), convolutional neural network (CNN), U-Net, graph neural network (GNN), Generative Adversarial Network (GAN), long short-term memory (LSTM), autoencoder, transformer, transfer learning, active learning, graph-based semi-supervised learning, combining deep learning with traditional machine learning, and physical model-based data augmentation.',
 'Fortunately, the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI), which enable data-driven scientific discovery, and many advanced ML and DL technologies developed for big data have inadvertently provided solutions for small data problems.',
 'However, big data have been the f

In [29]:
ordered_sents = []
for sent in sent_token:
    for i in n:
        if sent == i:
            ordered_sents.append(sent)

In [30]:
ordered_sents

['However, big data have been the focus for the past decade, small data and their challenges have received little attention, even though they are technically more severe in machine learning (ML) and deep learning (DL) studies.',
 'Fortunately, the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI), which enable data-driven scientific discovery, and many advanced ML and DL technologies developed for big data have inadvertently provided solutions for small data problems.',
 'We review both basic machine learning algorithms, such as linear regression, logistic regression (LR), k-nearest neighbor (KNN), support vector machine (SVM), kernel learning (KL), random forest (RF), and gradient boosting trees (GBT), and more advanced techniques, including artificial neural network (ANN), convolutional neural network (CNN), U-Net, graph neural network (GNN), Generative Adversarial Network (GAN), long short-term memory (LSTM), autoencoder

In [31]:
or_sen = ' '.join(ordered_sents)

In [32]:
or_sen

'However, big data have been the focus for the past decade, small data and their challenges have received little attention, even though they are technically more severe in machine learning (ML) and deep learning (DL) studies. Fortunately, the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI), which enable data-driven scientific discovery, and many advanced ML and DL technologies developed for big data have inadvertently provided solutions for small data problems. We review both basic machine learning algorithms, such as linear regression, logistic regression (LR), k-nearest neighbor (KNN), support vector machine (SVM), kernel learning (KL), random forest (RF), and gradient boosting trees (GBT), and more advanced techniques, including artificial neural network (ANN), convolutional neural network (CNN), U-Net, graph neural network (GNN), Generative Adversarial Network (GAN), long short-term memory (LSTM), autoencoder, transfo

# Abstractive Summarization

In [35]:
summarizer = pipeline('summarization', model = 't5-base', tokenizer = 't5-base', framework = 'pt')



In [36]:
summary = summarizer(text, max_length = 100, min_length = 10, do_sample = False)

In [37]:
summary

[{'summary_text': 'the small data challenge is often compounded by issues such as data diversity, imputation, noise, imbalance, and high-dimensionality . the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI)'}]

In [39]:
summary[0]['summary_text']

'the small data challenge is often compounded by issues such as data diversity, imputation, noise, imbalance, and high-dimensionality . the current big data era is characterized by technological breakthroughs in ML, DL, and artificial intelligence (AI)'