# Libraries

In [12]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

nlp = spacy.load('en_core_web_sm')



# Text Summarization 

In [82]:
text = """Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from many structural and unstructured data. 
Data science is related to data mining, machine learning and big data. Data science is a concept to unify statistics, data analysis and their related methods in order to understand and analyze actual phenomena with data.
It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, domain knowledge and information science. 
Turing award winner Jim Gray imagined data science as a fourth paradigm of science (empirical, theoretical, computational and now data-driven) and asserted that everything about science is changing because of the impact of information technology and the data deluge.
Data science is an interdisciplinary field focused on extracting knowledge from data sets, which are typically large. The field encompasses analysis, preparing data for analysis, and presenting findings to inform high-level decisions in an organization. 
As such, it incorporates skills from computer science, mathematics, statistics, information visualization, graphic design, complex systems, communication and business.
Many statisticians, including Nate Silver, have argued that data science is not a new field, but rather another name for statistics. Others argue that data science is distinct from statistics because it focuses on problems and techniques unique to digital data.
Vasant Dhar writes that statistics emphasizes quantitative data and description.
In contrast, data science deals with quantitative and qualitative data) and emphasizes prediction and action.
"""


In [83]:
len(text)

1714

In [84]:
doc = nlp(text)

In [85]:
tokens = [token.text for token in doc]

In [86]:
punctuation = punctuation + '\n'

In [87]:
#tokens

# Text Cleaning

word frequency counter

In [88]:
word_freq = {}

In [89]:
stop_words = list(STOP_WORDS)

In [90]:
# Word frequency counter 
word_freq = {}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_freq.keys():
                word_freq[word.text] = 1
            else:
                word_freq[word.text] += 1
    

In [91]:
max_freq = max(word_freq.values())

In [92]:
# Normalize
for word in word_freq.keys():
    word_freq[word] = word_freq[word] / max_freq

# Sentence tokenization

based on the word frequency counter we will now find the imortant sentences 

In [98]:
sent_tokens = [sent for sent in doc.sents]
print(sent_tokens)

[Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from many structural and unstructured data. 
, Data science is related to data mining, machine learning and big data., Data science is a concept to unify statistics, data analysis and their related methods in order to understand and analyze actual phenomena with data.
, It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, domain knowledge and information science. 
, Turing award winner Jim Gray imagined data science as a fourth paradigm of science (empirical, theoretical, computational and now data-driven) and asserted that everything about science is changing because of the impact of information technology and the data deluge.
, Data science is an interdisciplinary field focused on extracting knowledge from data sets, which are typically large., The field encompasses analysis, p

In [100]:
# Sentence importance
sent_score = {}

for sent in sent_tokens:
    for word in sent:
        if word.text.lower() in word_freq.keys():
            if sent not in sent_score.keys():
                sent_score[sent] = word_freq[word.text.lower()]
            else: 
                sent_score[sent] += word_freq[word.text.lower()]
                
            
            
    



In [102]:
print(sent_score)

{Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from many structural and unstructured data. 
: 4.1875, Data science is related to data mining, machine learning and big data.: 4.1875, Data science is a concept to unify statistics, data analysis and their related methods in order to understand and analyze actual phenomena with data.
: 5.0625, It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, domain knowledge and information science. 
: 3.1875, Turing award winner Jim Gray imagined data science as a fourth paradigm of science (empirical, theoretical, computational and now data-driven) and asserted that everything about science is changing because of the impact of information technology and the data deluge.
: 6.5, Data science is an interdisciplinary field focused on extracting knowledge from data sets, which are typically larg

# Selecting 30% sentences with maximum score

In [103]:
from heapq import nlargest

In [107]:
len(sent_score) * 0.3
# we'll try to implement 4 of the best sentences

3.5999999999999996

# Getting the summary 

In [111]:
summary = nlargest(n = 4, iterable = sent_score, key = sent_score.get)

In [113]:
final_summary = [word.text for word in summary]

In [114]:
print(final_summary)

['Turing award winner Jim Gray imagined data science as a fourth paradigm of science (empirical, theoretical, computational and now data-driven) and asserted that everything about science is changing because of the impact of information technology and the data deluge.\n', 'Data science is a concept to unify statistics, data analysis and their related methods in order to understand and analyze actual phenomena with data.\n', 'Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from many structural and unstructured data. \n', 'Data science is related to data mining, machine learning and big data.']


In [116]:
summary = ' '.join(final_summary)

In [120]:
# ca. 40% of the data were able to summarized
len(summary)/len(text)

0.39498249708284716