# NATURAL LANGUAGE PROCESSING

## TEXT SUMMARIZATION

In [1]:
#importing the required libraries
import string
from string import punctuation

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Text that needs to be summarized.
# The literal opens with a newline and every line starts with a single space,
# so the string contains "\n " sequences between its lines.
text = """
 Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.
 I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.
 So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
 I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players.
 I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all.
 I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players.
 I think every person has different interests. I have friends that have completely different jobs and interests, and I've met them in very different parts of my life.
 I think everyone just thinks because we're tennis players we should be the greatest of friends. But ultimately tennis is just a very small part of what we do.
 There are so many other things that we're interested in, that we do.'
 """

In [3]:
# Display the English stop-word set imported from spaCy: words that don't help much
# in the NLP process and only increase the time taken, so they are filtered out below.
STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [4]:
# Keep the stop words in a set: the token filter below only performs membership
# tests, which are constant-time on a set but scan the whole sequence on a list.
stopwords = set(STOP_WORDS)

In [5]:
# Load the spaCy pipeline named "en_core_web_sm" (the small English model)
nlp=spacy.load("en_core_web_sm")

In [6]:
# Run the loaded pipeline on the raw text; `doc` is iterated token by token
# and through `doc.sents` in the cells below.
doc=nlp(text)

In [7]:
# Extend the standard punctuation characters with "\n " so the newline-plus-space
# whitespace tokens in the text are filtered out as well.
# The value is built from string.punctuation rather than from the current value of
# `punctuation`, so re-running this cell does not append "\n " a second time.
punctuation = string.punctuation + "\n "
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n '

In [8]:
# Word tokenization: lowercase every token, then drop stop words and tokens that
# consist only of punctuation/whitespace characters.
# `lowered in punctuation` on a str is a substring test: it keeps tokens such as
# "..." or "--" (not contiguous in `punctuation`) and only drops strings that happen
# to appear verbatim inside it. Checking character by character avoids both issues.
tokens = [
    lowered
    for lowered in (token.text.lower() for token in doc)
    if lowered not in stopwords
    and not all(char in punctuation for char in lowered)
]

In [9]:
# Inspect the filtered, lowercased tokens
tokens

['maria',
 'sharapova',
 'basically',
 'friends',
 'tennis',
 'players',
 'wta',
 'tour',
 'russian',
 'player',
 'problems',
 'openly',
 'speaking',
 'recent',
 'interview',
 'said',
 'hide',
 'feelings',
 'think',
 'knows',
 'job',
 'courts',
 'court',
 'playing',
 'competitor',
 'want',
 'beat',
 'single',
 'person',
 'locker',
 'room',
 'net',
 'strike',
 'conversation',
 'weather',
 'know',
 'minutes',
 'try',
 'win',
 'tennis',
 'match',
 'pretty',
 'competitive',
 'girl',
 'hellos',
 'sending',
 'players',
 'flowers',
 'uhm',
 'friendly',
 'close',
 'players',
 'lot',
 'friends',
 'away',
 'courts',
 'said',
 'close',
 'lot',
 'players',
 'strategic',
 'different',
 'men',
 'tour',
 'women',
 'tour',
 'think',
 'sport',
 'mean',
 'friends',
 'categorized',
 'tennis',
 'player',
 'going',
 'tennis',
 'players',
 'think',
 'person',
 'different',
 'interests',
 'friends',
 'completely',
 'different',
 'jobs',
 'interests',
 'met',
 'different',
 'parts',
 'life',
 'think',
 'think

In [10]:
# Join the tokens into one space-separated string, which is easier to read than the list
" ".join(tokens)

'maria sharapova basically friends tennis players wta tour russian player problems openly speaking recent interview said hide feelings think knows job courts court playing competitor want beat single person locker room net strike conversation weather know minutes try win tennis match pretty competitive girl hellos sending players flowers uhm friendly close players lot friends away courts said close lot players strategic different men tour women tour think sport mean friends categorized tennis player going tennis players think person different interests friends completely different jobs interests met different parts life think thinks tennis players greatest friends ultimately tennis small things interested'

In [11]:
#importing counter
from collections import Counter

In [12]:
# Tally how many times each filtered token occurs
word_freq = Counter()
word_freq.update(tokens)
word_freq

Counter({'maria': 1,
         'sharapova': 1,
         'basically': 1,
         'friends': 5,
         'tennis': 6,
         'players': 6,
         'wta': 1,
         'tour': 3,
         'russian': 1,
         'player': 2,
         'problems': 1,
         'openly': 1,
         'speaking': 1,
         'recent': 1,
         'interview': 1,
         'said': 2,
         'hide': 1,
         'feelings': 1,
         'think': 4,
         'knows': 1,
         'job': 1,
         'courts': 2,
         'court': 1,
         'playing': 1,
         'competitor': 1,
         'want': 1,
         'beat': 1,
         'single': 1,
         'person': 2,
         'locker': 1,
         'room': 1,
         'net': 1,
         'strike': 1,
         'conversation': 1,
         'weather': 1,
         'know': 1,
         'minutes': 1,
         'try': 1,
         'win': 1,
         'match': 1,
         'pretty': 1,
         'competitive': 1,
         'girl': 1,
         'hellos': 1,
         'sending': 1,
         

In [13]:
# Highest count of any single word; used as the divisor when normalizing below
max_word_freq=max(word_freq.values())
max_word_freq

6

In [14]:
# Normalize the word frequencies by the highest raw count.
# The normalized Counter is rebuilt from the raw token counts instead of dividing
# `word_freq` in place, so running this cell more than once cannot divide the
# values a second time.
raw_counts = Counter(tokens)
# default=1 keeps an empty token list from raising; there is then nothing to scale.
peak_count = max(raw_counts.values(), default=1)
word_freq = Counter({word: count / peak_count for word, count in raw_counts.items()})

In [15]:
# Normalized word frequency: each value is the word's count divided by the maximum
# count, so every value is greater than 0 and at most 1
word_freq

Counter({'maria': 0.16666666666666666,
         'sharapova': 0.16666666666666666,
         'basically': 0.16666666666666666,
         'friends': 0.8333333333333334,
         'tennis': 1.0,
         'players': 1.0,
         'wta': 0.16666666666666666,
         'tour': 0.5,
         'russian': 0.16666666666666666,
         'player': 0.3333333333333333,
         'problems': 0.16666666666666666,
         'openly': 0.16666666666666666,
         'speaking': 0.16666666666666666,
         'recent': 0.16666666666666666,
         'interview': 0.16666666666666666,
         'said': 0.3333333333333333,
         'hide': 0.16666666666666666,
         'feelings': 0.16666666666666666,
         'think': 0.6666666666666666,
         'knows': 0.16666666666666666,
         'job': 0.16666666666666666,
         'courts': 0.3333333333333333,
         'court': 0.16666666666666666,
         'playing': 0.16666666666666666,
         'competitor': 0.16666666666666666,
         'want': 0.16666666666666666,
        

In [16]:
# Sentence tokenization: collect the sentences of the parsed document into a list
sent_tokens = list(doc.sents)
sent_tokens

[
  Maria Sharapova has basically no friends as tennis players on the WTA Tour.,
 The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.
  ,
 I think everyone knows this is my job here.,
 When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.
  ,
 So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
  ,
 I'm a pretty competitive girl.,
 I say my hellos, but I'm not sending any players flowers as well.,
 Uhm, I'm not really friendly or close to many players.
  ,
 I have not a lot of friends away from the courts.',
 When she said she is not really close to a lot of players, is that something strategic that she is doing?,
 Is it different on the men's tour than the women's tour?,
 ',
 No, not at all.

In [17]:
# Sentence scoring: a sentence's score is the sum of the normalized frequencies of
# its words that appear in `word_freq`. Sentences with no such word get no entry.
sentence_scores = {}
for sent in sent_tokens:
    for word in sent:
        key = word.text.lower()
        if key in word_freq:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + word_freq[key]

In [18]:
sentence_scores

{
  Maria Sharapova has basically no friends as tennis players on the WTA Tour.: 4.0,
 The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.
  : 2.0,
 I think everyone knows this is my job here.: 0.9999999999999999,
 When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.
  : 2.1666666666666665,
 So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
  : 2.333333333333333,
 I'm a pretty competitive girl.: 0.5,
 I say my hellos, but I'm not sending any players flowers as well.: 1.5,
 Uhm, I'm not really friendly or close to many players.
  : 1.6666666666666665,
 I have not a lot of friends away from the courts.': 1.6666666666666667,
 When she said she is not really close to a lot of players, is that 

In [19]:
#importing pandas
import pandas as pd

In [20]:
# Creating a DataFrame to increase the readability of the sentence scores.
# NOTE(review): the "sentence" column holds the sentence objects themselves, which the
# recorded output shows rendered as tuples of tokens; using `sent.text` may read
# better — confirm before changing.
pd.DataFrame(list(sentence_scores.items()),columns=["sentence","scores"])

Unnamed: 0,sentence,scores
0,"(\n , Maria, Sharapova, has, basically, no, fr...",4.0
1,"(The, Russian, player, has, no, problems, in, ...",2.0
2,"(I, think, everyone, knows, this, is, my, job,...",1.0
3,"(When, I, 'm, on, the, courts, or, when, I, 'm...",2.166667
4,"(So, I, 'm, not, the, one, to, strike, up, a, ...",2.333333
5,"(I, 'm, a, pretty, competitive, girl, .)",0.5
6,"(I, say, my, hellos, ,, but, I, 'm, not, sendi...",1.5
7,"(Uhm, ,, I, 'm, not, really, friendly, or, clo...",1.666667
8,"(I, have, not, a, lot, of, friends, away, from...",1.666667
9,"(When, she, said, she, is, not, really, close,...",2.166667


In [21]:
#SUMMARIZATION
from heapq import nlargest

In [22]:
# Number of sentences to keep in the summary: 30% of the sentence count, rounded down.
# Clamped to at least one so that a text with fewer than four sentences does not
# end up with an empty summary.
SUMMARY_RATIO = 0.3
select_length = max(1, int(len(sent_tokens) * SUMMARY_RATIO))
select_length

6

In [23]:
# Pick the `select_length` highest-scoring sentences. Iterating the dict yields its
# keys (the sentences) and `sentence_scores.get` supplies each one's score as the key.
# The result is ordered by descending score, not by position in the original text.
summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)
summary

[I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players.
  ,
 
  Maria Sharapova has basically no friends as tennis players on the WTA Tour.,
 I think everyone just thinks because we're tennis players we should be the greatest of friends.,
 I have friends that have completely different jobs and interests, and I've met them in very different parts of my life.
  ,
 So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
  ,
 When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.
  ]

In [24]:
# Turn the selected sentences into a single summary string.
# The sentences are re-selected here rather than read from `summary`, because this
# cell rebinds `summary` from a list of sentences to a str; reading it back would
# make the cell fail when it is run a second time.
top_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
final_summary = [sent.text for sent in top_sentences]
summary = " ".join(final_summary)

In [25]:
# Summarized text: the selected sentences joined into one string
summary

"I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players.\n  \n Maria Sharapova has basically no friends as tennis players on the WTA Tour. I think everyone just thinks because we're tennis players we should be the greatest of friends. I have friends that have completely different jobs and interests, and I've met them in very different parts of my life.\n  So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.\n  When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.\n "

In [26]:
# Original text, shown for comparison with the summary
text

"\n Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.\n I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.\n So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.\n I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players.\n I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all.\n I think just because you'r