In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
data = pd.read_csv('tennis_articles_v4.csv',encoding = 'unicode-escape')
data.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [24]:
print(f"There are {data.shape[0]} Article in our data")

There are 8 Article in our data


In [4]:
data['article_text'][0]

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same 

In [5]:
data['article_text'][1]

"BASEL, Switzerland (AP), Roger Federer advanced to the 14th Swiss Indoors final of his career by beating seventh-seeded Daniil Medvedev 6-1, 6-4 on Saturday. Seeking a ninth title at his hometown event, and a 99th overall, Federer will play 93th-ranked Marius Copil on Sunday. Federer dominated the 20th-ranked Medvedev and had his first match-point chance to break serve again at 5-1. He then dropped his serve to love, and let another match point slip in Medvedev's next service game by netting a backhand. He clinched on his fourth chance when Medvedev netted from the baseline. Copil upset expectations of a Federer final against Alexander Zverev in a 6-3, 6-7 (6), 6-4 win over the fifth-ranked German in the earlier semifinal. The Romanian aims for a first title after arriving at Basel without a career win over a top-10 opponent. Copil has two after also beating No. 6 Marin Cilic in the second round. Copil fired 26 aces past Zverev and never dropped serve, clinching after 2 1/2 hours with

### Split Text into Sentences


In [6]:
sentences = []
for sent in data['article_text']:
    sentences.append(sent_tokenize(sent))

In [7]:
sentences = [y for x in sentences for y in x]

In [8]:
sentences[0]

'Maria Sharapova has basically no friends as tennis players on the WTA Tour.'

In [13]:
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]"," ")
clean_sentences = [s.lower() for s in clean_sentences]

  clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]"," ")


In [14]:
nltk.download('stopwords')# one time execution

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

In [16]:
stop_words = stopwords.words('english')

In [18]:
def remove_stop_words(sent):
  sent = " ".join([ word for word in sent if word not in stop_words])
  return sent

In [19]:
clean_sentences = [remove_stop_words(sent.split()) for sent in clean_sentences]

### Download GloVe Word Embeddings


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [20]:
word_embeddings = {}
f = open('glove.6B.100d.txt','r', encoding='utf-8')
for line in tqdm(f):
  values_split = line.split()
  word = values_split[0]
  weights = np.array(values_split[1:], dtype=np.float32)
  word_embeddings[word] = weights
f.close()

400000it [00:13, 30641.68it/s]


In [21]:
word_embeddings["hello"]

array([ 0.26688  ,  0.39632  ,  0.6169   , -0.77451  , -0.1039   ,
        0.26697  ,  0.2788   ,  0.30992  ,  0.0054685, -0.085256 ,
        0.73602  , -0.098432 ,  0.5479   , -0.030305 ,  0.33479  ,
        0.14094  , -0.0070003,  0.32569  ,  0.22902  ,  0.46557  ,
       -0.19531  ,  0.37491  , -0.7139   , -0.51775  ,  0.77039  ,
        1.0881   , -0.66011  , -0.16234  ,  0.9119   ,  0.21046  ,
        0.047494 ,  1.0019   ,  1.1133   ,  0.70094  , -0.08696  ,
        0.47571  ,  0.1636   , -0.44469  ,  0.4469   , -0.93817  ,
        0.013101 ,  0.085964 , -0.67456  ,  0.49662  , -0.037827 ,
       -0.11038  , -0.28612  ,  0.074606 , -0.31527  , -0.093774 ,
       -0.57069  ,  0.66865  ,  0.45307  , -0.34154  , -0.7166   ,
       -0.75273  ,  0.075212 ,  0.57903  , -0.1191   , -0.11379  ,
       -0.10026  ,  0.71341  , -1.1574   , -0.74026  ,  0.40452  ,
        0.18023  ,  0.21449  ,  0.37638  ,  0.11239  , -0.53639  ,
       -0.025092 ,  0.31886  , -0.25013  , -0.63283  , -0.0118

In [27]:
sentence_vectors = []

for sent in clean_sentences:
  if len(sent) !=0 :
    vector = sum([word_embeddings.get(word, np.zeros((100,))) for word in sent.split()]) / (len(sent.split()) + 0.001)
  else:
    vector = np.zeros((100,))

  sentence_vectors.append(vector)

In [28]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
for i in range(len(clean_sentences)):
  for j in range(len(clean_sentences)):
    sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(100,1),
                                      sentence_vectors[j].reshape(100,1))[0,0]


In [34]:
import networkx as nx

In [37]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [40]:
scores.get(0)

-0.21096064267908313

In [44]:
ranked_sentences = sorted(((scores[i],s)for i, s in enumerate(clean_sentences)),reverse=True)
ranked_sentences

[(0.21349978787068744, 'victory sunday anderson qualify atp finals'),
 (0.21349978787068744, 'verdasco anderson hit nine aces opening set'),
 (0.21349978787068744,
  'used first break point close first set going second wrapping win first match point'),
 (0.21349978787068744, 'ultimately tennis small part'),
 (0.21349978787068744, 'uhm really friendly close many players'),
 (0.21349978787068744,
  'tuesday fly paris train afternoon ready first match wednesday night'),
 (0.21349978787068744,
  'time grand slam winner chasing th atp title swiss indoors week faces jan lennard struff second round thursday pm bst'),
 (0.21349978787068744, 'time consult people consult'),
 (0.21349978787068744,
  'think really nice environment great atmosphere especially veteran players helping younger players'),
 (0.21349978787068744, 'think misfiring corners hitting lines enough'),
 (0.21349978787068744, 'think every person different interests'),
 (0.21349978787068744, 'think designed anyhow'),
 (0.213499787

In [45]:
# Extract top 10 sentences as the summary
for i in range(10):
  print(ranked_sentences[i][1])


victory sunday anderson qualify atp finals
verdasco anderson hit nine aces opening set
used first break point close first set going second wrapping win first match point
ultimately tennis small part
uhm really friendly close many players
tuesday fly paris train afternoon ready first match wednesday night
time grand slam winner chasing th atp title swiss indoors week faces jan lennard struff second round thursday pm bst
time consult people consult
think really nice environment great atmosphere especially veteran players helping younger players
think misfiring corners hitting lines enough
