In [2]:
import numpy as np
import pandas as pd
import nltk
#nltk.download('punkt') # one time execution
import re

In [3]:
# Load the article corpus; later cells read its 'article_text' column.
articles_csv = "Dataset/tennis_articles_v4.csv"
df = pd.read_csv(articles_csv)

In [5]:
# Split every article into sentences and gather them in one flat list.
from nltk.tokenize import sent_tokenize

sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]
sentences[:5]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl."]

In [6]:
# Remove punctuation, numbers and special characters: every character that is
# not an ASCII letter becomes a space.
# regex=True is passed explicitly: since pandas 2.0 the default is regex=False,
# which would treat "[^a-zA-Z]" as a literal string and leave the text as-is.
letters_only = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# Lowercase everything and go back to a plain list of strings.
clean_sentences = letters_only.str.lower().tolist()
clean_sentences

['maria sharapova has basically no friends as tennis players on the wta tour ',
 'the russian player has no problems in openly speaking about it and in a recent interview she said   i don t really hide any feelings too much ',
 'i think everyone knows this is my job here ',
 'when i m on the courts or when i m on the court playing  i m a competitor and i want to beat every single person whether they re in the locker room or across the net so i m not the one to strike up a conversation about the weather and know that in the next few minutes i have to go and try to win a tennis match ',
 'i m a pretty competitive girl ',
 'i say my hellos  but i m not sending any players flowers as well ',
 'uhm  i m not really friendly or close to many players ',
 'i have not a lot of friends away from the courts  ',
 'when she said she is not really close to a lot of players  is that something strategic that she is doing ',
 'is it different on the men s tour than the women s tour ',
 ' no  not at all 

In [7]:
# Rebuild clean_sentences from the raw `sentences` list: replace every
# non-letter character (punctuation, digits, special characters) with a space.
# regex=True must be explicit; pandas >= 2.0 defaults to a literal match, which
# would silently skip the cleaning.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [8]:
# Load NLTK's English stopword list into `stop_words`.
# One-time setup: run nltk.download('stopwords') if the corpus is not yet installed.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [9]:
# function to remove stopwords
def remove_stopwords(sen, stopwords_to_remove=None):
    """Return the tokens of ``sen`` that are not stopwords, joined by spaces.

    Parameters
    ----------
    sen : iterable of str
        Tokens of a single sentence (e.g. the result of ``str.split()``).
    stopwords_to_remove : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` list is
        used, so existing one-argument calls behave as before.

    Returns
    -------
    str
        The surviving tokens in their original order, separated by single
        spaces; an empty string if nothing survives.
    """
    if stopwords_to_remove is None:
        stopwords_to_remove = stop_words
    # A set gives constant-time membership checks instead of scanning the
    # whole stopword list once per token.
    blocked = set(stopwords_to_remove)
    return " ".join(token for token in sen if token not in blocked)

In [10]:
# Tokenise each cleaned sentence on whitespace, then strip its stopwords.
clean_sentences = list(map(remove_stopwords, map(str.split, clean_sentences)))

In [11]:
# Bag-of-words encoding: each cleaned sentence becomes one row of term counts.
# The vectorizer is created with max_features=5000.
from sklearn.feature_extraction.text import CountVectorizer    
count_vect = CountVectorizer(max_features=5000)
# Learn the vocabulary from the cleaned sentences and encode them in one step.
sentence_vectors = count_vect.fit_transform(clean_sentences)
# Printed entries read "(sentence index, term index)  count".
print(sentence_vectors)

  (0, 571)	1
  (0, 625)	1
  (0, 419)	1
  (0, 548)	1
  (0, 223)	1
  (0, 41)	1
  (0, 495)	1
  (0, 336)	1
  (1, 368)	1
  (1, 190)	1
  (1, 253)	1
  (1, 455)	1
  (1, 472)	1
  (1, 277)	1
  (1, 457)	1
  (1, 518)	1
  (1, 396)	1
  (1, 434)	1
  (1, 418)	1
  (1, 471)	1
  (2, 285)	1
  (2, 297)	1
  (2, 174)	1
  (2, 553)	1
  (3, 341)	1
  :	:
  (115, 163)	1
  (115, 303)	1
  (115, 565)	1
  (115, 325)	1
  (115, 579)	1
  (115, 296)	1
  (115, 368)	1
  (116, 71)	1
  (116, 344)	1
  (116, 303)	1
  (116, 565)	1
  (117, 31)	1
  (117, 181)	1
  (117, 401)	1
  (117, 521)	1
  (117, 36)	1
  (117, 482)	1
  (117, 570)	1
  (117, 363)	1
  (117, 80)	1
  (117, 179)	1
  (118, 154)	1
  (118, 14)	1
  (118, 609)	1
  (118, 303)	1


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of sentence vectors, computed in one
# vectorised call instead of len(sentences)**2 separate calls on single rows.
# The result is a dense (n_sentences, n_sentences) array.
sim_mat = cosine_similarity(sentence_vectors)

# A sentence must not count as similar to itself, so clear the diagonal
# (the original loop left those entries at zero by skipping i == j).
np.fill_diagonal(sim_mat, 0.0)

In [14]:
import networkx as nx

# Turn the sentence-similarity matrix into a networkx graph.
nx_graph = nx.from_numpy_array(sim_mat)
# Run PageRank on that graph; `scores` is looked up by sentence index when the
# sentences are ranked in a later cell.
scores = nx.pagerank(nx_graph)

In [15]:
# Pair each sentence with its PageRank score, then order the pairs from the
# highest score to the lowest.
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [16]:
# Extract top 10 sentences as the summary.
# Slicing (rather than indexing 0..9) avoids an IndexError when the corpus
# yields fewer than 10 sentences; in that case every sentence is printed.
for _score, sentence in ranked_sentences[:10]:
  print(sentence)

Federer won the Swiss Indoors last week by beating Romanian qualifier Marius Copil in the final.
"I didn't serve very well [against first-round opponent Filip Kranjovic," Federer said.
I think everyone just thinks because we're tennis players we should be the greatest of friends.
Federer's projected route to the Paris final could also lead to matches against Kevin Anderson and Novak Djokovic.
Federer dominated the 20th-ranked Medvedev and had his first match-point chance to break serve again at 5-1.
Two players, Stefanos Tsitsipas and Kyle Edmund, won their first career ATP titles last week (13:26).
I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players.
Federer's success in Basel last week was the ninth time he has won his hometown tournament.
The 20-time Grand Slam winner is chasing his 99th ATP title at the Swiss Indoors this week an