In [5]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to /Users/anqitu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Data

In [6]:
# Read the article corpus into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("corpus/tennis_articles.csv")

In [7]:
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [9]:
df['article_text'][0]

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same 

In [14]:
from nltk.tokenize import sent_tokenize
# Split every article into sentences and flatten them into one list,
# keeping the articles (and the sentences inside each) in their original order.
sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]

In [15]:
sentences

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl.",
 "I say my hellos, but I'm not sending any players flowers as well.",
 "Uhm, I'm not really friendly or close to many players.",
 "I have not a lot of friends away from the courts.'",
 'When she said she is not really close to a lot of players, is that something strategic that she is doing?',
 "Is it different on the men's tour than the women's tour?",
 "'No, not at all.

## Preprocess Data

In [21]:
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop words used by `is_ok` to drop non-content tokens.
# Stored as a set because the only use is membership testing, once per token.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    # The notebook only downloads 'punkt' up front; on a fresh environment the
    # stopwords corpus is missing, so fetch it once and retry.
    nltk.download('stopwords')
    stop_words = set(stopwords.words("english"))
def is_ok(token):
    """Decide whether `token` should be kept as a content word.

    A token is kept only if it consists solely of lowercase ASCII letters
    (``a``-``z``) and is not contained in the module-level ``stop_words``.

    Args:
        token: A single token string.

    Returns:
        bool: True when the token passes both checks, False otherwise.
    """
    # bool() keeps the return type consistent: the bare `and` expression used to
    # return None (not False) for tokens that failed the pattern.
    return bool(re.match('^[a-z]+$', token)) and token not in stop_words
def clean_sentence(sentence):
    """Lowercase `sentence`, tokenize it, and return the tokens accepted by `is_ok`.

    Args:
        sentence: The raw sentence string.

    Returns:
        list: The kept tokens, in their original order.
    """
    tokens = word_tokenize(sentence.lower())
    return [token for token in tokens if is_ok(token)]

In [36]:
clean_sentence(sentences[0])

['maria',
 'sharapova',
 'basically',
 'friends',
 'tennis',
 'players',
 'wta',
 'tour']

In [37]:
# Clean every sentence; the result has one token list per entry of `sentences`,
# in the same order, so indices line up between the two lists.
clean_sentences = [clean_sentence(raw_sentence) for raw_sentence in sentences]

In [24]:
from gensim.models import KeyedVectors
# File holding the pretrained word vectors; it is read from the notebook's
# working directory and is not downloaded anywhere in this notebook.
# NOTE(review): presumably the 300-dimensional Google News word2vec model,
# judging by the file name and the 300-long zero vectors used below — confirm.
filename = "GoogleNews-vectors-negative300.bin"
# binary=True because the vectors are stored in the binary (.bin) format.
embedding = KeyedVectors.load_word2vec_format(filename, binary=True)

In [38]:
embedding['man']
np.zeros((300,))

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [45]:
def get_word_vector(word):
    """Look up the embedding vector for `word`, falling back to zeros.

    Args:
        word: Token to look up in the module-level `embedding` model.

    Returns:
        The vector stored in `embedding` for `word`, or an all-zero NumPy
        array of length `embedding.vector_size` when the word is not in the
        model's vocabulary.
    """
    try:
        return embedding[word]
    except KeyError:
        # Only an out-of-vocabulary lookup is an expected failure; a bare
        # `except:` would also hide typos, interrupts, and a missing model.
        # The fallback length comes from the model so it always matches the
        # real vectors (300 for the model loaded in this notebook).
        return np.zeros((embedding.vector_size,))

In [49]:
# get_word_vector('man')

In [55]:
def get_sentence_vector(sentence):
    """Average the word vectors of a tokenized sentence.

    Args:
        sentence: A sequence of tokens; each one is looked up with
            `get_word_vector`.

    Returns:
        The element-wise mean of the tokens' vectors, or a zero vector of
        length 300 when `sentence` is empty.
    """
    token_count = len(sentence)
    # Guard first: an empty sentence has nothing to average.
    if token_count == 0:
        return np.zeros((300,))
    vector_total = sum(get_word_vector(token) for token in sentence)
    return vector_total / token_count

In [54]:
# get_sentence_vector(sentences[0])

In [57]:
# One averaged embedding per cleaned sentence, in the same order as `clean_sentences`.
sentence_vectors = [get_sentence_vector(tokens) for tokens in clean_sentences]

In [58]:
sentence_vectors

[array([-0.02557373, -0.0333252 ,  0.03839874,  0.09300995, -0.00415421,
         0.01013184, -0.03755188, -0.15591431,  0.04277039,  0.03239441,
         0.06240845, -0.10488129, -0.00135326, -0.03339767, -0.01351929,
         0.04473114,  0.09985352,  0.15835571,  0.02529526, -0.00785065,
        -0.00621414,  0.1340332 , -0.05952454, -0.05495834,  0.00450325,
        -0.0736084 , -0.08880615,  0.11761475,  0.0891304 , -0.02322388,
         0.02075195,  0.05579162, -0.02895451, -0.02131653, -0.03224182,
        -0.05862045,  0.04273415, -0.04759979,  0.08563232,  0.03982544,
         0.06498718, -0.06969261,  0.09661865, -0.00241661,  0.04862976,
        -0.1085968 ,  0.05037689, -0.12081909, -0.03590393,  0.12060547,
         0.01776123,  0.09869385, -0.01251221,  0.04114532,  0.03023529,
        -0.02038574, -0.07728577, -0.03573608, -0.02867126, -0.11108398,
        -0.11520195,  0.07748413, -0.06407166, -0.02647018, -0.041327  ,
         0.05786133,  0.04084015,  0.01022339,  0.0

## Similarity Matrix Preparation

In [64]:
from sklearn.metrics.pairwise import cosine_similarity

In [66]:
# sentence_vectors[0].reshape(1,-1)

In [69]:
cosine_similarity(sentence_vectors[0].reshape(1,-1),sentence_vectors[1].reshape(1,-1))

array([[0.47360238]])

In [73]:
def cal_cos_sim(sentence_vector1, sentence_vector2):
    """Return the cosine similarity of two vectors as a scalar.

    Each input is reshaped to a single-row 2-D array before being passed to
    `cosine_similarity`, and the only entry of the resulting matrix is returned.
    """
    row_a = sentence_vector1.reshape(1, -1)
    row_b = sentence_vector2.reshape(1, -1)
    similarity = cosine_similarity(row_a, row_b)
    return similarity[0][0]

In [74]:
cal_cos_sim(sentence_vectors[0], sentence_vectors[1])

0.4736023752140091

In [67]:
sim_mat = np.zeros([len(sentences), len(sentences)])

In [75]:
# Fill the sentence-to-sentence similarity matrix.
# One vectorized call over the stacked sentence vectors replaces n*n separate
# 1x1 `cosine_similarity` calls (which also computed every symmetric pair twice).
# With no sentences there is nothing to stack, so the empty `sim_mat` created
# earlier is left as is.
if len(sentence_vectors) > 0:
    sim_mat = cosine_similarity(np.array(sentence_vectors))
    # A sentence is not compared with itself: keep the diagonal at 0, exactly
    # as the original `i != j` check did.
    np.fill_diagonal(sim_mat, 0.0)

## Applying PageRank Algorithm

In [76]:
import networkx as nx

# Build a graph from the similarity matrix: one node per sentence, with the
# matrix entries used as the connections between sentences.
# NOTE(review): cosine similarities can be negative; confirm PageRank behaves
# as intended with negative edge weights, or clip them before this step.
nx_graph = nx.from_numpy_array(sim_mat)
# Rank the nodes with PageRank; `scores` is later indexed by sentence position.
scores = nx.pagerank(nx_graph)

In [79]:
# scores

In [84]:
# Pair each sentence with its PageRank score (score first, then the text).
sentence_scores = [(scores[position], text) for position, text in enumerate(sentences)]
sentence_scores

[(0.008430560782176615,
  'Maria Sharapova has basically no friends as tennis players on the WTA Tour.'),
 (0.008277735614673834,
  "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much."),
 (0.008091799925889454, 'I think everyone knows this is my job here.'),
 (0.010516167119469877,
  "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match."),
 (0.007364453945133579, "I'm a pretty competitive girl."),
 (0.007225629774569331,
  "I say my hellos, but I'm not sending any players flowers as well."),
 (0.009313029709688175,
  "Uhm, I'm not really friendly or close to many players."),
 (0.00750861092265802, "I have not a lot of friends away from the 

In [87]:
# Order the (score, sentence) pairs from highest to lowest; because whole tuples
# are compared, equal scores are ordered by the sentence text.
ranked_sentences = list(sentence_scores)
ranked_sentences.sort(reverse=True)
ranked_sentences

[(0.010516167119469877,
  "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match."),
 (0.010262652786438639,
  'Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.'),
 (0.010141756916913726,
  'Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.'),
 (0.010077819273090738,
  '"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.'),
 (0.

In [88]:
# Print the highest-ranked sentences as the extractive summary.
# Slicing, rather than indexing through range(10), avoids an IndexError when the
# corpus yields fewer than TOP_N sentences.
TOP_N = 10
for _score, top_sentence in ranked_sentences[:TOP_N]:
    print(top_sentence)

When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.
Kei Nishikori will try to end his long losing streak in ATP finals and Kevin Anderson will go for his second title of the ye