## Algorithm

* Convert every sentence to a vector
* Compare the sentences with one another

In [1]:
import pandas as pd
import en_core_web_sm
import numpy as np
from utils import load_espn, save_espn
# Location of the scraped ESPN articles, relative to the notebook's working directory.
path = r'data/ESPN_football.csv'
# The call was cut off mid-argument (`load_espn(pat`), which is a syntax error;
# pass the path defined on the line above.
df = load_espn(path)

In [2]:
# Preview the first five loaded rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,author,class,data-id,sport,teamname,timestamp,url,summary,headline,rawText,text,cleanText
0,Mike Rodak,story-link,buffalo-bills-33083,nfl,buffalo-bills,3d,http://espn.com/blog/buffalo-bills/post/_/id/3...,"Andre Reed, who ranks seventh in NFL history w...",Andre Reed has acting bug after cameos in 'Mac...,"Andre Reed, who ranks seventh in NFL history w...","[Andre Reed, who ranks seventh in NFL history ...",[andre reed rank seventh nfl history 85 postse...
1,Todd McShay,story-link,26489910,nfl,buffalo-bills,4d,http://insider.espn.com/nfl/draft2019/insider/...,Todd McShay hears that the Giants might not be...,Todd McShay's top five 2019 NFL draft needs fo...,Todd McShay hears that the Giants might not be...,[Todd McShay hears that the Giants might not b...,[todd mcshay hear giants love dwayne haskins p...
2,ESPN.com,story-link,26473482,nfl,buffalo-bills,6d,http://www.espn.com/nfl/draft2019/story/_/id/2...,"The NFL draft annually brings back memories, a...",NFL draft do-overs: Let's re-pick ... and fix ...,"The NFL draft annually brings back memories, a...","[The NFL draft annually brings back memories, ...",[nfl draft annually bring memory memory exactl...
3,T.J. Berka,story-link,24367000,nfl,buffalo-bills,7d,http://www.espn.com/nhl/story/_/id/24367000/th...,Since we updated the sports misery index in De...,Sports misery index: Most miserable fan bases ...,Since we updated the sports misery index in De...,[Since we updated the sports misery index in D...,[since update sport misery index december foot...
4,Mike Rodak,story-link,26426098,nfl,buffalo-bills,13d,http://www.espn.com/nfl/story/_/id/26426098/in...,Former NFL offensive lineman Richie Incognito ...,Incognito pleads guilty to disorderly conduct,Former NFL offensive lineman Richie Incognito ...,[Former NFL offensive lineman Richie Incognito...,[former nfl offensive lineman richie incognito...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def get_centroid(X):
    """Return the centroid (column-wise mean) of the row vectors in ``X``.

    Parameters
    ----------
    X : scipy.sparse matrix/array or array-like, shape (n_rows, n_features)
        One vector per row.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        Sum of the rows divided by the number of rows.
    """
    # `.A` only exists on scipy.sparse *matrices* (and is absent from newer
    # SciPy releases); `toarray()` works for every sparse container, and
    # np.asarray lets callers pass an already-dense array as well.
    dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    return np.sum(dense, axis=0) / dense.shape[0]

def _get_sentence_vectors(clean_text, norm):
    """Vectorise the sentences of one article as a bag-of-words matrix.

    Parameters
    ----------
    clean_text : iterable of str
        The documents handed to ``fit_transform``; callers in this notebook
        pass one article's ``cleanText`` entry (presumably one already-cleaned
        sentence per item -- confirm against ``load_espn``).
    norm : bool
        Falsy -> raw term counts (CountVectorizer).
        Truthy -> term counts scaled so each row has unit L1 norm
        (TfidfVectorizer with IDF weighting switched off).

    Returns
    -------
    The matrix returned by the vectorizer's ``fit_transform``: one row per
    input item, one column per vocabulary term.
    """
    # Options common to both vectorizers: keep case as-is, no extra
    # preprocessing, and treat runs of two or more word characters or
    # hyphens as tokens.
    shared_options = {
        'lowercase': False,
        'preprocessor': None,
        'token_pattern': r'(?u)\b[-\w][-\w]+\b',
    }

    if norm:
        model = TfidfVectorizer(use_idf=False, norm='l1', **shared_options)
    else:
        model = CountVectorizer(**shared_options)

    return model.fit_transform(clean_text)

def _get_knn_result(X, similarity_metric):
    """Return the row index in ``X`` of the vector closest to the centroid.

    Parameters
    ----------
    X : matrix of row vectors, as produced by ``_get_sentence_vectors``.
    similarity_metric : str
        Passed straight through as ``metric`` to ``NearestNeighbors``.

    Returns
    -------
    Index of the single nearest row to ``get_centroid(X)``.
    """
    query_point = get_centroid(X)

    searcher = NearestNeighbors(n_neighbors=1, metric=similarity_metric)
    searcher.fit(X)

    # kneighbors returns (distances, indices), each shaped
    # (n_queries, n_neighbors); only the index of the one hit is needed.
    _, neighbor_ids = searcher.kneighbors([query_point])
    return neighbor_ids[0][0]

def unnormal_cos(clean_text):
    """Index of the sentence nearest the centroid: raw counts, cosine distance."""
    return _get_knn_result(
        _get_sentence_vectors(clean_text, norm=False),
        similarity_metric='cosine',
    )

def normal_cos(clean_text):
    """Index of the sentence nearest the centroid: L1-normalised counts, cosine distance."""
    return _get_knn_result(
        _get_sentence_vectors(clean_text, norm=True),
        similarity_metric='cosine',
    )

def normal_euclid(clean_text):
    """Index of the sentence nearest the centroid: L1-normalised counts, Euclidean distance.

    Parameters
    ----------
    clean_text : iterable of str
        Sentences of one article, forwarded to ``_get_sentence_vectors``.

    Returns
    -------
    Row index of the sentence vector closest to the centroid.
    """
    X = _get_sentence_vectors(clean_text, norm=True)
    # scikit-learn spells this metric 'euclidean'; the former 'euclid' is not
    # among the metric names NearestNeighbors accepts.
    return _get_knn_result(X, similarity_metric='euclidean')
    


In [4]:
# Spot check on the article at index label 0: raw counts + cosine distance.
unnormal_cos(df['cleanText'][0])

5

In [6]:
# Spot check on the article at index label 0: L1-normalised counts + cosine distance.
normal_cos(df['cleanText'][0])

14

In [8]:
# Spot check on the article at index label 0: L1-normalised counts, via normal_euclid.
normal_euclid(df['cleanText'][0])

5

In [9]:
def new_centroid_knn(clean_text):
    """Pick a sentence using a centroid that leaves out the first-round winner.

    Steps:
      1. Vectorise the sentences (L1-normalised counts).
      2. Find the sentence nearest the centroid of all sentences.
      3. Recompute the centroid without that sentence.
      4. Return the index of the sentence (searching the full set again)
         nearest the new centroid.

    Parameters
    ----------
    clean_text : iterable of str
        Sentences of one article, forwarded to ``_get_sentence_vectors``.

    Returns
    -------
    Row index of the chosen sentence. With fewer than two sentences there is
    nothing left to average after removing the winner, so the first-round
    index is returned unchanged.
    """
    X = _get_sentence_vectors(clean_text, norm=True)

    # 'euclidean' is the name scikit-learn accepts (the second search below
    # already used it); the former 'euclid' is not a valid metric.
    winning_sent_index = _get_knn_result(X, similarity_metric='euclidean')

    # toarray() instead of the `.A` shorthand, which newer SciPy no longer offers.
    x_array = X.toarray()
    if x_array.shape[0] < 2:
        # Removing the only sentence would leave an empty array and a 0/0 centroid.
        return winning_sent_index

    x_array = np.delete(x_array, winning_sent_index, axis=0)
    # Mean of the remaining sentence vectors, i.e. the centroid WITHOUT the winner.
    new_centroid = x_array.sum(axis=0) / x_array.shape[0]

    nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    nn.fit(X)  # still fitting the full set of sentences, winner included
    _, index = nn.kneighbors([new_centroid])
    return index[0][0]

In [10]:
# Spot check on the article at index label 0: centroid recomputed without the first winner.
new_centroid_knn(df['cleanText'][0])

5

In [11]:
 # I believe this is closer to what it should be
def uc(clean_text):
    """Alias for ``unnormal_cos``: index of the chosen sentence for one article."""
    chosen_index = unnormal_cos(clean_text)
    return chosen_index

# Series.apply calls uc once per entry of the cleanText column and stores
# the returned indices as a new column.
df['unnormal_cosine'] = df['cleanText'].apply(uc)

In [12]:
def nc(df):
    """Return a copy of one row with its 'normal_cosine' value filled in.

    Despite the parameter name, ``df`` is the single row that
    ``DataFrame.apply(..., axis=1)`` passes in, not the whole frame.
    """
    # pandas does not support functions that mutate the object apply() hands
    # them, so add the new field to a copy of the row instead.
    row = df.copy()
    row['normal_cosine'] = normal_cos(row['cleanText'])
    return row

# Row-wise apply: the returned rows are reassembled into the new frame.
df = df.apply(nc, axis=1)

In [13]:
def ne(df):
    """Return a copy of one row with its 'normal_euclid' value filled in.

    Despite the parameter name, ``df`` is the single row that
    ``DataFrame.apply(..., axis=1)`` passes in, not the whole frame.
    """
    # pandas does not support functions that mutate the object apply() hands
    # them, so add the new field to a copy of the row instead.
    row = df.copy()
    row['normal_euclid'] = normal_euclid(row['cleanText'])
    return row

# Row-wise apply: the returned rows are reassembled into the new frame.
df = df.apply(ne, axis=1)

In [14]:
def ncknn(df):
    """Return a copy of one row with its 'new_centroid_compare' value filled in.

    Despite the parameter name, ``df`` is the single row that
    ``DataFrame.apply(..., axis=1)`` passes in, not the whole frame.
    """
    # pandas does not support functions that mutate the object apply() hands
    # them, so add the new field to a copy of the row instead.
    row = df.copy()
    row['new_centroid_compare'] = new_centroid_knn(row['cleanText'])
    return row

# Row-wise apply: the returned rows are reassembled into the new frame.
df = df.apply(ncknn, axis=1)

In [16]:
# Hand the frame (with the columns added in the cells above) to save_espn.
# NOTE: this is the same path literal assigned to `path` in the first cell,
# so the save presumably overwrites the input file -- confirm that is intended.
save_espn(df, r'data/ESPN_football.csv')