## Algorithm

* Convert every sentence to a vector
* Compare every pair of sentences with each other

In [1]:
import pandas as pd
import en_core_web_sm
import numpy as np
# Load the small English spaCy model and append a sentencizer component, so
# that sentence boundaries are still available when the parser is disabled.
nlp = en_core_web_sm.load()
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Article table read from the CSV that sits next to the notebook.
data = pd.read_csv("ESPN_football.csv")

In [2]:
# Locate the pair of distinct sentences whose spaCy similarity score is highest.
sentences = []

# Run the pipeline on the regular (not cleaned) text of the row labelled 0;
# the parser and NER components are switched off for this call.
doc = nlp(data.text[0], disable=['parser', 'ner'])
sent_list = list(doc.sents)

# Score every ordered pair (i, j) with i != j. The two index lists stay
# parallel to the score list so one argmax recovers the best pair.
s1_idx, s2_idx, sim = [], [], []
for i, s1 in enumerate(sent_list):
    for j, s2 in enumerate(sent_list):
        if i == j:
            continue
        s1_idx.append(i)
        s2_idx.append(j)
        sim.append(s1.similarity(s2))

# Position of the best score, then the sentence indices stored at that position.
argmax = np.argmax(sim)
sent1, sent2 = s1_idx[argmax], s2_idx[argmax]
print(argmax, sent1, sent2, sep="\n")

# Show the two winning sentences themselves.
print(sent_list[sent1], sent_list[sent2], sep="\n")

670
18
23
It's unusual for an NFL player to gain weight during the season, and Fournette was unable to do much, if any, conditioning while he was rehabbing his hamstring.
Fournette also was caught on video yelling that he was going to "beat your ass" at an unknown fan in the stands during the team's embarrassing loss to Tennessee on Dec. 6.


In [42]:
# Preview the first rows of the article table.
data.head()

Unnamed: 0,author,class,data-id,sport,teamname,timestamp,url,summary,text,headline,clean_text,unnormal_cosine,normal_cosine,normal_euclid,new_centroid_compare
0,Michael DiRocco,story-link,26094253,nfl,buffalo-bills,4h,http://www.espn.com/nfl/story/_/id/26094253/ja...,"JACKSONVILLE, Fla. -- The Jacksonville Jaguars...","JACKSONVILLE, Fla. -- The Jacksonville Jaguars...",Jags GM: Fournette 'in a good spot' after meeting,['jacksonville florida -- jacksonville jaguar ...,marrone fournette clear air meeting season end...,work,season end sour note executive vp football ope...,season end sour note executive vp football ope...
1,Mike Rodak,story-link,buffalo-bills-32911,nfl,buffalo-bills,2d,http://espn.com/blog/buffalo-bills/post/_/id/3...,The cash-flush Buffalo Bills could be among th...,The cash-flush Buffalo Bills could be among th...,Bills' focus on homegrown talent could temper ...,['cash flush buffalo bills active team nfl fre...,white bills 2017 round pick cornerback eligibl...,white bills 2017 round pick cornerback eligibl...,falcons quinn good defensive mind • rams want ...,falcons quinn good defensive mind • rams want ...
2,Mike Rodak,story-link,buffalo-bills-32889,nfl,buffalo-bills,6d,http://espn.com/blog/buffalo-bills/post/_/id/3...,The Buffalo Bills will be under pressure over ...,The Buffalo Bills will be under pressure over ...,Why Bills should consider trading sacks leader...,['buffalo bills pressure upcoming month improv...,let hughes leave free agency 2019 season bills...,let hughes leave free agency 2019 season bills...,hughes contract situation leave bills multiple...,hughes contract situation leave bills multiple...
3,"Michael C. Wright, Greg Wyshynski, Mike Rodak ...",story-link,25760086,nfl,buffalo-bills,6d,http://www.espn.com/espn/story/_/id/25760086/b...,93-year-old Pete Anton has spent decades worki...,93-year-old Pete Anton has spent decades worki...,Behind-the-scenes game-day jobs you never knew...,['93-year old pete anton spend decade work spu...,hour work game say --,hour work game say --,,
4,ESPN.com,story-link,25998951,nfl,buffalo-bills,9d,http://www.espn.com/nfl/story/_/id/25998951/ho...,The five quarterbacks drafted in the first rou...,The five quarterbacks drafted in the first rou...,How the NFL's worst quarterbacks can improve i...,['quarterback draft round april learn job seas...,jet coach adam gase experience eight season 20...,improve 2019 start,experience five season 2019 salary 20 million ...,experience five season 2019 salary 20 million ...


In [29]:
def unnormal_cos(clean_text):
    """Pick the sentence nearest (cosine distance) to the centroid of raw counts.

    Each sentence is turned into an un-normalised term-count vector, the
    vectors are averaged into a centroid, and the sentence whose vector is
    closest to that centroid is returned.

    Parameters
    ----------
    clean_text : str
        String form of a Python list of sentence strings,
        e.g. ``"['first sentence', 'second sentence']"``.

    Returns
    -------
    str
        The sentence from the parsed list that is nearest to the centroid.

    Raises
    ------
    ValueError, SyntaxError
        If ``clean_text`` is not a valid Python literal.
    """
    import ast
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.neighbors import NearestNeighbors

    # The text comes from a CSV file, so it is parsed with literal_eval rather
    # than eval: only literals are accepted and no code can be executed.
    example = ast.literal_eval(clean_text)

    vectorizer = CountVectorizer(lowercase=False,
                                 preprocessor=None,
                                 token_pattern=r'(?u)\b[-\w][-\w]+\b')
    X = vectorizer.fit_transform(example)

    # toarray() is the explicit spelling of the `.A` shorthand.
    centroid = X.toarray().mean(axis=0)

    nn = NearestNeighbors(n_neighbors=1, metric='cosine')
    nn.fit(X)
    _, index = nn.kneighbors([centroid])
    return example[index[0][0]]

In [45]:
def normal_cos(clean_text):
    """Pick the sentence nearest (cosine distance) to the centroid of L1-normalised term frequencies.

    Each sentence becomes a term-frequency vector scaled to unit L1 norm
    (``use_idf=False``, ``norm='l1'``); the vectors are averaged into a
    centroid and the sentence closest to it is returned.

    Parameters
    ----------
    clean_text : str
        String form of a Python list of sentence strings,
        e.g. ``"['first sentence', 'second sentence']"``.

    Returns
    -------
    str
        The sentence from the parsed list that is nearest to the centroid.

    Raises
    ------
    ValueError, SyntaxError
        If ``clean_text`` is not a valid Python literal.
    """
    import ast
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.neighbors import NearestNeighbors

    # The text comes from a CSV file, so it is parsed with literal_eval rather
    # than eval: only literals are accepted and no code can be executed.
    example = ast.literal_eval(clean_text)

    vectorizer = TfidfVectorizer(lowercase=False,
                                 use_idf=False,
                                 norm='l1',
                                 preprocessor=None,
                                 token_pattern=r'(?u)\b[-\w][-\w]+\b')
    X = vectorizer.fit_transform(example)

    # toarray() is the explicit spelling of the `.A` shorthand.
    centroid = X.toarray().mean(axis=0)

    nn = NearestNeighbors(n_neighbors=1, metric='cosine')
    nn.fit(X)
    _, index = nn.kneighbors([centroid])
    return example[index[0][0]]

In [46]:
def normal_euclid(clean_text):
    """Pick the sentence nearest (Euclidean distance) to the centroid of L1-normalised term frequencies.

    Each sentence becomes a term-frequency vector scaled to unit L1 norm
    (``use_idf=False``, ``norm='l1'``); the vectors are averaged into a
    centroid and the sentence closest to it is returned.

    Parameters
    ----------
    clean_text : str
        String form of a Python list of sentence strings,
        e.g. ``"['first sentence', 'second sentence']"``.

    Returns
    -------
    str
        The sentence from the parsed list that is nearest to the centroid.

    Raises
    ------
    ValueError, SyntaxError
        If ``clean_text`` is not a valid Python literal.
    """
    import ast
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.neighbors import NearestNeighbors

    # The text comes from a CSV file, so it is parsed with literal_eval rather
    # than eval: only literals are accepted and no code can be executed.
    example = ast.literal_eval(clean_text)

    vectorizer = TfidfVectorizer(lowercase=False,
                                 use_idf=False,
                                 norm='l1',
                                 preprocessor=None,
                                 token_pattern=r'(?u)\b[-\w][-\w]+\b')
    X = vectorizer.fit_transform(example)

    # toarray() is the explicit spelling of the `.A` shorthand.
    centroid = X.toarray().mean(axis=0)

    nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    nn.fit(X)
    _, index = nn.kneighbors([centroid])
    return example[index[0][0]]

In [50]:
# Try the raw-count / cosine selector on the cleaned text of the row labelled 4.
unnormal_cos(data.clean_text[4])

'jet coach adam gase experience eight season 2019 salary 18 million big issue 2018 last season time keenum enter unquestioned starter team'

In [51]:
# Try the L1-normalised / cosine selector on the cleaned text of the row labelled 4.
normal_cos(data.clean_text[4])

'improve 2019 start'

In [64]:
# Try the L1-normalised / Euclidean selector on the cleaned text of the row labelled 4.
normal_euclid(data.clean_text[4])

'experience five season 2019 salary 20 million big issue 2018 choose leaky offensive line see carr sack career high 51 time take 36 sack combine 2016 2017 speedy playmaker receiver inconsistent amari cooper trade october 22 steep learning curve jon gruden future offense dink dunk bomb'

In [53]:
def new_centroid_knn(clean_text):
    """Pick the sentence nearest to a centroid recomputed without the first winner.

    Sentences become term-frequency vectors scaled to unit L1 norm
    (``use_idf=False``, ``norm='l1'``). The sentence nearest (Euclidean) to
    the mean vector is found first; the mean is then recomputed with that
    sentence left out, and the sentence nearest to this second centroid is
    returned.

    Parameters
    ----------
    clean_text : str
        String form of a Python list of sentence strings,
        e.g. ``"['first sentence', 'second sentence']"``.

    Returns
    -------
    str
        The sentence from the parsed list nearest to the leave-one-out
        centroid. With a single sentence, that sentence is returned.

    Raises
    ------
    ValueError, SyntaxError
        If ``clean_text`` is not a valid Python literal.
    """
    import ast
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.neighbors import NearestNeighbors

    # The text comes from a CSV file, so it is parsed with literal_eval rather
    # than eval: only literals are accepted and no code can be executed.
    example = ast.literal_eval(clean_text)

    vectorizer = TfidfVectorizer(lowercase=False,
                                 use_idf=False,
                                 norm='l1',
                                 preprocessor=None,
                                 token_pattern=r'(?u)\b[-\w][-\w]+\b')
    X = vectorizer.fit_transform(example)
    # toarray() is the explicit spelling of the `.A` shorthand.
    x_array = X.toarray()

    # With one sentence, removing the winner leaves no rows, so the second
    # centroid would be 0/0 (NaN) and the neighbour query would fail.
    if x_array.shape[0] == 1:
        return example[0]

    nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
    nn.fit(X)
    _, index = nn.kneighbors([x_array.mean(axis=0)])
    winning_sent_index = index[0][0]

    # Calculate the new centroid WITHOUT the winning sentence (use the mean).
    remaining = np.delete(x_array, [winning_sent_index], axis=0)
    new_centroid = remaining.mean(axis=0)

    # The index is fitted on all sentences, so the same fitted object serves
    # both queries and returned positions map directly into `example`.
    # NOTE(review): the first winner is therefore still a candidate in this
    # second search - presumably intended as a comparison; confirm.
    _, index = nn.kneighbors([new_centroid])
    return example[index[0][0]]

In [65]:
# Try the leave-one-out centroid selector on the cleaned text of the row labelled 4.
new_centroid_knn(data.clean_text[4])

'experience five season 2019 salary 20 million big issue 2018 choose leaky offensive line see carr sack career high 51 time take 36 sack combine 2016 2017 speedy playmaker receiver inconsistent amari cooper trade october 22 steep learning curve jon gruden future offense dink dunk bomb'

In [90]:
def uc(df):
    """Return a copy of one row with 'unnormal_cosine' filled in.

    Despite its name, ``df`` is a single row (a Series) handed over by
    ``DataFrame.apply(..., axis=1)``. The row is copied before writing
    because functions passed to ``apply`` should not mutate their argument.
    """
    row = df.copy()
    row['unnormal_cosine'] = unnormal_cos(row['clean_text'])
    return row
data = data.apply(uc, axis=1)

In [91]:
def nc(df):
    """Return a copy of one row with 'normal_cosine' filled in.

    Despite its name, ``df`` is a single row (a Series) handed over by
    ``DataFrame.apply(..., axis=1)``. The row is copied before writing
    because functions passed to ``apply`` should not mutate their argument.
    """
    row = df.copy()
    row['normal_cosine'] = normal_cos(row['clean_text'])
    return row
data = data.apply(nc, axis=1)

In [92]:
def ne(df):
    """Return a copy of one row with 'normal_euclid' filled in.

    Despite its name, ``df`` is a single row (a Series) handed over by
    ``DataFrame.apply(..., axis=1)``. The row is copied before writing
    because functions passed to ``apply`` should not mutate their argument.
    """
    row = df.copy()
    row['normal_euclid'] = normal_euclid(row['clean_text'])
    return row
data = data.apply(ne, axis=1)

In [94]:
# NOTE(review): this re-binds the name `nc`, which an earlier cell already uses
# for the 'normal_cosine' helper; give one of them a distinct name if both
# need to coexist.
def nc(df):
    """Return a copy of one row with 'new_centroid_compare' filled in.

    Despite its name, ``df`` is a single row (a Series) handed over by
    ``DataFrame.apply(..., axis=1)``. The row is copied before writing
    because functions passed to ``apply`` should not mutate their argument.
    """
    row = df.copy()
    row['new_centroid_compare'] = new_centroid_knn(row['clean_text'])
    return row
data = data.apply(nc, axis = 1)

In [97]:
# Write the table, with the columns computed in this notebook, to CSV without the index.
# NOTE(review): absolute, user-specific Windows path. The input was read from the
# relative "ESPN_football.csv" - presumably the same file; confirm before making
# this path relative or configurable.
data.to_csv(r'C:\Users\atenk\Documents\ISM\HeadlineGeneration\ESPN_football.csv', index=False)