## Preprocessing

Using one article as an example:

* sentence tokenize
* stop word removal
* Stemming (or lemmatization) (no stemming in spaCy)

In [95]:
import pandas as pd
import en_core_web_sm
import spacy

In [114]:
data = pd.read_csv("ESPN_football.csv")

In [115]:
data.head()

Unnamed: 0,author,class,data-id,sport,teamname,timestamp,url,summary,text,headline
0,Michael DiRocco,story-link,26094253,nfl,buffalo-bills,4h,http://www.espn.com/nfl/story/_/id/26094253/ja...,"JACKSONVILLE, Fla. -- The Jacksonville Jaguars...","JACKSONVILLE, Fla. -- The Jacksonville Jaguars...",Jags GM: Fournette 'in a good spot' after meeting
1,Mike Rodak,story-link,buffalo-bills-32911,nfl,buffalo-bills,2d,http://espn.com/blog/buffalo-bills/post/_/id/3...,The cash-flush Buffalo Bills could be among th...,The cash-flush Buffalo Bills could be among th...,Bills' focus on homegrown talent could temper ...
2,Mike Rodak,story-link,buffalo-bills-32889,nfl,buffalo-bills,6d,http://espn.com/blog/buffalo-bills/post/_/id/3...,The Buffalo Bills will be under pressure over ...,The Buffalo Bills will be under pressure over ...,Why Bills should consider trading sacks leader...
3,"Michael C. Wright, Greg Wyshynski, Mike Rodak ...",story-link,25760086,nfl,buffalo-bills,6d,http://www.espn.com/espn/story/_/id/25760086/b...,93-year-old Pete Anton has spent decades worki...,93-year-old Pete Anton has spent decades worki...,Behind-the-scenes game-day jobs you never knew...
4,ESPN.com,story-link,25998951,nfl,buffalo-bills,9d,http://www.espn.com/nfl/story/_/id/25998951/ho...,The five quarterbacks drafted in the first rou...,The five quarterbacks drafted in the first rou...,How the NFL's worst quarterbacks can improve i...


In [116]:
nlp = en_core_web_sm.load()

In [118]:
# remove stopwords (from NLTK), punctuation, pronouns
# lemmatize and tokenize
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~.'
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "'re", "'ve", "'ll", "'d", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she", 'her', 'hers', 'herself', 'it', "'s", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "'ll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "'t", 'should', "'ve", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# must be run once
nlp.add_pipe(nlp.create_pipe('sentencizer'))
def cleanup_text(docs, logging = False):
    sentences = []
    doc = nlp(docs, disable=['parser', 'ner'])
    for sent in doc.sents:
        tokens = [tok.lemma_.lower().strip() for tok in sent if tok.lemma_ != '-PRON-' and tok.is_stop != True] # removing pronouns, lowering, lemmatizing
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        sentences.append(tokens)
    return sentences
data['clean_text'] = data['text'].apply(lambda x: cleanup_text(x, False))

In [119]:
data.clean_text[0][:20] # it works!
#clean_text contains tokens (sentences) in an array

['jacksonville florida -- jacksonville jaguar pleased leonard fournette begin offseason eager year player respond disappointing -- time concern -- 2018',
 'general manager dave caldwell coach doug marrone praise fournette nfl scout combine wednesday morning',
 'think leonard good spot caldwell say',
 'know lot end season stuff like good phase',
 'work',
 'know take nutrition workout seriously',
 'think good spot come april rest veteran ota',
 'fournette second nfl season nearly rush 1,040 yard touchdown help jaguars win afc south reach afc championship game',
 'instead 2018 miss seven game injury suspend field issue let shape',
 'jaguars need quarterback use tight end line help receiver draft april',
 'big change new england offseason',
 'jet money spend chase pats',
 'season end sour note executive vp football operation tom coughlin publicly criticize fournette inactive foot injury t.j. yeldon sit bench act disinterested season finale',
 'coughlin say disrespectful selfish behavior un

In [120]:
data.head()

Unnamed: 0,author,class,data-id,sport,teamname,timestamp,url,summary,text,headline,clean_text
0,Michael DiRocco,story-link,26094253,nfl,buffalo-bills,4h,http://www.espn.com/nfl/story/_/id/26094253/ja...,"JACKSONVILLE, Fla. -- The Jacksonville Jaguars...","JACKSONVILLE, Fla. -- The Jacksonville Jaguars...",Jags GM: Fournette 'in a good spot' after meeting,[jacksonville florida -- jacksonville jaguar p...
1,Mike Rodak,story-link,buffalo-bills-32911,nfl,buffalo-bills,2d,http://espn.com/blog/buffalo-bills/post/_/id/3...,The cash-flush Buffalo Bills could be among th...,The cash-flush Buffalo Bills could be among th...,Bills' focus on homegrown talent could temper ...,[cash flush buffalo bills active team nfl free...
2,Mike Rodak,story-link,buffalo-bills-32889,nfl,buffalo-bills,6d,http://espn.com/blog/buffalo-bills/post/_/id/3...,The Buffalo Bills will be under pressure over ...,The Buffalo Bills will be under pressure over ...,Why Bills should consider trading sacks leader...,[buffalo bills pressure upcoming month improve...
3,"Michael C. Wright, Greg Wyshynski, Mike Rodak ...",story-link,25760086,nfl,buffalo-bills,6d,http://www.espn.com/espn/story/_/id/25760086/b...,93-year-old Pete Anton has spent decades worki...,93-year-old Pete Anton has spent decades worki...,Behind-the-scenes game-day jobs you never knew...,[93-year old pete anton spend decade work spur...
4,ESPN.com,story-link,25998951,nfl,buffalo-bills,9d,http://www.espn.com/nfl/story/_/id/25998951/ho...,The five quarterbacks drafted in the first rou...,The five quarterbacks drafted in the first rou...,How the NFL's worst quarterbacks can improve i...,[quarterback draft round april learn job seaso...


In [121]:
holder = data.to_csv(r'C:\Users\atenk\Documents\ISM\HeadlineGeneration\ESPN_football.csv', index=False)

In [81]:

sentences = []
sim = []
s1_idx = []
s2_idx = []
doc = nlp(data.text[0], disable=['parser', 'ner'])
for i, s1 in enumerate(doc.sents):
    for j, s2 in enumerate(doc.sents):
        if not i==j:
            s1_idx.append(i)
            s2_idx.append(j)
            sim.append(s1.similarity(s2))

argmax = np.argmax(sim)
print(argmax)
sent1 = s1_idx[argmax]
sent2 = s2_idx[argmax]
print(sent1)
print(sent2)

sent_list = [sent for sent in doc.sents]
print(sent_list[sent1])
print(sent_list[sent2])   

670
18
23
It's unusual for an NFL player to gain weight during the season, and Fournette was unable to do much, if any, conditioning while he was rehabbing his hamstring.
Fournette also was caught on video yelling that he was going to "beat your ass" at an unknown fan in the stands during the team's embarrassing loss to Tennessee on Dec. 6.


In [87]:
doc2 = nlp(data.text_cleaned[0][0])
for sent in doc2.sents:
    print(sent.vector)
    print(len(sent.vector))
    break

[-9.68799219e-02  5.97739935e-01  4.86726850e-01 -3.40180337e-01
  4.49663222e-01  6.89203680e-01 -2.38108015e+00 -3.05285841e-01
  8.49002063e-01  8.81698430e-01  1.79127860e+00 -1.16209102e+00
 -2.37748906e-01 -3.38391870e-01 -1.59728122e+00 -5.19145906e-01
 -1.77076650e+00 -2.02990961e+00 -1.55280602e+00  1.10423803e+00
  1.43855202e+00  4.21564966e-01  1.30673379e-01  4.36388254e-01
 -5.46732008e-01 -9.71688449e-01  2.01277167e-01  1.55828178e+00
 -2.49524787e-01 -5.71048379e-01 -4.39385802e-01  3.53609622e-02
  7.75099158e-01 -9.34234142e-01 -1.04441726e+00 -1.05945516e+00
  1.19754148e+00 -5.17955780e-01  5.82489610e-01  3.12497374e-02
 -1.06598222e+00  5.05846515e-02  1.47063947e+00 -7.94845521e-02
 -5.11507988e-01  4.22536552e-01 -3.58346045e-01 -1.51737526e-01
  1.76340890e+00 -4.07913119e-01  5.23148656e-01 -1.79974055e+00
 -1.78290617e+00  1.85861588e+00  1.47800952e-01 -7.12824702e-01
  1.68336296e+00  1.58062065e+00 -9.73185718e-01  7.56698012e-01
 -6.91946208e-01  5.76156

In [88]:
for sent in doc.sents:
    print(len(sent.vector))
    print(sent.vector)
    break

128
[ 0.44483843  0.49579698  0.7346214   0.9023633   0.31820202  0.13855854
 -0.6983672  -0.40063557  0.5386324  -0.26303607 -0.48844942 -0.18563779
  0.81426364 -0.88031423 -0.39588866 -0.18343551 -0.11829722 -0.85356605
 -1.0576587  -0.28016677  0.54777443 -0.23409879 -0.20293522  0.09141938
 -0.7065784  -0.08647011 -0.10515936  0.44295126 -0.11221298 -0.17450893
 -0.03083334  0.4228335   0.27318192 -0.54521847 -0.05321183 -0.06819107
  0.7187186  -0.3739111  -0.01182292  0.13135903 -0.32934126  0.65420985
  0.9874981  -0.69826764 -0.32230085  0.82641697  0.3210473   0.2543346
  1.5513906   0.3371386  -0.09676136 -0.8319909  -1.4636451   0.19282573
  0.36887926  0.8287858   0.40869045  0.4960904  -0.41985375 -0.19336313
 -0.02820968  0.3888765  -1.0608032  -0.4394413   0.46368346 -0.5940484
  0.07530005  0.16329034 -0.4198597   0.41991535 -0.12283194 -1.6084993
 -0.26781774  0.3467377  -1.1215436  -0.34200734 -0.4256215  -0.13661967
  0.18586452 -0.09254345 -0.62557065 -0.4221979  -

In [78]:
for idx, sent in enumerate(sents):
    print(idx)
    

In [58]:
import numpy as np

In [53]:
sentences = data.text_cleaned[0]

s1 = sentences[0]
s2 = sentences[1]

for s1 in sentences:
    for s2 in sentences:
        print(s1.text[:3], s2.text[:3],s1.similarity(s2))

AttributeError: 'str' object has no attribute 'text'