# EDA Notebook

In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter
from textblob import TextBlob, Word
import nltk
from nltk.corpus import stopwords
import string
import spacy

In [2]:
%matplotlib inline

In [3]:
articles_list = pd.read_csv('nyt_article_list.csv')

In [4]:
articles_list['title'] = articles_list.article_urls.map(lambda x : x.split('/')[-1].split('.')[0])

In [5]:
articles_list.head()

Unnamed: 0,published_date,article_urls,article_summary,article_headline,title
0,2018-06-09T17:31:27+0000,https://www.nytimes.com/2018/06/09/sports/nba-...,"Accused of making the sport uncompetitive, the...",The Warriors Were Dominant. But How Dominant?,nba-finals-sweep
1,2018-06-09T01:26:37+0000,https://www.nytimes.com/2018/06/08/movies/kyri...,"In his most extensive comments to date, the Bo...",Kyrie Irving Doesn’t Know if the Earth Is Roun...,kyrie-irving-nba-celtics-earth
2,2018-06-08T22:00:03+0000,https://www.nytimes.com/2018/06/08/sports/nba-...,Kevin Durant was named the finals’ M.V.P. agai...,"Warriors, in Full Dynasty Mode, Sweep Cavalier...",nba-finals-warriors-cavs
3,2018-06-08T17:33:17+0000,https://www.nytimes.com/2018/06/08/sports/lebr...,"If James wants to beat the Warriors, he may ne...","LeBron James Reveals an Injury, but His Destin...",lebron-james-free-agency
4,2018-06-08T13:16:38+0000,https://www.nytimes.com/2018/06/08/sports/game...,The animated show combining elements of “Game ...,The ‘Game of Zones’ Guys Knew You Wanted a Bry...,game-of-zones


In [6]:
dir_list = os.listdir('articles')
dir_list_split = [name.split('_')[0] for name in dir_list]

Check data frame has same size as directory of articles:

In [7]:
assert len(articles_list) == len(dir_list_split)

Total number of articles:

In [15]:
print(len(articles_list))

320


Sample article:

In [10]:
with open('articles/'+np.random.choice(dir_list)) as f:
    print(f.read()[:1000],'...')

RENO, Nev. — The Houston Rockets, who have seldom been accused of playing the most attractive brand of basketball this season, were running their usual one-on-one isolations against the Golden State Warriors on Thursday night, and Mariah Musselman, age 8, took it upon herself to count the number of times that James Harden dribbled during one possession.
“Eighteen, 19, 20,” she said as Harden’s teammates cleared the space around him and the shot clock ticked away.
She seemed both amazed and annoyed. Eric Musselman, who is Mariah’s father, is the men’s basketball coach at the University of Nevada, and he made it clear that he considers himself philosophically aligned with his daughter when it comes to the proper (or at least most pleasing) way to play the game. He appreciates passing. He does not like prolonged dribbling.
But as Game 5 of the Western Conference finals wore on, Musselman reclined on his couch and considered the peculiar way that the Rockets go about their business, for be

# Examining Corpus as a whole:

In [75]:
corpus = ''

for article in dir_list:
    with open(f'articles/{article}') as f:
        corpus += f.read()
        corpus += '\n'

corpus = corpus.replace('’','').replace('”','').replace('“','').replace('—','')

Approximate number of words per article (assuming 6 characters per word):

In [76]:
len(corpus)/(len(dir_list)*6)

797.4786458333333

In [77]:
corpus_blob = TextBlob(corpus)

Word Frequencies:

In [78]:
corpus_words = corpus_blob.words
corpus_words = [word.lower() for word in corpus_words if word.lower() not in stopwords.words('english')\
                                                         and word not in string.punctuation]
c = Counter(corpus_words)

In [79]:
c.most_common(500)

[('said', 1583),
 ('game', 1358),
 ('points', 962),
 ('team', 933),
 ('season', 844),
 ('n.b.a', 731),
 ('one', 700),
 ('first', 663),
 ('james', 633),
 ('players', 602),
 ('knicks', 590),
 ('two', 529),
 ('coach', 524),
 ('warriors', 521),
 ('games', 514),
 ('would', 513),
 ('like', 508),
 ('last', 479),
 ('teams', 463),
 ('time', 454),
 ('basketball', 451),
 ('new', 431),
 ('play', 404),
 ('league', 385),
 ('cavaliers', 370),
 ('player', 366),
 ('back', 351),
 ('even', 349),
 ('night', 348),
 ('made', 347),
 ('years', 346),
 ('also', 340),
 ('could', 333),
 ('get', 321),
 ('three', 308),
 ('golden', 308),
 ('going', 306),
 ('left', 304),
 ('celtics', 302),
 ('rockets', 298),
 ('quarter', 294),
 ('state', 290),
 ('rebounds', 288),
 ('much', 281),
 ('series', 280),
 ('way', 279),
 ('conference', 278),
 ('lead', 276),
 ('year', 270),
 ('scored', 264),
 ('cleveland', 264),
 ('second', 263),
 ('played', 258),
 ('finals', 258),
 ('minutes', 258),
 ('ball', 249),
 ('still', 248),
 ('mr', 24

Among most mentioned words we find James (as in Lebron), the Warriors, and, perhaps most interestingly, the Knicks.

Let's restrict ourselves to nouns:

In [80]:
tags = corpus_blob.pos_tags

In [81]:
tags = [tag for tag in tags if tag[1] in {'NN','NNS','NNP','NNPS'}]

In [82]:
tags = [Word(tag[0].lower()) for tag in tags]

In [83]:
lemma_tags = [tag.lemmatize() for tag in tags]
lemma_tags = [tag for tag in tags if tag not in ['s','t']]

In [84]:
c2 = Counter(lemma_tags)
c2.most_common(20)

[('game', 1350),
 ('points', 960),
 ('team', 927),
 ('season', 844),
 ('n.b.a', 679),
 ('james', 633),
 ('players', 602),
 ('knicks', 590),
 ('warriors', 521),
 ('games', 514),
 ('coach', 499),
 ('time', 454),
 ('teams', 447),
 ('basketball', 439),
 ('league', 377),
 ('cavaliers', 370),
 ('player', 366),
 ('night', 348),
 ('years', 346),
 ('celtics', 302)]

# Named Entity Recognition:

Really, we're interested in identifying the players and teams within this dataset, for this let's apply some NER with spacy.

In [85]:
nlp = spacy.load('en_core_web_sm')

In [86]:
doc = nlp(corpus[:10000])

In [89]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Virginia 32 40 GPE
Julius Erving 70 83 ORG
the New York Nets 87 104 ORG
1973 108 112 DATE
Erving 114 120 ORG
first 132 137 ORDINAL
Nassau Coliseum 149 164 ORG

 254 255 GPE
Erving 273 279 ORG

 386 387 GPE
Coliseum 391 399 ORG
Erving 428 434 ORG
Long Island 469 480 LOC
Roosevelt High School 504 525 ORG
three seasons 535 548 DATE
two 634 637 CARDINAL
A.B.A. 638 644 GPE
the N.B.A. and Erving 689 710 ORG
the Philadelphia 76ers 723 745 ORG

 746 747 GPE
Erving 751 757 ORG
Long Island 782 793 LOC
this weekend 857 869 DATE
Coliseum 923 931 GPE
first 940 945 ORDINAL
decades 954 961 DATE
J 972 973 PERSON

 1008 1009 GPE
The Long Island Nets 1009 1029 FAC
the N.B.A. G League 1039 1058 ORG
the Brooklyn Nets 1072 1089 ORG
Erving 1105 1111 ORG
Don Ryan 1138 1146 PERSON
Saturdays season 1151 1167 DATE
the Fort Wayne Mad Ants 1183 1206 ORG
New York Nets 1220 1233 GPE
Ryan 1262 1266 PERSON

 1364 1365 GPE
opening night 1375 1388 TIME
Alton Byrd 1487 1497 PERSON

 1559 1560 GPE
Coliseum 1657 1665 PERS