# Extract articles from ESPN.com

Requirements:

* beautifulsoup4  (4.6.0)

In [2]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [4]:
def pprint(soup):# pretty print
    print(soup.prettify())

In [5]:
html_page =  requests.get("http://www.espn.com/nfl/team/_/name/mia/miami-dolphins")
soup = BeautifulSoup(html_page.content, "lxml")

In [6]:
# we know that all article links are in the 'article' tag
articles = [art for art in soup.find_all('article') if 'data-id' in art.attrs.keys()] # remove extraneous articles
print(len(articles))


25


In [7]:
for art in articles:
    print('---'*30)
    print(art['class'])
    print(art.a['class'])

------------------------------------------------------------------------------------------
['news-feed-item', 'news-feed-story-package']
['story-link']
------------------------------------------------------------------------------------------
['news-feed-item', 'video-standalone', 'video']
['story-link']
------------------------------------------------------------------------------------------
['news-feed-item', 'news-feed-story-package']
['story-link']
------------------------------------------------------------------------------------------
['news-feed-item', 'news-now', 'news-feed-shortstop']
['btn-social', 'sm', 'icon-font-before', 'icon-facebook-solid-before', 'Shortstop']
------------------------------------------------------------------------------------------
['news-feed-item', 'news-now', 'news-feed-shortstop']
['btn-social', 'sm', 'icon-font-before', 'icon-facebook-solid-before', 'Shortstop']
------------------------------------------------------------------------------------

In [8]:
article = articles[0]
for child in article.children:
    print(child.name)
print()
pprint(article)

a
figure
div

<article class="news-feed-item news-feed-story-package" data-id="25998951">
 <a class="story-link" data-id="25998951" data-popup-href="http://www.espn.com/nfl/story/_/id/25998951/how-nfl-worst-quarterbacks-improve-2019" data-sport="nfl" href="/nfl/story/_/id/25998951/how-nfl-worst-quarterbacks-improve-2019" name="&amp;lpos=nfl:feed:xx:news">
 </a>
 <figure class="feed-item-figure ">
  <div class="img-wrap">
   <a data-mptype="image" data-sport="nfl" href="/nfl/story/_/id/25998951/how-nfl-worst-quarterbacks-improve-2019" name="&amp;lpos=nfl:feed:xx:news">
    <picture>
     <source data-srcset="https://a1.espncdn.com/combiner/i?img=%2Fphoto%2F2019%2F0213%2Fworst_QBs_5x2.jpg&amp;w=375&amp;h=150&amp;scale=crop&amp;cquality=80&amp;location=origin, https://a1.espncdn.com/combiner/i?img=%2Fphoto%2F2019%2F0213%2Fworst_QBs_5x2.jpg&amp;w=768&amp;h=307&amp;scale=crop&amp;cquality=40&amp;location=origin&amp;format=jpg 2x" media="(max-width: 375px)">
      <source data-srcset="https:

In [9]:
article_info = {}

#if article['class'] = ['news-feed-item', 'news-feed-story-package']:
    
for child in article.children:
    if child.name == 'a':
        article_info['class'] = child['class'][0]
        article_info['data-id'] = child['data-id']
        article_info['url'] = child['data-popup-href']
        article_info['sport'] = child['data-sport']
    if child.name == 'div':
        for span in child.div.div.children: # should be a timestap and author span tag
            if 'timestamp' in span['class']: # Beautiful soup always makes the class a list (NOT a string)
                article_info['timestamp'] = span.string
            elif 'author' in span['class']:
                article_info['author'] = span.string

article_info

{'author': 'ESPN.com',
 'class': 'story-link',
 'data-id': '25998951',
 'sport': 'nfl',
 'timestamp': '1d',
 'url': 'http://www.espn.com/nfl/story/_/id/25998951/how-nfl-worst-quarterbacks-improve-2019'}

In [10]:
def extract_articles(soup, teamname): 
    articles = [art for art in soup.find_all('article') if 'data-id' in art.attrs.keys()]

    articles_list = []
    for idx, article in enumerate(articles):
        try:
            if 'story-link' in article.a['class']: # should be ['news-feed-item', 'news-feed-story-package']
                article_info = {}
                article_info['teamname'] = teamname
                for child in article.children:
                    if child.name == 'a':
                        article_info['class'] = child['class'][0]
                        article_info['data-id'] = child['data-id']
                        article_info['url'] = child['data-popup-href']
                        article_info['sport'] = child['data-sport']
                    if child.name == 'div':
                        for span in child.div.div.children: # should be a timestap and author span tag
                            if 'timestamp' in span['class']: # Beautiful soup always makes the class a list (NOT a string)
                                article_info['timestamp'] = span.string
                            elif 'author' in span['class']:
                                article_info['author'] = span.string
                                articles_list.append(article_info)
                    
        except e:
            print('*** {}: {}'.format(idx, e))
                            
      
    
    # convert list of dictionaries into dataframe
    df = pd.DataFrame(articles_list)
    return df

def get_df_from_teamname_link(link):
    teamname = link.split('/')[-1] # grab top link (ex 'buffalo-bills')
    html_page =  requests.get(link)
    soup = BeautifulSoup(html_page.content, 'lxml')
    df = extract_articles(soup, teamname)
    return df

def get_df_from_teamname_links(link_list):
    list_of_dfs = []
    for link in link_list:
        print('Teampage: {}'.format(link))
        teamname = link.split('/')[-1] # grab top link (ex 'buffalo-bills')
        html_page =  requests.get(link)
        soup = BeautifulSoup(html_page.content, 'lxml')
        df = extract_articles(soup, teamname)
        list_of_dfs.append(df)
        
    dfs = pd.concat(list_of_dfs)
    print("Dropping NaN and duplicates")
    dfs = dfs.drop("NaN")
    dfs.drop_duplicates('data-id')
    
    return dfs

In [11]:
links= ["http://www.espn.com/nfl/team/_/name/buf/buffalo-bills",
        "http://www.espn.com/nfl/team/_/name/mia/miami-dolphins",
        "http://www.espn.com/nfl/team/_/name/cle/cleveland-browns",
        "http://www.espn.com/nfl/team/_/name/ne/new-england-patriots",
        "http://www.espn.com/nfl/team/_/name/nyj/new-york-jets",
        "http://www.espn.com/nfl/team/_/name/dal/dallas-cowboys",
        "http://www.espn.com/nfl/team/_/name/nyg/new-york-giants",
        "http://www.espn.com/nfl/team/_/name/phi/philadelphia-eagles",
        "http://www.espn.com/nfl/team/_/name/wsh/washington-redskins",
        "http://www.espn.com/nfl/team/_/name/bal/baltimore-ravens",
        "http://www.espn.com/nfl/team/_/name/cin/cincinnati-bengals",
        "http://www.espn.com/nfl/team/_/name/cle/cleveland-browns",
        "http://www.espn.com/nfl/team/_/name/pit/pittsburgh-steelers",
        "http://www.espn.com/nfl/team/_/name/chi/chicago-bears",
        "http://www.espn.com/nfl/team/_/name/det/detroit-lions",
        "http://www.espn.com/nfl/team/_/name/gb/green-bay-packers",
        "http://www.espn.com/nfl/team/_/name/min/minnesota-vikings",
        "http://www.espn.com/nfl/team/_/name/hou/houston-texans",
        "http://www.espn.com/nfl/team/_/name/ind/indianapolis-colts",
        "http://www.espn.com/nfl/team/_/name/jax/jacksonville-jaguars",
        "http://www.espn.com/nfl/team/_/name/ten/tennessee-titans",
        "http://www.espn.com/nfl/team/_/name/atl/atlanta-falcons",
        "http://www.espn.com/nfl/team/_/name/car/carolina-panthers",
        "http://www.espn.com/nfl/team/_/name/no/new-orleans-saints",
        "http://www.espn.com/nfl/team/_/name/tb/tampa-bay-buccaneers",
        "http://www.espn.com/nfl/team/_/name/den/denver-broncos",
        "http://www.espn.com/nfl/team/_/name/kc/kansas-city-chiefs",
        "http://www.espn.com/nfl/team/_/name/lac/los-angeles-chargers",
        "http://www.espn.com/nfl/team/_/name/oak/oakland-raiders",
        "http://www.espn.com/nfl/team/_/name/ari/arizona-cardinals",
        "http://www.espn.com/nfl/team/_/name/lar/los-angeles-rams",
        "http://www.espn.com/nfl/team/_/name/sf/san-francisco-49ers",
        "http://www.espn.com/nfl/team/_/name/sea/seattle-seahawks",
        ]

dfs = get_df_from_teamname_links(links)

dfs

Teampage: http://www.espn.com/nfl/team/_/name/buf/buffalo-bills
Teampage: http://www.espn.com/nfl/team/_/name/mia/miami-dolphins
Teampage: http://www.espn.com/nfl/team/_/name/cle/cleveland-browns
Teampage: http://www.espn.com/nfl/team/_/name/ne/new-england-patriots
Teampage: http://www.espn.com/nfl/team/_/name/nyj/new-york-jets
Teampage: http://www.espn.com/nfl/team/_/name/dal/dallas-cowboys
Teampage: http://www.espn.com/nfl/team/_/name/nyg/new-york-giants
Teampage: http://www.espn.com/nfl/team/_/name/phi/philadelphia-eagles
Teampage: http://www.espn.com/nfl/team/_/name/wsh/washington-redskins
Teampage: http://www.espn.com/nfl/team/_/name/bal/baltimore-ravens
Teampage: http://www.espn.com/nfl/team/_/name/cin/cincinnati-bengals
Teampage: http://www.espn.com/nfl/team/_/name/cle/cleveland-browns
Teampage: http://www.espn.com/nfl/team/_/name/pit/pittsburgh-steelers
Teampage: http://www.espn.com/nfl/team/_/name/chi/chicago-bears
Teampage: http://www.espn.com/nfl/team/_/name/det/detroit-lion

Unnamed: 0,author,class,data-id,sport,teamname,timestamp,url
0,ESPN.com,story-link,25998951,nfl,buffalo-bills,1d,http://www.espn.com/nfl/story/_/id/25998951/ho...
1,Mike Rodak,story-link,26026509,nfl,buffalo-bills,1d,http://www.espn.com/nfl/story/_/id/26026509/bi...
2,Mike Rodak,story-link,26005150,nfl,buffalo-bills,4d,http://www.espn.com/nfl/story/_/id/26005150/bi...
3,Jeremy Willis,story-link,25995553,nfl,buffalo-bills,6d,http://www.espn.com/nfl/story/_/id/25995553/nf...
4,Mike Rodak,story-link,25980647,nfl,buffalo-bills,8d,http://www.espn.com/nfl/story/_/id/25980647/bi...
5,Mike Rodak,story-link,buffalo-bills-32874,nfl,buffalo-bills,13d,http://espn.com/blog/buffalo-bills/post/_/id/3...
6,Jeremy Willis,story-link,25932328,nfl,buffalo-bills,14d,http://www.espn.com/nfl/story/_/id/25932328/nf...
7,Bill Barnwell,story-link,25834281,nfl,buffalo-bills,23d,http://www.espn.com/nfl/story/_/id/25834281/pr...
8,ESPN,story-link,nflnation-292685,nfl,buffalo-bills,26d,http://espn.com/blog/nflnation/post/_/id/29268...
9,Mike Rodak,story-link,buffalo-bills-32847,nfl,buffalo-bills,26d,http://espn.com/blog/buffalo-bills/post/_/id/3...


In [14]:
export_csv = dfs.to_csv(r'C:\Users\atenk\Documents\ISM\HeadlineGeneration\ESPN_football.csv', index=False)