In [100]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Source page: the NYT interactive article that lists the statements to scrape.
website = 'https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html'
# requests applies no timeout by default, so without one this cell could hang
# forever if the server never answers.
r = requests.get(website, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
r.raise_for_status()

In [4]:
# print the first 500 characters of the html text
print(r.text[:500])

<!DOCTYPE html>
<!--[if (gt IE 9)|!(IE)]> <!--><html lang="en" class="no-js page-interactive section-opinion page-theme-standard tone-opinion page-interactive-default limit-small layout-xlarge app-interactive" itemid="https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html" itemtype="http://schema.org/NewsArticle" itemscope xmlns:og="http://opengraphprotocol.org/schema/"><!--<![endif]-->
<!--[if IE 9]> <html lang="en" class="no-js ie9 lt-ie10 page-interactive section-opinion page


In [5]:
# parsing the html using BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')

In [7]:
# gather every record on the page:
# each one is a <span> tag whose CSS class is 'short-desc'
results = soup.find_all('span', class_='short-desc')

In [8]:
# results is an object like a list so we can use len 
len(results)

180

In [9]:
# can also slice it
# this outputs the first 3 'span' tags ('lie')
results[0:3]

[<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 21 </strong>“A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.” <span class="short-truth"><a href="http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/" target="_blank">(Trump was on the cover 11 times and Nixon appeared 55 times.)</a></span></span>,
 <span class="short-desc"><strong>Jan. 23 </strong>“Between 3 million and 5 million illegal votes caused me to lose the popular vote.” <span class="short-truth"><a href="https://www.nytimes.com/2017/01/23/us/politics/donald-trump-congress-democrats.html" target="_

In [10]:
# output the last 'span' tag 
results[-1]

<span class="short-desc"><strong>Nov. 11 </strong>“I'd rather have him  – you know, work with him on the Ukraine than standing and arguing about whether or not  – because that whole thing was set up by the Democrats.” <span class="short-truth"><a href="https://www.nytimes.com/interactive/2017/12/10/us/politics/trump-and-russia.html" target="_blank">(There is no evidence that Democrats "set up" Russian interference in the election.)</a></span></span>

### extracting the date 

In [11]:
first_result = results[0]
first_result

<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>

In [15]:
# find the first 'strong' tag 
first_result.find('strong')

<strong>Jan. 21 </strong>

In [16]:
# then use the .text attribute to get the string part of the tag
first_result.find('strong').text

'Jan. 21\xa0'

In [20]:
# slice off the trailing '\xa0' (a non-breaking space), which is a single character
first_result.find('strong').text[0:-1]

'Jan. 21'

In [22]:
# then add the year
first_result.find('strong').text[0:-1] + ', 2017'

'Jan. 21, 2017'

### extracting a lie 

In [24]:
first_result

<span class="short-desc"><strong>Jan. 21 </strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.” <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span></span>

In [29]:
# output the list that contains the children of first_result
# children are tags and strings that are nested within a tag 
first_result.contents

[<strong>Jan. 21 </strong>,
 "“I wasn't a fan of Iraq. I didn't want to go into Iraq.” ",
 <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span>]

In [51]:
# 1st child, which is the date. can use this method to extract date. 
first_result.contents[0]

<strong>Jan. 21 </strong>

In [31]:
# 2nd child, which is the lie we want to extract
first_result.contents[1]

"“I wasn't a fan of Iraq. I didn't want to go into Iraq.” "

In [32]:
# slice off the opening quotation mark, plus the closing quotation mark and the trailing space
first_result.contents[1][1:-2]

"I wasn't a fan of Iraq. I didn't want to go into Iraq."

### extracting the explanation 

In [33]:
# first option: index into the contents attribute
first_result.contents[2]

<span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span>

In [54]:
# second option: search by the 'span' tag
first_result.find('span', attrs = {'class':'short-truth'}).text

'(He was for an invasion before he was against it.)'

In [35]:
# 3rd option: search by the surrounding tag
first_result.find('a')

<a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a>

In [41]:
# use the .text attribute to output the string part of the tag
first_result.find('a').text

'(He was for an invasion before he was against it.)'

In [44]:
# then slice off the surrounding parentheses
first_result.find('a').text[1:-1]

'He was for an invasion before he was against it.'

### extracting the url

In [46]:
first_result.find('a')

<a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a>

In [48]:
# the attributes of the 'a' tag behave like a dictionary: 'href' is the key,
# and the string after = is the value
first_result.find('a')['href']

'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'

### building the dataset 

In [102]:
# Build one dict per scraped <span class="short-desc"> element; the list is
# turned into a DataFrame in a single step afterwards.
record = []

for result in results:
    # .strip() drops the trailing non-breaking space ('\xa0') without assuming
    # it is exactly one character, so a date lacking it is not truncated.
    date = result.find('strong').text.strip() + ', 2017'

    # The second child is the quoted statement: trim surrounding whitespace
    # first, then the curly quotation marks that wrap it.
    lie = result.contents[1].strip().strip('“”')

    # Look the link up once; it carries both the explanation text and the url.
    link = result.find('a')

    # Remove a single pair of wrapping parentheses, and only when present, so
    # that text which is not parenthesised does not lose real characters.
    explanation = link.text.strip()
    if explanation.startswith('(') and explanation.endswith(')'):
        explanation = explanation[1:-1]

    url = link['href']

    lie_dict = {'date':date, 'lie':lie, 'explanation':explanation, 'url':url}
    record.append(lie_dict)


In [105]:
lie_df = pd.DataFrame(record)
lie_df

Unnamed: 0,date,lie,explanation,url
0,"Jan. 21, 2017",I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,"Jan. 21, 2017",A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,"Jan. 23, 2017",Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,"Jan. 25, 2017","Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,"Jan. 25, 2017",Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...
...,...,...,...,...
175,"Oct. 25, 2017",We have trade deficits with almost everybody.,We have trade surpluses with more than 100 cou...,https://www.bea.gov/newsreleases/international...
176,"Oct. 27, 2017","Wacky & totally unhinged Tom Steyer, who has b...",Steyer has financially supported many winning ...,https://www.opensecrets.org/donor-lookup/resul...
177,"Nov. 1, 2017","Again, we're the highest-taxed nation, just ab...",We're not.,http://www.politifact.com/truth-o-meter/statem...
178,"Nov. 7, 2017",When you look at the city with the strongest g...,"Several other cities, including New York and L...",http://www.politifact.com/truth-o-meter/statem...


In [106]:
# convert the 'date' column from strings to pandas datetime values
lie_df['date'] = pd.to_datetime(lie_df['date'])
lie_df.head()

Unnamed: 0,date,lie,explanation,url
0,2017-01-21,I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,2017-01-21,A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,2017-01-23,Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...
3,2017-01-25,"Now, the audience was the biggest ever. But th...",Official aerial photos show Obama's 2009 inaug...,https://www.nytimes.com/2017/01/21/us/politics...
4,2017-01-25,Take a look at the Pew reports (which show vot...,The report never mentioned voter fraud.,https://www.nytimes.com/2017/01/24/us/politics...


### exporting the dataset to a csv file

In [108]:
# don't include index
lie_df.to_csv("trump_lies.csv", index=False, encoding='utf-8')

In [112]:
# double-check the csv file
df = pd.read_csv("trump_lies.csv")
df.date.dtype

dtype('O')

In [114]:
# need to parse_dates to tell pandas to convert 'date' col to datetime format 
df = pd.read_csv("trump_lies.csv", parse_dates=['date'])
df.date.dtype

dtype('<M8[ns]')