In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re

In [2]:
# Index page of the annotated "Art of War"; the cells below fetch it and pull
# the per-part page links out of its HTML.
link = "http://changingminds.org/disciplines/warfare/art_war/sun_tzu_annotated.htm"

In [3]:
# Download the index page. The timeout keeps a stalled server from hanging the
# kernel, and raise_for_status() stops an HTTP error page from being treated
# as if it were the real index.
index_response = requests.get(link, timeout=30)
index_response.raise_for_status()
raw_html = index_response.text

In [4]:
# Collect every `href="sun....htm"` attribute found in the index page.
# `[^"]*` keeps each match inside a single attribute value, so two links that
# share a line are not merged into one token, and the dot before "htm" is
# escaped so it only matches a literal ".".
link_tokens = re.findall(r'href="sun[^"]*\.htm"', raw_html)

In [5]:
# Peek at the first match to check the token format.
link_tokens[0]

'href="sun_tzu_1-1.htm"'

In [6]:
# Pull the bare target out of each `href="..."` token: the capture group is
# whatever sits between the quotes.
link_regex = r'href="(.*?)"'
letter_links = [
    match.group(1)
    for match in (re.match(link_regex, token) for token in link_tokens)
]

In [7]:
# Drop repeated links while keeping first-seen order: dict keys are unique and
# remember insertion order.
letter_links = [*dict.fromkeys(letter_links)]

In [59]:
def _collapse_whitespace(text):
    """Replace every run of whitespace (line breaks, tabs, spaces) with one space."""
    return ' '.join(text.split())


def get_quotes_and_commentary(soup):
    """Extract the numbered quotes and their commentary from one parsed page.

    All ``<tr>`` rows found under the third ``<table>`` of ``soup`` are
    collected, the first two are skipped, and each remaining row is read as
    ``quote cell | commentary cell``.

    Args:
        soup: Parsed page exposing ``find_all(tag)``. The objects it returns
            must expose ``find_all`` as well, and the cells must expose
            ``.text``.

    Returns:
        ``(quotes, commentary_lines)``: two lists of equal length in which
        ``commentary_lines[i]`` belongs to ``quotes[i]``. A row that has no
        commentary cell contributes an empty commentary string.

    Raises:
        IndexError: If the page contains fewer than three tables.
    """
    # NOTE(review): the table index and the two skipped rows are tied to the
    # layout of the scraped site - confirm if the markup ever changes.
    content_rows = soup.find_all('table')[2].find_all('tr')[2:]
    quotes = []
    commentary_lines = []
    for row in content_rows:
        cols = row.find_all('td')
        if not cols:
            # No data cells at all, so there is nothing to record for this row.
            continue
        # Collapse whitespace instead of deleting line breaks, so words that
        # were only separated by a newline are not fused together.
        quote = _collapse_whitespace(cols[0].text)
        # Strip only a leading "<number>." label; a quote without one keeps
        # its first sentence.
        quote = re.sub(r'^\d+\.\s*', '', quote)
        # Some rows carry a quote cell only; keep both lists aligned by
        # recording an empty commentary for them.
        commentary = _collapse_whitespace(cols[1].text) if len(cols) > 1 else ''
        quotes.append(quote)
        commentary_lines.append(commentary)
    return quotes, commentary_lines

def parse_link(letter_page, timeout=30):
    """Download one annotated page and return its quotes and commentary.

    Args:
        letter_page: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``(quotes, commentary_lines)`` pair produced by
        ``get_quotes_and_commentary`` for the downloaded page.

    Raises:
        Exception: Any error from the download or the parse is re-raised
            after the offending URL and the error have been printed.
    """
    try:
        # The request lives inside the try block so a failing URL is reported too.
        response = requests.get(letter_page, timeout=timeout)
        # Fail on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        # Pass the raw bytes so the parser can use the charset declared in
        # the page itself rather than a guess made from the HTTP headers.
        soup = BeautifulSoup(response.content, 'lxml')
        return get_quotes_and_commentary(soup)
    except Exception as e:
        print(letter_page)
        print(e)
        # Bare raise keeps the original traceback intact.
        raise
   

In [60]:
base_link = "http://changingminds.org/disciplines/warfare/art_war"

all_quotes = []
all_commentaries = []

# The loop variable is deliberately not called `link`: that name holds the
# index-page URL, and overwriting it would make a re-run of the index fetch
# download the last part page instead.
for page_name in letter_links:
    letter_page = "{}/{}".format(base_link, page_name)
    quotes, commentary = parse_link(letter_page)
    all_quotes.extend(quotes)
    all_commentaries.extend(commentary)

In [63]:
# Scratch check: map() returns a lazy map object rather than a list, so the
# result has to be wrapped in list() to see the converted values.
map(int, [3,4,'5'])

<map at 0x7fc520b5d630>

In [67]:
# fix_commentary
# Paragraph breaks inside a commentary cell are lost when its text is
# extracted, which leaves sentences glued together ("yourself.In business").
# Put exactly one space between a full stop and a letter that follows it.
# Only that case is touched, so decimals such as "3.6" and ellipses stay
# intact; splitting on every "." would turn them into "3. 6" and ". . .".
all_commentaries = [
    re.sub(r'\.\s*(?=[^\W\d_])', '. ', commentary).strip()
    for commentary in all_commentaries
]

In [68]:
# newline='' hands line-ending control to the csv module; without it, rows can
# gain extra blank lines on platforms that translate "\n" on write. Plain 'w'
# is enough because the file is never read back through this handle.
with open('data/aow_commentary_by_quote.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)

    writer.writerow(['title', 'body'])

    # zip pairs each quote with the commentary stored at the same index.
    writer.writerows(zip(all_quotes, all_commentaries))

In [58]:
# Spot-check the parser on a single page (part 3-2). `soup` stays in the
# notebook namespace, and the inspection cells below read it.
letter_page = "http://changingminds.org/disciplines/warfare/art_war/sun_tzu_3-2.htm"
page_html = requests.get(letter_page).text
soup = BeautifulSoup(page_html, 'lxml')
get_quotes_and_commentary(soup)

(["Therefore the skillful leader subdues the enemy's troops without any fighting; he captures their cities without laying siege to them; he overthrows their kingdom without lengthy operations in the field.",
  'With his forces intact he will dispute the mastery of the Empire, and thus, without losing a man, his triumph will be complete. This is the method of attacking by stratagem.',
  "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
  'If equally matched, we can offer battle; if slightly inferior in numbers, we can avoid the enemy; if quite unequal in every way, we can flee from him.',
  'Hence, though an obstinate fight may be made by a small force, in the end it must be captured by the larger force.'],
 ['The best way of fighting is to avoid fighting. The best way to win a war is with superior strategy that out-thinks, out-plans and out-maneuvers the enemy such that 

In [41]:
# Dump the parsed page's markup to inspect its structure.
print(soup.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
 <head>
  <title>
   The Annotated Art of War (Parts 3.6-10: Fighting Strategy)
  </title>
  <meta content="war,battle,warfare,fighting,tactics,sun tzu,sun tzi,suntzu,suntsu,sun tsu" name="keywords"/>
  <meta content="Here's an annotated version of Sun Tzu's 'Art of War' with commentary on every section. This is Parts 3.6-10: Fighting Strategy." name="description"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="../../../persuade.css" rel="stylesheet"/>
  <!-- inc head -->
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <!-- google stuff -->
  <script src="http://www.google-analytics.com/urchin.js" type="text/javascript">
  </script>
  <script type="text/javascript">
   _uacct = "UA-831699-1";
urchinTracker();
  </script>
  <!-- google translate -->
  <meta content="dc11d0af51c3c

In [57]:
# Look at a few rows: per the output, the column-header row followed by the
# first quote/commentary rows.
soup.find_all('tr')[3:7]#.find_all('tr')[2].find_all('td')

[<tr>
 <td align="center" bgcolor="#FFCCFF" width="50%"><b>Sun Tzu said:</b></td>
 <td align="center" bgcolor="#00FFCC" width="50%"><b>Commentary</b></td>
 </tr>, <tr>
 <td bgcolor="#FFECFF" valign="top" width="50%">
 6. Therefore the skillful leader subdues the enemy's troops without any 
 fighting; he captures their cities without laying siege to them; he overthrows 
 their kingdom without lengthy operations in the field. </td></tr>, <tr>
 <td bgcolor="#FFECFF" valign="top" width="50%">7. With his forces intact he will dispute the mastery of the Empire, and 
 thus, without losing a man, his triumph will be complete. This is the method of 
 attacking by stratagem. </td>
 <td bgcolor="#E8FFFA" valign="top" width="50%">Fighting wars 
 			depletes forces which limits the number of wars a commander can 
 			wage. If, however, soldiers are not lost and few munitions are used, 
 			then the army may march and march to war, defeating all in its path 
 			with little incremental cost.</td>
 <

In [37]:
# Cells of the first row of table index 2: per the output, one cell holding
# the page heading, the navigation links and the nested quote table.
soup.find_all('table')[2].find_all('tr')[0].find_all('td')

[<td>
 <!--starttext-->
 <h1>The Annotated Art of War (Parts Parts 3.6-10: Fighting Strategy)</h1>
 <p align="center" class="smaller"> </p>
 <p align="center" class="smaller"><a href="../../disciplines.htm">Disciplines</a> &gt; 
 <a href="../warfare.htm">Warfare</a> &gt; <a href="sun_tzu_annotated.htm">The 
 Annotated Art of War</a> &gt; Parts Parts 3.6-10: Fighting Strategy</p>
 <p align="center" class="smaller">  <a href="sun_tzu_2-1.htm">Previous 
 chapter</a> &lt;&lt; Chapter: 3 &gt;&gt; <a href="sun_tzu_4-1.htm">Next chapter</a></p>
 <p align="center" class="smaller"><a href="sun_tzu_3-1.htm">Previous part</a> | 
 <a href="sun_tzu_3-3.htm">Next part</a></p>
 <p align="center" class="smaller"> </p>
 <h2 align="left">III. Attack by Stratagem</h2>
 <p align="left"> </p>
 <div align="center">
 <table border="1" bordercolor="#000080" cellpadding="4" cellspacing="0" id="table13" width="90%">
 <tr>
 <td align="center" bgcolor="#FFCCFF" width="50%"><b>Sun Tzu said:</b></td>
 <td align="ce

In [97]:
# Commentary cell of the last row; the output shows that a cell can contain
# nested <p> paragraphs in addition to its leading text.
soup.find_all('table')[2].find_all('tr')[-1].find_all('td')[1]
#re.sub('[\r\t\n]', '', soup.find_all('table')[2].find_all('tr')[-1].find_all('td')[1].text)

<td bgcolor="#E8FFFA" valign="top" width="50%">War will happen, with 
			you or without you. If you are not strong and wily then others will 
			overrun you. If you wage war unwisely, you will only weaken 
			yourself.<p>In business, focusing on customers is not enough. 
			Sounding aggressive is also not enough. Spending wildly on 
			advertising campaigns or otherwise blithely attacking others may 
			also lead to ruin. </p>
<p>War and competition needs intelligent and constant attention. </p></td>

In [None]:
# NOTE(review): `letter_tokens`, `letter_regex` and `fh` are not defined in any
# cell visible in this notebook, and this cell has no execution count. It
# looks like a leftover and would raise NameError on a fresh kernel - confirm
# whether it should be kept.
letter_tokens = [lt for lt in letter_tokens if "href" not in lt]
letters = [re.match(letter_regex, lt).group(1) for lt in letter_tokens]
for letter in letters:
    fh.write(letter + "\n")