In [2]:
class DebateData(object):
    '''An election debate data extraction class.

    Scrapes the debate index page at ``text_source`` for transcript links,
    then pulls speaker labels and per-candidate speech text out of the
    linked transcript pages.
    '''

    # Bold-tag tokens on transcript pages that are headings or markup noise
    # rather than speaker labels.
    _NON_SPEAKER_TOKENS = ("PARTICIPANTS:", "MODERATOR:", "PANELIST:",
                           "MODERATORS:", "PANELISTS:", " ", '.', '...',
                           '???:', '] ')

    # Speaker labels whose text get_text() collects when no list is passed.
    _DEFAULT_CANDIDATES = ("TRUMP:", "CRUZ:", "KASICH:", "CLINTON:", "SANDERS:")

    def __init__(self, year=2016):
        ''' year:  year of election, or the string 'all' to keep every debate
            election_range: lower (exclusive) bound of the years kept for an
                election year; set to year - 4 by get_election_links
            text_source: data source containing links for debate transcripts
            urls_: all debate urls kept by get_election_links
            year_link_list: list of (year, link) tuples for the kept debates
            speakers: speaker labels found in the transcripts (includes any
                and all speakers); a set once get_speakers has run
            '''

        self.year = year
        self.election_range = 0
        self.text_source = "http://www.presidency.ucsb.edu/debates.php"
        self.urls_ = []
        self.year_link_list = []
        self.speakers = []

    def get_election_links(self):
        """Collect (year, link) pairs for the debates listed on the index page.

        Every table row that has a ``docdate`` element, a ``doctext`` element
        and at least one anchor contributes one (year, href) pair per anchor.
        The pairs are stored in ``self.year_link_list`` sorted newest first and
        the matching links in ``self.urls_``. Both attributes are rebuilt from
        scratch on every call, so re-running the method does not duplicate
        entries.

        Returns:
            The full (year, link) list when ``self.year == 'all'``; otherwise
            None, with the results restricted to debates whose year satisfies
            ``self.year - 4 < debate_year <= self.year``.

        Raises:
            IndexError / ValueError: if a row's date text is not of the form
                "<month day>, <4-digit year>...".
        """
        soup = self.get_soup(self.text_source)
        year_links = []
        # Same four-level nested <td> walk as before: only rows that sit
        # inside at least four nested layout cells are considered.
        # NOTE(review): a row with more than four <td> ancestors would be
        # reached by several paths and recorded more than once - confirm
        # against the live page.
        for cell in soup.find_all('td'):
            for nested1 in cell.find_all('td'):
                for nested2 in nested1.find_all('td'):
                    for nested3 in nested2.find_all('td'):
                        for row in nested3.find_all('tr'):
                            dates = row.find_all(class_="docdate")
                            anchors = row.find_all('a')
                            if not (dates and row.find_all(class_="doctext") and anchors):
                                continue
                            # Date text looks like "October 19th, 2016": the
                            # year is the first four characters after ", ".
                            debate_year = int(dates[0].get_text().split(", ")[1][:4])
                            # Pair the year with every link of this row so that
                            # rows with several anchors cannot shift the
                            # year/link alignment of later rows.
                            for anchor in anchors:
                                href = anchor.get('href')
                                if href:
                                    year_links.append((debate_year, href))
        self.year_link_list = sorted(year_links, reverse=True)

        # Either keep every debate, or only those of the requested election cycle.
        if self.year == 'all':
            self.urls_ = [deblink for _, deblink in self.year_link_list]
            return self.year_link_list

        self.election_range = self.year - 4
        selected = [(debyear, deblink)
                    for debyear, deblink in self.year_link_list
                    if self.election_range < debyear <= self.year]
        self.urls_ = [deblink for _, deblink in selected]
        self.year_link_list = selected

    def get_soup(self, link):
        "Download ``link`` and parse the response body with html.parser."
        return BeautifulSoup(requests.get(link).content, 'html.parser')

    def get_speakers(self):
        """Identify the speakers (candidates, moderators, panelists) by using
        bold html tags inside each transcript's ``displaytext`` element.

        Bold tokens listed in ``_NON_SPEAKER_TOKENS`` are dropped. The result
        replaces ``self.speakers`` with a set of the remaining labels.
        """
        found = []
        for debate in self.urls_:
            soup = self.get_soup(debate)
            transcript = soup.find(class_='displaytext')
            for bold in transcript.find_all('b'):
                label = bold.get_text()
                # get rid of bold tokens that are not speakers
                if label not in self._NON_SPEAKER_TOKENS:
                    found.append(label)
        self.speakers = set(found)

    def get_text(self, candidates=None):
        """Get the text spoken by each candidate across all kept debates.

        The transcripts in ``self.urls_`` are joined with "," into one string.
        Every occurrence of every label in ``self.speakers`` marks a speaker
        change; the text between a candidate's label and the next speaker
        label (or the end of the text) is one speech segment.

        Args:
            candidates: iterable of speaker labels (including the trailing
                colon) to collect. Defaults to
                TRUMP:, CRUZ:, KASICH:, CLINTON:, SANDERS:.

        Returns:
            defaultdict(list) mapping each candidate label that occurs to the
            list of that candidate's speech segments, in document order.
        """
        pages = []
        for link in self.urls_:
            soup = self.get_soup(link)
            pages.append(soup.find(class_='displaytext').get_text())
        all_text = ",".join(pages)

        # Locate every occurrence of every speaker label.
        occurrences = []
        for speaker in self.speakers:
            if not speaker:
                # str.find('') matches at every offset and would flood the
                # boundary list, so empty labels are ignored.
                continue
            location = all_text.find(speaker)
            while location != -1:
                occurrences.append((location, speaker))
                location = all_text.find(speaker, location + 1)
        occurrences.sort()

        wanted = self._DEFAULT_CANDIDATES if candidates is None else tuple(candidates)
        textdict = defaultdict(list)
        total = len(occurrences)
        for idx, (position, speaker) in enumerate(occurrences):
            if speaker not in wanted:
                continue
            start = position + len(speaker)
            # The segment ends at the next label that begins at or after the
            # end of this one; labels overlapping this label's own characters
            # are not real boundaries.
            nxt = idx + 1
            while nxt < total and occurrences[nxt][0] < start:
                nxt += 1
            # The last speech runs to the end of the text.
            end = occurrences[nxt][0] if nxt < total else len(all_text)
            textdict[speaker].append(all_text[start:end])
        return textdict

In [4]:
import requests
from bs4 import BeautifulSoup
import sys
import re
from collections import defaultdict, Counter
import nltk
import sklearn
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn import feature_extraction
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import grid_search
from sklearn import ensemble
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
# Raise the interpreter's maximum recursion depth to 2000.
# NOTE(review): the reason is not visible in this notebook - confirm it is still needed.
sys.setrecursionlimit(2000)

In [5]:
# Create a DebateData instance for the 2016 election year.
dd = DebateData(2016)
# Scrape the debate index page and keep the transcript links for this election cycle.
dd.get_election_links()
# Collect the speaker labels found in the bold tags of each transcript.
dd.get_speakers()
# get_text() returns a dictionary keyed by candidate label (e.g. "TRUMP:")
# whose values are lists of that candidate's speech segments.
dtext = dd.get_text()

In [6]:
# Show the transcript URLs that get_election_links() stored on the instance.
dd.urls_


['http://www.presidency.ucsb.edu/ws/index.php?pid=116995',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=115148',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=112719',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=112718',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111711',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111634',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111520',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111500',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111472',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111471',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111413',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111412',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111409',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111395',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111394',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111178',
 'http://www.presidency.ucsb.edu/ws/index.php?pid=111177

In [5]:
# Debate index page; the same URL DebateData uses as its text_source.
link = 'http://www.presidency.ucsb.edu/debates.php'

In [6]:
# Download the index page and parse the response body with the html.parser backend.
soup = BeautifulSoup(requests.get(link).content, 'html.parser')

In [7]:
# Display the parsed document (this dumps the whole page as cell output).
soup

<html>
<head>
<title>Presidential Debates</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<meta content="President of the United States, presidency, American Presidency, American President, Public Papers of the Presidents, State of the Union Address, Inaugural Address, Presidents, American Presidents, George W. Bush, Bill Clinton, George Bush, Ronald Reagan, Jimmy Carter, Gerald Ford, Richard Nixon, Lyndon Johnson, John F. Kennedy. John Kennedy, Dwight Eisenhower, Harry Truman, FDR, Franklin Roosevelt, Presidential Elections, Presidential Rhetoric" name="keywords">
<meta content="The American Presidency Project contains the most comprehensive collection of resources pertaining to the study of the President of the United States.  Compiled by John Woolley and Gerhard Peters" name="description">
<link href="http://www.presidency.ucsb.edu/styles/main.css" rel="stylesheet" type="text/css">
<!-- BEGIN Tynt Script -->
<!-- <script type="text/javascript">
if(documen

In [21]:
soup.findAllNext?

In [None]:
# Print the href of every anchor found in a table row that carries a
# "docdate" element, a "doctext" element and at least one link.
# One pass over the rows is enough: the previous outer loop over every <td>
# never used its loop variable and simply repeated this full scan once per
# cell, printing each link many times.
for row in soup.find_all('tr'):
    anchors = row.find_all('a')
    if row.find_all(class_="docdate") and row.find_all(class_="doctext") and anchors:
        # `anchor` (not `link`) so the index-page URL held in `link` is not overwritten.
        for anchor in anchors:
            print(anchor.get('href'))
              

https://www.giveucsb.com/APP.htm
http://www.presidency.ucsb.edu/ws
http://www.presidency.ucsb.edu/sou.php
http://www.presidency.ucsb.edu/inaugurals.php
http://www.presidency.ucsb.edu/satradio.php
http://www.presidency.ucsb.edu/fireside.php
http://www.presidency.ucsb.edu/news_conferences.php
http://www.presidency.ucsb.edu/executive_orders.php
http://www.presidency.ucsb.edu/proclamations.php
http://www.presidency.ucsb.edu/signingstatements.php
http://www.presidency.ucsb.edu/press_briefings.php
http://www.presidency.ucsb.edu/saps.php
http://www.presidency.ucsb.edu/economic_reports.php
http://www.presidency.ucsb.edu/debates.php
http://www.presidency.ucsb.edu/nomination.php
http://www.presidency.ucsb.edu/platforms.php
http://www.presidency.ucsb.edu/2016_election.php
http://www.presidency.ucsb.edu/2012_election.php
http://www.presidency.ucsb.edu/2008_election.php
http://www.presidency.ucsb.edu/2004_election.php
http://www.presidency.ucsb.edu/1960_election.php
http://www.presidency.ucsb.edu/t

In [None]:
    def get_election_links(self):
        """Collect (year, link) pairs for the debates listed on the index page.

        Every table row that has a ``docdate`` element, a ``doctext`` element
        and at least one anchor contributes one (year, href) pair per anchor.
        The pairs are stored in ``self.year_link_list`` sorted newest first and
        the matching links in ``self.urls_``. Both attributes are rebuilt from
        scratch on every call, so re-running the method does not duplicate
        entries.

        Returns:
            The full (year, link) list when ``self.year == 'all'``; otherwise
            None, with the results restricted to debates whose year satisfies
            ``self.year - 4 < debate_year <= self.year``.

        Raises:
            IndexError / ValueError: if a row's date text is not of the form
                "<month day>, <4-digit year>...".
        """
        soup = self.get_soup(self.text_source)
        year_links = []
        # Same four-level nested <td> walk as before: only rows that sit
        # inside at least four nested layout cells are considered.
        # NOTE(review): a row with more than four <td> ancestors would be
        # reached by several paths and recorded more than once - confirm
        # against the live page.
        for cell in soup.find_all('td'):
            for nested1 in cell.find_all('td'):
                for nested2 in nested1.find_all('td'):
                    for nested3 in nested2.find_all('td'):
                        for row in nested3.find_all('tr'):
                            dates = row.find_all(class_="docdate")
                            anchors = row.find_all('a')
                            if not (dates and row.find_all(class_="doctext") and anchors):
                                continue
                            # Date text looks like "October 19th, 2016": the
                            # year is the first four characters after ", ".
                            debate_year = int(dates[0].get_text().split(", ")[1][:4])
                            # Pair the year with every link of this row so that
                            # rows with several anchors cannot shift the
                            # year/link alignment of later rows.
                            for anchor in anchors:
                                href = anchor.get('href')
                                if href:
                                    year_links.append((debate_year, href))
        self.year_link_list = sorted(year_links, reverse=True)

        # Either keep every debate, or only those of the requested election cycle.
        if self.year == 'all':
            self.urls_ = [deblink for _, deblink in self.year_link_list]
            return self.year_link_list

        self.election_range = self.year - 4
        selected = [(debyear, deblink)
                    for debyear, deblink in self.year_link_list
                    if self.election_range < debyear <= self.year]
        self.urls_ = [deblink for _, deblink in selected]
        self.year_link_list = selected