# Scraper Chunking

To avoid large losses of data, let's break the task down into smaller pieces. We'll create scripts that scrape the URL list in 200-line chunks. Each script's output will be saved to a CSV file, and the files will be combined in a later process.

Before getting into that, let's fix the scraper so it doesn't break down when a div is missing.

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import urllib
import os
os.chdir('/Users/courtneyfergusonlee/ufc_fight_analysis/data')

# Read the fight listing (created in fight url scraper), drop rows that
# repeat an already-seen link, and renumber the remaining rows from zero.
fight_info = (
    pd.read_csv('fight_urls.csv', encoding='utf-8')
    .drop_duplicates(subset="link")
    .reset_index(drop=True)
)

# Parallel lists in the same row order: url, event title, location and date
fight_urls, fight_titles, fight_locations, fight_dates = [
    fight_info[column].values.tolist()
    for column in ('link', 'title', 'location', 'date')
]

# Start the chunk with an empty frame that only carries the output columns.
# The column order is significant: it is the order used for the saved CSV.
fighter_df = pd.DataFrame(columns=[
    'name', 'kd',
    'sig_strikes', 'sig_attempts',
    'strikes', 'strike_attempts',
    'takedowns', 'td_attempts',
    'sub_attempts', 'pass', 'reversals',
    'head', 'head_attempts',
    'body', 'body_attempts',
    'leg', 'leg_attempts',
    'distance', 'distance_attempts',
    'clinch', 'fight_id', 'clinch_attempts',
    'ground', 'ground_attempts',
    'win/loss', 'referee', 'round', 'method',
    'date', 'location', 'title',
])
# Helper: readable text of a parsed tag
def nice_text(tag):
    """Return the tag's text with every run of whitespace collapsed to one space."""
    words = str(tag.get_text()).split()
    return " ".join(words)


# Scrape fights 0-199 and save one row per fighter to chunks/fights0.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(0, min(200, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights0.csv', index=False)

In [2]:
# Start the chunk with an empty frame that only carries the output columns.
# The column order is significant: it is the order used for the saved CSV.
fighter_df = pd.DataFrame(columns=[
    'name', 'kd',
    'sig_strikes', 'sig_attempts',
    'strikes', 'strike_attempts',
    'takedowns', 'td_attempts',
    'sub_attempts', 'pass', 'reversals',
    'head', 'head_attempts',
    'body', 'body_attempts',
    'leg', 'leg_attempts',
    'distance', 'distance_attempts',
    'clinch', 'fight_id', 'clinch_attempts',
    'ground', 'ground_attempts',
    'win/loss', 'referee', 'round', 'method',
    'date', 'location', 'title',
])
# Helper: readable text of a parsed tag
def nice_text(tag):
    """Return the tag's text with every run of whitespace collapsed to one space."""
    words = str(tag.get_text()).split()
    return " ".join(words)


# Scrape fights 200-399 and save one row per fighter to chunks/fights2.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(200, min(400, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights2.csv', index=False)

In [3]:
# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts', 
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body', 
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# Scrape fights 400-599 and save one row per fighter to chunks/fights4.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(400, min(600, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights4.csv', index=False)

In [4]:
# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts', 
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body', 
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# Scrape fights 600-799 and save one row per fighter to chunks/fights6.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(600, min(800, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights6.csv', index=False)

In [5]:
# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts', 
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body', 
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# Scrape fights 800-999 and save one row per fighter to chunks/fights8.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(800, min(1000, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights8.csv', index=False)

In [6]:
# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts', 
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body', 
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# Scrape fights 1000-1199 and save one row per fighter to chunks/fights10.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(1000, min(1200, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights10.csv', index=False)

In [7]:
# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts', 
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body', 
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# Scrape fights 1200-1399 and save one row per fighter to chunks/fights12.csv.
# A page whose stats rows cannot be parsed is reported and skipped, so one
# malformed page cannot abort the chunk and lose the rows collected so far.

# field -> (fighter 1 offset, fighter 2 offset), added to k, the index of the
# first integer token in the split text of the first stats row (tr1).
totals_offsets = [
    ('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
    ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
    ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
    ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29)),
]
# Same layout for the row that follows the 'Head' header row (tr2).
strike_offsets = [
    ('head', (8, 11)), ('head_attempts', (10, 13)),
    ('body', (14, 17)), ('body_attempts', (16, 19)),
    ('leg', (20, 23)), ('leg_attempts', (22, 25)),
    ('distance', (26, 29)), ('distance_attempts', (28, 31)),
    ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
    ('ground', (38, 41)), ('ground_attempts', (40, 43)),
]

bad_call = 0     # fights skipped in this chunk (counted across the whole loop)
chunk_rows = []  # one dict per fighter; converted to a DataFrame once at the end

# Never run past the end of the url list if it is shorter than the chunk
for i in range(1200, min(1400, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i])  # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')  # every table row on the page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        # Single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s' % (i, fight_urls[i], date))

    names = []
    winloss = []
    # Only the first two person divs are used; extra ones are ignored
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except (AttributeError, UnicodeError):  # tag missing or non-ASCII text
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except (AttributeError, UnicodeError):
            names.append(None)
    # Pad to exactly two entries so a page with 0 or 1 person divs still
    # yields two fighter rows instead of failing on unpacking.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except (IndexError, UnicodeError):
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except (IndexError, UnicodeError):
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except (IndexError, UnicodeError):
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the 2nd stats row: it follows the header row whose 7th word
        # is 'Head' (its position varies). Start from None so a miss can
        # never silently reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError("no 'Head' header row in the first 10 rows")

        # The fighter names end at the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighter rows inside the try so a short row counts as a
        # bad call instead of crashing the chunk.
        fight_rows = []
        for slot in (0, 1):
            row = {'name': names[slot], 'win/loss': winloss[slot],
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in totals_offsets:
                row[field] = tr1[k + offsets[slot]]
            for field, offsets in strike_offsets:
                row[field] = tr2[k + offsets[slot]]
            fight_rows.append(row)
    except (LookupError, UnicodeError):
        # %-formatting also copes with a non-string (e.g. NaN) date
        print('%s bad call' % date)
        bad_call += 1
        continue

    chunk_rows.extend(fight_rows)

# Add each fighter's information to the dataframe in a single concat
if chunk_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(chunk_rows)],
                           axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights12.csv', index=False)

In [8]:
# Scrape fights 1400-1599 and save them to chunks/fights14.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(1400, min(1600, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights14.csv', index=False)

In [9]:
# Scrape fights 1600-1799 and save them to chunks/fights16.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(1600, min(1800, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights16.csv', index=False)

In [10]:
# Scrape fights 1800-1999 and save them to chunks/fights18.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(1800, min(2000, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights18.csv', index=False)

In [11]:
# Scrape fights 2000-2199 and save them to chunks/fights20.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(2000, min(2200, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights20.csv', index=False)

In [12]:
# Scrape fights 2200-2399 and save them to chunks/fights22.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(2200, min(2400, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights22.csv', index=False)

In [13]:
# Scrape fights 2400-2599 and save them to chunks/fights24.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(2400, min(2600, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights24.csv', index=False)

In [14]:
# Scrape fights 2600-2799 and save them to chunks/fights26.csv.
# A page whose stats tables cannot be parsed is reported as a 'bad call' and
# skipped, so that one malformed page does not abort the whole chunk.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# (column, (fighter 1 offset, fighter 2 offset)) pairs. Offsets are relative to k,
# the index of the first integer token in the whitespace-split row trs[1] (tr1).
totals_offsets = [('kd', (0, 1)), ('sig_strikes', (2, 5)), ('sig_attempts', (4, 7)),
                  ('strikes', (10, 13)), ('strike_attempts', (12, 15)),
                  ('takedowns', (16, 19)), ('td_attempts', (18, 21)),
                  ('sub_attempts', (24, 25)), ('pass', (26, 27)), ('reversals', (28, 29))]
# Same layout for the row that directly follows the 'Head' header row (tr2).
target_offsets = [('head', (8, 11)), ('head_attempts', (10, 13)), ('body', (14, 17)),
                  ('body_attempts', (16, 19)), ('leg', (20, 23)), ('leg_attempts', (22, 25)),
                  ('distance', (26, 29)), ('distance_attempts', (28, 31)),
                  ('clinch', (32, 35)), ('clinch_attempts', (34, 37)),
                  ('ground', (38, 41)), ('ground_attempts', (40, 43))]

fighter_rows = []  # one dict per fighter; turned into a dataframe once, after the loop
bad_call = 0       # fights skipped in this chunk because their tables could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields.
# The upper bound is clamped so a short final chunk cannot raise IndexError.
for i in range(2600, min(2800, len(fight_urls))):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        sock.close()  # release the connection even when the read fails
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two person divs are used; a missing tag yields None.
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)
    # Pad so there are always exactly two entries, whatever the page contained.
    names += [None] * (2 - len(names))
    winloss += [None] * (2 - len(winloss))

    # Referee, round and method are read from fixed positions in the <i> tags;
    # any of them falls back to None when the page layout differs.
    try:
        referee_words = str(headers[24].get_text()).split()
        referee = referee_words[1] + ' ' + referee_words[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    fight_rows = None
    try:
        tr1 = str(trs[1].get_text()).split()
        # Find the location of the 2nd table tr2 (it varies). Starting from None
        # guarantees a row left over from the previous fight is never reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j+1].get_text()).split()
                break

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        if tr2 is not None:
            # Built inside the try so a row shorter than expected is counted
            # as a bad call instead of crashing the chunk.
            fight_rows = []
            for slot in (0, 1):
                row = {'name': names[slot], 'win/loss': winloss[slot], 'referee': referee,
                       'round': rounds, 'method': method, 'fight_id': i,
                       'date': date, 'location': location, 'title': title}
                for column, offsets in totals_offsets:
                    row[column] = tr1[k + offsets[slot]]
                for column, offsets in target_offsets:
                    row[column] = tr2[k + offsets[slot]]
                fight_rows.append(row)
    except Exception:
        fight_rows = None

    if fight_rows is None:
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending rows
    fighter_rows.extend(fight_rows)

# A single concat avoids re-copying the growing dataframe on every fight.
if fighter_rows:
    fighter_df = pd.concat([fighter_df, pd.DataFrame(fighter_rows)], axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights26.csv', index=False)

In [15]:
# Scrape fights 2800-2999 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(2800, 3000):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights28.csv', index=False)

In [16]:
# Scrape fights 3000-3199 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(3000, 3200):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights30.csv', index=False)

In [17]:
# Scrape fights 3200-3399 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(3200, 3400):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights32.csv', index=False)

In [18]:
# Scrape fights 3400-3599 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(3400, 3600):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights34.csv', index=False)

In [19]:
# Scrape fights 3600-3799 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(3600, 3800):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights36.csv', index=False)

In [20]:
# Scrape fights 3800-3999 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(3800, 4000):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights38.csv', index=False)

In [21]:
# Scrape fights 4000-4199 and save them as their own chunk file.
# Uses names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates, nice_text, bs, pd and urllib.

# Empty frame listing the output columns
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts',
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body',
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# column -> (fighter 1 offset, fighter 2 offset) into the totals row (tr1),
# counted from k, the index of the first integer token in that row.
# NOTE(review): offsets presumably step over the "of" in "X of Y" cells and
# over percentage cells - confirm against a live page.
tr1_offsets = {
    'kd': (0, 1), 'sig_strikes': (2, 5), 'sig_attempts': (4, 7),
    'strikes': (10, 13), 'strike_attempts': (12, 15),
    'takedowns': (16, 19), 'td_attempts': (18, 21),
    'sub_attempts': (24, 25), 'pass': (26, 27), 'reversals': (28, 29),
}
# Same idea for the row that follows the 'Head' header (tr2)
tr2_offsets = {
    'head': (8, 11), 'head_attempts': (10, 13),
    'body': (14, 17), 'body_attempts': (16, 19),
    'leg': (20, 23), 'leg_attempts': (22, 25),
    'distance': (26, 29), 'distance_attempts': (28, 31),
    'clinch': (32, 35), 'clinch_attempts': (34, 37),
    'ground': (38, 41), 'ground_attempts': (40, 43),
}

fighter_rows = []  # single-row frames, concatenated once after the loop
bad_call = 0       # fights skipped because their stats rows could not be parsed

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(4000, 4200):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    # Download the fight page; close the connection even if read() fails
    sock = urllib.urlopen(fight_urls[i])
    try:
        fight_html = sock.read()
    finally:
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr')     # every table row on the page
    headers = fight_soup.find_all('i')  # every <i> tag on the page

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) < 2:
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    for person_div in person_divs:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None / trim so there are always exactly two entries; any
    # other count used to make the unpacking raise and abort the chunk.
    name_1, name_2 = (names + [None, None])[:2]
    winloss_1, winloss_2 = (winloss + [None, None])[:2]

    # Fight-level fields are read from fixed positions in the <i> tag list;
    # anything that cannot be read is stored as None.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies): it is the row
        # right after the header whose 7th token is 'Head'. Start from None
        # so a previous fight's row can never be reused.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise ValueError("no 'Head' header in the first 10 rows")

        # Test for the end of names: k is the index of the first integer token
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build one single-row frame per fighter. This stays inside the try
        # so a short or malformed row is skipped as a bad call.
        fight_frames = []
        for slot, (name, result) in enumerate([(name_1, winloss_1),
                                               (name_2, winloss_2)]):
            row = {'name': [name], 'win/loss': result, 'referee': referee,
                   'round': rounds, 'method': method, 'fight_id': i,
                   'date': date, 'location': location, 'title': title}
            for column, offsets in tr1_offsets.items():
                row[column] = tr1[k + offsets[slot]]
            for column, offsets in tr2_offsets.items():
                row[column] = tr2[k + offsets[slot]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        print('%s bad call' % (fight_dates[i],))
        bad_call += 1
        continue

    fighter_rows.extend(fight_frames)

# Add each fighter's information to the dataframe in a single concat
fighter_df = pd.concat([fighter_df] + fighter_rows, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights40.csv', index=False)

June 09, 2000 bad call
June 09, 2000 bad call
July 16, 1999 bad call
October 16, 1998 bad call
October 16, 1998 bad call
May 15, 1998 bad call
March 13, 1998 bad call
March 13, 1998 bad call


In [22]:
# Scrape per-fighter statistics for the last chunk of fight URLs (index 4200
# through the end of the list) and write them to chunks/fights42.csv.
#
# Relies on names defined in earlier cells: fight_urls, fight_titles,
# fight_locations, fight_dates and nice_text, plus the bs / pd / urllib imports.

# Initialize an empty dataframe
fighter_df = pd.DataFrame(columns=['name', 'kd', 'sig_strikes', 'sig_attempts', 'strikes', 'strike_attempts', 
                                   'takedowns', 'td_attempts', 'sub_attempts', 'pass', 'reversals', 'head', 'head_attempts', 'body', 
                                   'body_attempts','leg', 'leg_attempts', 'distance', 'distance_attempts', 'clinch', 'fight_id',
                                   'clinch_attempts', 'ground', 'ground_attempts', 'win/loss', 'referee', 'round', 'method',
                                   'date', 'location', 'title'])

# Token offsets for each statistic, given as (fighter 1, fighter 2).
# Offsets are added to k, the index of the first integer token in tr1.
tr1_offsets = {
    'kd': (0, 1),
    'sig_strikes': (2, 5),
    'sig_attempts': (4, 7),
    'strikes': (10, 13),
    'strike_attempts': (12, 15),
    'takedowns': (16, 19),
    'td_attempts': (18, 21),
    'sub_attempts': (24, 25),
    'pass': (26, 27),
    'reversals': (28, 29),
}
# Same idea for tr2 (the row following the header whose 7th token is 'Head').
# The same k, computed from tr1, is used as the base here too.
tr2_offsets = {
    'head': (8, 11),
    'head_attempts': (10, 13),
    'body': (14, 17),
    'body_attempts': (16, 19),
    'leg': (20, 23),
    'leg_attempts': (22, 25),
    'distance': (26, 29),
    'distance_attempts': (28, 31),
    'clinch': (32, 35),
    'clinch_attempts': (34, 37),
    'ground': (38, 41),
    'ground_attempts': (40, 43),
}

# One-row frames are collected here and concatenated once after the loop,
# instead of re-copying the growing dataframe on every fight.
fighter_frames = []

# Number of fights skipped because their stats rows could not be parsed.
# Initialised once so it accumulates over the whole chunk.
bad_call = 0

# Iterate through the fight urls, and pull relevant variables/fields
for i in range(4200, len(fight_urls)):

    # Store fight event, location and date
    title = fight_titles[i]
    location = fight_locations[i]
    date = fight_dates[i]

    sock = urllib.urlopen(fight_urls[i]) # specific URL for a fight
    try:
        fight_html = sock.read()
    finally:
        # Always release the connection, even if the read fails
        sock.close()
    fight_soup = bs(fight_html, "lxml")
    trs = fight_soup.find_all('tr') # every table row on the fight page
    headers = fight_soup.find_all('i')

    # Get the name and win/loss status of each fighter
    person_divs = fight_soup.find_all('div',
                                      class_="b-fight-details__person")
    if len(person_divs) != 2:
        # Report pages that do not have exactly two fighter divs
        print('%s %s %s' % (i, fight_urls[i], fight_dates[i]))

    names = []
    winloss = []
    # Only the first two divs are used so the unpacking below cannot overflow
    for person_div in person_divs[:2]:
        try:
            winloss.append(nice_text(person_div.find('i')))
        except Exception:
            # Missing tag or un-encodable text: keep the slot, store None
            winloss.append(None)
        try:
            names.append(nice_text(person_div.find('h3')))
        except Exception:
            names.append(None)

    # Pad with None so there are always exactly two entries, whether zero
    # or one fighter div was found
    missing = 2 - len(names)
    names += [None] * missing
    winloss += [None] * missing

    name_1, name_2 = names
    winloss_1, winloss_2 = winloss

    # Fight-level details are read from fixed positions in the <i> tag list;
    # any failure leaves the field as None rather than dropping the fight.
    try:
        referee_tokens = str(headers[24].get_text()).split()
        referee = referee_tokens[1] + ' ' + referee_tokens[-1]
    except Exception:
        referee = None
    try:
        rounds = str(headers[18].get_text()).split()[1]
    except Exception:
        rounds = None
    try:
        method = str(headers[17].get_text()).split()[0]
    except Exception:
        method = None

    try:
        tr1 = str(trs[1].get_text()).split()

        # Find the location of the 2nd table tr2 (it varies). Reset tr2
        # first so a miss can never reuse the previous fight's row.
        tr2 = None
        for j in range(10):
            if str(trs[j].get_text()).split()[6] == 'Head':
                tr2 = str(trs[j + 1].get_text()).split()
                break
        if tr2 is None:
            raise LookupError('no Head header row in the first 10 rows')

        # Test for the end of names: k is the first token that parses as int
        k = 0
        while k < len(tr1):
            try:
                int(tr1[k])
                break
            except ValueError:
                k += 1

        # Build both fighters' rows inside the try so a short row is
        # reported as a bad call instead of aborting the whole chunk.
        fight_frames = []
        for side, (fighter_name, result) in enumerate(
                [(name_1, winloss_1), (name_2, winloss_2)]):
            row = {'name': [fighter_name], 'win/loss': result,
                   'referee': referee, 'round': rounds, 'method': method,
                   'fight_id': i, 'date': date, 'location': location,
                   'title': title}
            for field, offsets in tr1_offsets.items():
                row[field] = tr1[k + offsets[side]]
            for field, offsets in tr2_offsets.items():
                row[field] = tr2[k + offsets[side]]
            fight_frames.append(pd.DataFrame(row))
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the scrape
        print('%s bad call' % fight_dates[i])
        bad_call += 1
        continue

    # Add each fighter's information to the pending frames
    fighter_frames.extend(fight_frames)

fighter_df = pd.concat([fighter_df] + fighter_frames, axis=0, ignore_index=True)

fighter_df.to_csv('chunks/fights42.csv', index=False)

February 07, 1997 bad call
February 07, 1997 bad call
December 07, 1996 bad call
December 07, 1996 bad call
December 07, 1996 bad call
September 20, 1996 bad call
September 20, 1996 bad call
July 12, 1996 bad call
February 16, 1996 bad call
December 16, 1995 bad call
December 16, 1995 bad call
September 08, 1995 bad call
September 08, 1995 bad call
July 14, 1995 bad call
July 14, 1995 bad call
April 07, 1995 bad call
December 16, 1994 bad call
December 16, 1994 bad call
