In [47]:
import json
import re

import requests
import scrapy

In [48]:
headers = {'User-Agent': 'UNC Journo Class'}

In [49]:
#get the url from the volleyball roster 

In [50]:
base_url = 'http://goheels.com'
url = base_url + '//roster.aspx?path=wvball'

In [51]:
resp = requests.get(url, headers=headers)

In [52]:
body_str = resp.content.decode('utf-8')

In [53]:
sel = scrapy.Selector(text=body_str)

In [54]:
#find the table of data

In [55]:
table = sel.css('table')[0]


In [56]:
cols = table.css('th').xpath('string()').extract()

In [57]:
rows = table.css('tr')[1:]

In [58]:
#build a player list and a data library. iterate through the data to get all the names & info. codes reference from Scrape_Heels_Baseball.ipynb

In [59]:
players = []
for r in rows:
    data = {}
    for i, d in enumerate(r.css('td')):
        link = d.css('a')
        if link:
            t = link.xpath('text()').extract()[0]
            data['href'] = link.xpath('@href').extract()[0]
        else:
            t = d.xpath('text()').extract()
        data[cols[i]] = t
    players.append(data)

In [None]:
#get the bio from all players. codes reference from Scrape_Heels_Baseball.ipynb

In [60]:
def fetch_bio(player):
    player_url = base_url + player['href']
    resp = requests.get(player_url, headers=headers)
    player_txt = resp.content.decode('utf-8')
    sel = scrapy.Selector(text=player_txt)
    player['sel'] = sel
    player['bio'] = sel.css('#sidearm-roster-player-bio').xpath('string()').extract()[0]
    player['img'] = sel.css('.sidearm-roster-player-image img').xpath('@src').extract()[0]

In [61]:
#get stats of the players. all these codes come from Scrape_Heels_Baseball.ipynb 

In [25]:
js_obj_rx = re.compile(r'.*?responsive-roster-bio\.ashx.*?(?P<obj>{.*?})')

In [26]:
def fetch_stats(player):
    text = player['sel'].xpath('string()').extract()[0]
    parts = text.split('$.getJSON("/services/')[1:]
    captured = js_obj_rx.findall(''.join(parts))
    clean_objs = []
    for obj_str in captured:
        # We only want the stats object...
        if 'stats' not in obj_str:
            continue

        obj_str = obj_str.replace('{', '').replace('}', '')
        obj_str = obj_str.replace("'", '').replace('"', '')
        obj_pairs = obj_str.split(',')
        obj_pairs = [x.split(":") for x in obj_pairs]
        clean_pairs = []
        for pair in obj_pairs:
            clean_pairs.append(['"{}"'.format(p.strip()) for p in pair])
        colonized = [":".join(p) for p in clean_pairs]
        commas = ','.join(colonized)
        json_str = "{" + commas + "}"
        clean_objs.append(json.loads(json_str))
    
    player['stats_url'] = stats_url = (
        "http://goheels.com/services/responsive-roster-bio.ashx?"
        "type={type}&rp_id={rp_id}&path={path}&year={year}"
        "&player_id={player_id}"
    ).format(**clean_objs[0])

    resp = requests.get(stats_url, headers=headers)
    json_stats = json.loads(resp.content.decode("utf-8"))
    player['raw_stats'] = json_stats

In [None]:
#fetch everything. this takes forever on my old computer. codes reference from Scrape_Heels_Baseball.ipynb

In [65]:
for p in players:
    fetch_bio(p)
    fetch_stats(p)

In [66]:
#get what I'll be working on

In [67]:
txt = p['raw_stats']['career_stats']
sel = scrapy.Selector(text=txt)

In [68]:
#get the player's name

In [69]:
def get_name(player):
        return player['Full Name']

In [70]:
#get the year of the data

In [71]:
def get_data_year(player):
        txt = player['raw_stats']['career_stats']
        sel = scrapy.Selector(text=txt)
        table = sel.css('.sidearm-table')
        chart_col=table.css('tr')[1].css('th').xpath('string()').extract()
        return chart_col


In [76]:
#get the colums of the data

In [77]:
def get_col(player):
        txt = player['raw_stats']['career_stats']
        sel = scrapy.Selector(text=txt)
        table = sel.css('.sidearm-table')
        chart_col=table.css('tr')[3].css('th').xpath('string()').extract()
        chart_col.pop(0)
        return chart_col


In [79]:
#get data points

In [80]:
def get_data(player):
    txt = player['raw_stats']['career_stats']
    sel = scrapy.Selector(text=txt)
    table = sel.css('.sidearm-table')
    chart_data=table.css('tr')[2].css('td').xpath('string()').extract()
    return chart_data

In [84]:
#get the second row of data

In [85]:
def get_data_2(player):
    txt = player['raw_stats']['career_stats']
    sel = scrapy.Selector(text=txt)
    table = sel.css('.sidearm-table')
    chart_data=table.css('tr')[4].css('td').xpath('string()').extract()
    return chart_data

In [89]:
#get the title of the chart

In [90]:
def get_chart_title_1(player):
    txt = player['raw_stats']['career_stats']
    sel = scrapy.Selector(text=txt)
    caption = sel.css('.hide')
    title = caption.css('caption').xpath('string()').extract()
    title.pop(1)
    return title
    

In [100]:
#get the title of the other chart

In [92]:
def get_chart_title_2(player):
    txt = player['raw_stats']['career_stats']
    sel = scrapy.Selector(text=txt)
    caption = sel.css('.hide')
    title = caption.css('caption').xpath('string()').extract()
    title.pop(0)
    return title
    

In [101]:
#start parsing!!!

In [102]:
player_stats = []

In [103]:
def parse_stats(player):
    for raw_key, raw_val in player['raw_stats'].items():
        txt = player['raw_stats'][raw_key]
        if not txt:
            print('Skipping {} for {}'.format(raw_key, player['Full Name']))
            continue
    txt = player['raw_stats']['career_stats']
    sel = scrapy.Selector(text=txt)
    player_stats.append({
            "name":get_name(player),
            "Chart#1":get_chart_title_1(player),
            "cols_1":get_col(player),
            "cols_2":get_data(player),
            "Chart#2":get_chart_title_2(player),
            "cols_1":get_col_2(player),
            "cols_2":get_data_2(player),
            
        })
    print(player_stats)
    return player_stats
    
        


In [97]:
for p in players:
    parse_stats(p)

Skipping current_stats for Mariah Evans
Skipping gamehigh_stats for Mariah Evans
[{'name': 'Mariah Evans', 'Chart#1': ['Career Offensive Statistics'], 'cols_1': ['DIG', 'D/S', 'RE', 'BS', 'BA', 'TB', 'B/S', 'BE', 'BHE', 'PTS', 'PTS/S'], 'cols_2': ['168', '1.38', '0', '0', '2', '2', '0.02', '0', '27', '25.0', '0.20'], 'Chart#2': ['Career Defensive Statistics']}]
Skipping current_stats for Kendra Koetter
Skipping gamehigh_stats for Kendra Koetter
[{'name': 'Mariah Evans', 'Chart#1': ['Career Offensive Statistics'], 'cols_1': ['DIG', 'D/S', 'RE', 'BS', 'BA', 'TB', 'B/S', 'BE', 'BHE', 'PTS', 'PTS/S'], 'cols_2': ['168', '1.38', '0', '0', '2', '2', '0.02', '0', '27', '25.0', '0.20'], 'Chart#2': ['Career Defensive Statistics']}, {'name': 'Kendra Koetter', 'Chart#1': ['Career Offensive Statistics'], 'cols_1': ['DIG', 'D/S', 'RE', 'BS', 'BA', 'TB', 'B/S', 'BE', 'BHE', 'PTS', 'PTS/S'], 'cols_2': ['146', '1.54', '1', '0', '0', '0', '0.00', '2', '18', '24.0', '0.25'], 'Chart#2': ['Career Defensive

ValueError: Selector needs either text or root argument

In [None]:
#dump everything to the jason file

In [98]:
to_dump = player_stats
with open('scraped_players.json', 'w') as f:
    json.dump(to_dump, f)

In [None]:
#I don't know what this line does but I copied it from Scrape_Heels_Baseball.ipynb

In [99]:
cat scraped_players.json | cut -c 1-100

[{"name": "Mariah Evans", "Chart#1": ["Career Offensive Statistics"], "cols_1": ["DIG", "D/S", "RE",
