In [3]:
import json
import requests
import pprint
import os
import re
import time
import datetime

## Part 1: Downloading data from API and joining in one big JSON file

In [13]:
start_time = time.time()
print "Starting at " + str(datetime.datetime.fromtimestamp(start_time))
BASE_URL = "http://www.playlivemanager.com/api/players/"
ROUND_URL_EXT = "/stats/round"
PLAYER_IDs = range(64400,64402)

Starting at 2016-02-04 12:46:09.657051


In [14]:
def query_by_id(playerID):
    print "Processing Player with ID " + playerID
    url = BASE_URL + playerID + ROUND_URL_EXT
    r = requests.get(url)
    print "requesting", r.url
    print r

    if r.status_code == requests.codes.ok:
        return r.json()
        print r.json()
    else:
        r.raise_for_status()

In [15]:
def download_data():
    for playerID in PLAYER_IDs:        
        results = query_by_id(str(playerID))
#        pprint.pprint(results)
        with open("../data/{0}.json".format(playerID), mode='w') as f:
            json.dump(results, f)
            print "Stored data for player " + str(playerID) + " in " + f.name

In [16]:
def transform_data():
    print "Deleting old files"
    try:
        os.remove("../data/all.json")
    except:
        pass
    print "Begin transforming data"
    for playerID in PLAYER_IDs:
        with open("../data/all.json", mode='a') as all:
            f = open("../data/{0}.json".format(playerID), 'r')
            print "Processing " + f.name
            file = f.read()
            file = file.strip()
            ## remove root
            file = re.sub('{\"status\": \"ok\", \"response\": \[', "", file)
            ## remove superfluous ']}' at the end and replace with ','
            file = re.sub("}}]}]}", "}}]},", file)
            #aggregate in one big json file
            #json.dump(file, all)
            all.write(file)
            #all.close()
            print "Added " + f.name + " to " + all.name

    # wrap content in '[]', replacing last ','
    with open("../data/allfinal.json", mode='w') as all:
        print "Wrapping file in json format"
        reader = open("../data/all.json", 'r').read()
        reader = '[' + reader
        reader = re.sub(r"\,$", r"]", reader)
        reader = reader.strip()
        all.write("%s\n" % reader)
    
    print "All data stored in " + all.name

In [374]:
download_data()

Processing Player with ID 64400
requesting http://www.playlivemanager.com/api/players/64400/stats/round
<Response [200]>
Stored data for player 64400 in data/64400.json
Processing Player with ID 64401
requesting http://www.playlivemanager.com/api/players/64401/stats/round
<Response [200]>
Stored data for player 64401 in data/64401.json


In [17]:
transform_data()

Deleting old files
Begin transforming data
Processing ../data/64400.json
Added ../data/64400.json to ../data/all.json
Processing ../data/64401.json
Added ../data/64401.json to ../data/all.json
Processing ../data/64402.json
Added ../data/64402.json to ../data/all.json
Processing ../data/64403.json
Added ../data/64403.json to ../data/all.json
Processing ../data/64404.json
Added ../data/64404.json to ../data/all.json
Processing ../data/64405.json
Added ../data/64405.json to ../data/all.json
Processing ../data/64406.json
Added ../data/64406.json to ../data/all.json
Processing ../data/64407.json
Added ../data/64407.json to ../data/all.json
Processing ../data/64408.json
Added ../data/64408.json to ../data/all.json
Processing ../data/64409.json
Added ../data/64409.json to ../data/all.json
Processing ../data/64410.json
Added ../data/64410.json to ../data/all.json
Processing ../data/64411.json
Added ../data/64411.json to ../data/all.json
Processing ../data/64412.json
Added ../data/64412.json to

## Part 2: Converting the JSON file to CSV

In [None]:
## TEMP: correct shitty formatting errors from API
## store command line commands for now, see if error still persists next week
#sed -r -i.bak 's/\[\]\}\]\}\{\"ass/\[\]\}\,\{\"ass/g' allfinal.json
#sed -r -i.bak 's/\[\]\}\]\}\{\"clean/\[\]\}\,\{\"clean/g' allfinal.json

In [21]:
## adapted rom https://gist.github.com/tjvc/12e393c48e0025509021

from collections import OrderedDict
import csv
import json
import sys

infile = "../data/allfinal.json"
outfile = open("../data/playerdata.csv", "w")

writer = csv.writer(outfile, delimiter=",")

data = json.load(open(infile), object_pairs_hook=OrderedDict)

# Recursively flatten JSON
def flatten(structure, key="", path="", flattened=None): 
    if flattened is None:
        flattened = OrderedDict()    
    if type(structure) not in(OrderedDict, list):
        flattened[((path + "_") if path else "") + key] = structure    
    elif isinstance(structure, list):
        for i, item in enumerate(structure):
            flatten(item, "", path + "_" + key, flattened)    
    else:
        for new_key, value in structure.items():
            flatten(value, new_key, path + "_" + key, flattened)    
    return flattened

# Write fields
fields = []
for result in data:
    flattened = flatten(result)
    for k, v in flattened.iteritems():
        if k not in fields:
            fields.append(k)
writer.writerow(fields)
print "Flattened JSON to the following fields:"
print fields

# Write values
for result in data:
    flattened = flatten(result)
    row = []
    for field in fields:
        if field in flattened.iterkeys():
            row.append(flattened[field])
        else:
            row.append("")
    writer.writerow(row)
print "Stored CSV as " + outfile.name

Flattened JSON to the following fields:
[u'__assist', u'__clean_sheet', u'__goal', u'__matchday', u'__game_play_duration', u'__total_earnings', u'__matches_info__away_shortname', u'__matches_info__away_squad', u'__matches_info__away_score', u'__matches_info__time_on_pitch', u'__matches_info__player_status_status', u'__matches_info__player_status_id', u'__matches_info__player_status_match_id', u'__matches_info__period', u'__matches_info__finished', u'__matches_info__home_shortname', u'__matches_info__home_squad', u'__matches_info__home_score', u'__matches_info__player_status_in_for_player', u'__matches_info__player_status_sub_in_timestamp', u'__matches_info__player_status_sub_in_minutes', u'__matches_info__player_status_sub_out_minutes', u'__matches_info__player_status_sub_out_timestamp', u'__shot_on_target', u'__attempt_saved', u'__successful_pass']
Stored CSV as ../data/playerdata.csv


In [450]:
print "Started at " + str(datetime.datetime.fromtimestamp(start_time))
print "Finished at " + str(datetime.datetime.fromtimestamp(time.time()))

Started at 2015-12-10 19:27:24.402078
Finished at 2015-12-10 19:42:28.024259
