In [6]:
#------------------standard packages-----------------
import numpy as np
import pandas as pd
import datetime as dt
from tqdm.notebook import tqdm
from thefuzz import fuzz, process

#-----------------load_bz2 packages------------------
import os
from typing import List
from unittest.mock import patch
import tarfile
import zipfile
import bz2
import glob

#-----------------streaming packages-----------------
import logging
import betfairlightweight
from betfairlightweight import StreamListener

#-----------------caio's modules---------------------
from cricsheet_read import cricsheet_read #data processing

#----------------------load bz2----------------------
def load_bz2(file_paths: List[str]): #the path directories to the data sets
    """Yield an open, decompressing file object for each bz2 file in *file_paths*.

    Each entry of *file_paths* may be:
      * a directory, searched recursively for ``*.bz2`` files;
      * a ``.tar`` archive, whose regular-file members are treated as bz2 data;
      * a ``.zip`` archive, whose non-directory entries are treated as bz2 data.
    Paths that do not exist, or files with any other extension, are skipped.

    Every yielded object is only valid until the generator is advanced: it is
    closed (together with the underlying archive member) as soon as the
    consumer asks for the next one.
    """
    for file_path in file_paths:     #accepts tar files, zipped files or directory with bz2 file(s)
        if os.path.isdir(file_path):
            # join with a real separator so 'data' does not also match 'data_old/',
            # and use a single '**' so nested files are not reported more than once
            pattern = os.path.join(glob.escape(file_path), '**', '*.bz2')
            for path in glob.iglob(pattern, recursive=True):
                with bz2.BZ2File(path, 'rb') as f:
                    yield f
        elif os.path.isfile(file_path):
            ext = os.path.splitext(file_path)[1].lower()
            # iterate through a tar archive
            if ext == '.tar':
                with tarfile.open(file_path) as archive:
                    for member in archive:
                        # directories/links have no data stream (extractfile returns None)
                        if not member.isfile():
                            continue
                        with archive.extractfile(member) as raw, bz2.open(raw) as f:
                            yield f
            # or a zip archive
            elif ext == '.zip':
                with zipfile.ZipFile(file_path) as archive:
                    for info in archive.infolist():
                        if info.is_dir():
                            continue
                        with archive.open(info) as raw, bz2.open(raw) as f:
                            yield f

#----------------betfair wrangle--------------------
def betfair_wrangle(file_name_list = None, results = None):
    """Build a match_id -> Betfair starting-odds table and save it as a csv.

    Streams every bz2 market file found in the given tar archives through
    betfairlightweight's historical stream, keeps two-runner markets, takes the
    last traded prices on the first update where a market turns in-play, and
    fuzzy-matches each market to a row of *results* by date and match name.

    Parameters
    ----------
    file_name_list : list of str, optional
        Archive names (without extension), read from '../../data/<name>.tar'.
        Defaults to ['data_post_apr20', 'data_pre_apr20'].
    results : pandas.DataFrame, optional
        Match table indexed by match id; the 'start_date', 'match_name',
        'set_team' and 'chase_team' columns are used. Defaults to
        cricsheet_read()[1], evaluated when the function is called (not when
        it is defined).

    Returns
    -------
    None
        The result is written to '../../data/betfair_data.csv' (index
        'match_id', columns 'set_odds' and 'chase_odds') and the share of
        matches that received odds is printed.
    """
    # resolve defaults lazily: no shared mutable default list, and the
    # cricsheet data is not loaded just because the function was defined
    if file_name_list is None:
        file_name_list = ['data_post_apr20', 'data_pre_apr20']
    if results is None:
        results = cricsheet_read()[1]

    #-----------------betfair login---------------------
    logging.basicConfig(level = logging.ERROR)
    # placeholder credentials: no login call is made anywhere in this function
    trading = betfairlightweight.APIClient('username', 'password', 'app_key')
    listener = StreamListener(max_latency=None)

    #----------------get attributes---------------------
    def market_row(market_book):
        """Flatten one market book of a two-runner market into a dict row."""
        definition_runners = market_book.market_definition.runners
        book_runners = market_book.runners

        return {
            'market_id': market_book.market_id,
            'start_date': market_book.publish_time,
            'inplay': market_book.inplay,
            #blast name fix for warwickshire
            'team_h': definition_runners[0].name.replace('Birmingham Bears', 'Warwickshire'),
            'team_a': definition_runners[1].name.replace('Birmingham Bears', 'Warwickshire'),
            # a missing (None/0) last traded price becomes NaN so dropna() removes the market
            'team_h_sp': book_runners[0].last_price_traded or np.nan,
            'team_a_sp': book_runners[1].last_price_traded or np.nan,
        }

    #-----------------process files---------------------
    file_path_list = ['../../data/' + file_name + '.tar' for file_name in file_name_list]
    # first pass only counts the files for the progress bar; counting lazily
    # avoids holding every opened bz2 object in a list at once
    no_files = sum(1 for _ in load_bz2(file_path_list))

    match_list = []
    for file in tqdm(load_bz2(file_path_list), total = no_files, desc = 'processing bz2s'):

        stream = trading.streaming.create_historical_generator_stream(file_path = file, listener = listener)

        # presumably the stream calls open(file_path, mode); the patch hands back
        # the already-open bz2 object instead of opening a path
        with patch("builtins.open", lambda f, _: f):
            gen = stream.get_generator()

            match_list.extend(market_row(market_books[0]) for market_books in gen()
                              if len(market_books[0].runners) == 2)

    #---------------get starting prices-----------------
    betfair_data = pd.DataFrame(match_list).sort_values(['market_id', 'start_date'])
    betfair_data['start_date'] = betfair_data['start_date'].dt.date
    betfair_data.insert(2, 'match_name', betfair_data['team_h'] + ' v ' + betfair_data['team_a'])

    # keep the first row per market on which 'inplay' flips to True
    # NOTE(review): shift(1) is not grouped by market_id, so a market's first
    # row is compared with the previous market's last row — confirm intended
    turned_inplay = ((betfair_data['inplay'].shift(1) != betfair_data['inplay']) &
                     (betfair_data['inplay'] == True))
    betfair_data = (betfair_data[turned_inplay]
                    .drop('inplay', axis = 1)
                    .drop_duplicates('market_id')
                    .set_index('market_id', drop = True)
                    .dropna())

    #----------------map to master data-----------------
    post_may_15 = results[results['start_date'] >= dt.date(2015, 5, 1)].copy()
    # apply the warwickshire name fix to every column that is fuzzy-matched
    # against betfair names, so the team lookup agrees with the runner names
    for col in ['match_name', 'set_team', 'chase_team']:
        post_may_15[col] = post_may_15[col].str.replace('Birmingham Bears', 'Warwickshire', regex = False)

    # market_id -> {team name: starting price}
    team_dict = {market_id: {team_h: team_h_sp, team_a: team_a_sp}
                 for market_id, team_h, team_h_sp, team_a, team_a_sp in
                 zip(betfair_data.index, betfair_data['team_h'], betfair_data['team_h_sp'],
                     betfair_data['team_a'], betfair_data['team_a_sp'])}

    def fuzz_row(match_id):
        """Return match_id/set_odds/chase_odds for one match, all NaN when no market matches."""
        row_dict = {}

        # candidate markets: those less than one day away from the match date
        nearby_matches = betfair_data[abs(betfair_data['start_date'] - post_may_15['start_date'].loc[match_id]).dt.days < 1]

        match_found = process.extractOne(post_may_15['match_name'].loc[match_id], nearby_matches['match_name'],
                                         score_cutoff = 90, scorer = fuzz.token_set_ratio)
        if match_found is not None:

            row_dict['match_id'] = match_id
            # match_found[2] is the key of the matched choice, i.e. the market_id index label
            runners = team_dict[match_found[2]]
            for team in ['set_', 'chase_']:
                # cutoff 0: always pick whichever of the two runners is closest to the team name
                row_dict[team + 'odds'] = runners[process.extractOne(post_may_15[team + 'team'].loc[match_id], runners.keys(),
                                                                     score_cutoff = 0, scorer = fuzz.token_set_ratio)[0]]
        else:
            for col in ['match_id', 'set_odds', 'chase_odds']:
                row_dict[col] = np.nan

        return row_dict

    betfair_data = pd.DataFrame([fuzz_row(row) for row in
                                 tqdm(post_may_15.index, desc = 'mapping marktet_ids to match_id')])

    # share of post-May-2015 matches for which a betfair market was found
    matched = betfair_data['match_id'].notna().sum()
    print('master_data odds percentage:', str(np.round((matched/len(betfair_data))*100,2)) + '%')
    betfair_data = betfair_data.dropna().set_index('match_id')

    #---------------------save csv----------------------
    betfair_data.to_csv('../../data/betfair_data.csv')
#---------------------------------------------------
# run the whole pipeline with the default archives and results table;
# the output is written to ../../data/betfair_data.csv
betfair_wrangle()

processing bz2s:   0%|          | 0/10598 [00:00<?, ?it/s]

mapping marktet_ids to match_id:   0%|          | 0/2217 [00:00<?, ?it/s]

master_data odds percentage: 91.52%
