# Knihovny

In [3]:
import json
import pandas as pd
from datetime import datetime, timedelta
import os

# Funkce

In [4]:
def create_dataframes_dict(bookmaker_names):
    """Load each bookmaker's scraped events into a cleaned DataFrame.

    For every name in ``bookmaker_names`` the file
    ``../data/data_<name>.json`` (relative to the current working
    directory) is read, reduced to the event columns listed below,
    de-duplicated and extended with a ``bet_list`` column holding the
    eight bet values of the row as one tuple. Rows whose ``sport_name``
    is ``'special'`` are discarded.

    Args:
        bookmaker_names: Iterable of bookmaker names used in the file names.

    Returns:
        Dict mapping each bookmaker name to its DataFrame, indexed 0..n-1.

    Raises:
        FileNotFoundError: If a bookmaker's data file does not exist.
        KeyError: If a data file lacks one of the expected columns.
    """
    bet_columns = [
        'bet_1',
        'bet_0',
        'bet_2',
        'bet_10',
        'bet_02',
        'bet_12',
        'bet_11',
        'bet_22',
    ]
    columns = [
        'bookmaker_id',
        'bookmaker_name',
        'sport_name',
        'sport_detail',
        'country_name',
        'event_startTime',
        'participant_home_list',
        'participant_away_list',
        'participants_gender',
        'participants_age',
        *bet_columns,
        'event_id',
        'event_url',
    ]

    dataframes_dict = {}
    for bookmaker_name in bookmaker_names:
        file_path = os.path.join("..", "data", f"data_{bookmaker_name}.json")
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        df = pd.DataFrame(data)[columns]
        # Lists are unhashable, so turn them into tuples before de-duplicating.
        df = df.apply(
            lambda col: col.map(lambda x: tuple(x) if isinstance(x, list) else x)
        ).drop_duplicates()
        df['event_startTime'] = pd.to_datetime(df['event_startTime'])
        # One tuple per row, in the order given by bet_columns.
        df['bet_list'] = tuple(zip(*(df[name] for name in bet_columns)))
        dataframes_dict[bookmaker_name] = df.loc[df['sport_name'] != 'special'].reset_index(drop=True)
    return dataframes_dict

def filter_only_close_days(number_of_days, dataframes_dict):
    """Keep only events that start within the next ``number_of_days`` days.

    The window opens at 00:00 of the current local day (``datetime.now()``)
    and closes at the end of the day ``number_of_days`` days later, so
    ``0`` keeps today's events only.

    Args:
        number_of_days: Number of days after today to include; anything
            accepted by ``int()``.
        dataframes_dict: Mapping of bookmaker name to a DataFrame with an
            ``event_startTime`` datetime column.

    Returns:
        The same dict object, with every value replaced by an independent
        filtered copy of the original frame (original index labels kept).
    """
    number_of_days = int(number_of_days)
    start_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    # Exclusive bound at the following midnight. The previous inclusive bound
    # of 23:59:59 kept the current microsecond, so the cutoff inside the last
    # second of the day changed from run to run.
    end_date = start_date + timedelta(days=number_of_days + 1)
    for bookmaker_name, df in dataframes_dict.items():
        start_times = df['event_startTime']
        in_window = (start_times >= start_date) & (start_times < end_date)
        # .copy() so the result can be modified in place without pandas
        # treating it as a slice of the unfiltered frame.
        dataframes_dict[bookmaker_name] = df[in_window].copy()
    return dataframes_dict

def clean_participant_names(dataframes_dict, printIndexDrops = False):
    """Remove participant words that a bookmaker's look-alike events share.

    Within each bookmaker's frame, two events are look-alikes when they have
    the same sport, start time, gender and age, compatible sport detail and
    country ('other' is compatible with anything), at least one common word
    in both the home and the away participant list, and different URLs.
    Every word found in the look-alikes' lists is removed from the event's
    own lists. If that empties either list the event is dropped; otherwise
    the shortened lists replace the originals.

    The frames are modified in place and re-indexed from 0.

    Args:
        dataframes_dict: Mapping of bookmaker name to its events DataFrame.
        printIndexDrops: When true, print the dropped index labels per
            bookmaker.

    Returns:
        The same ``dataframes_dict`` object.
    """
    home_col, away_col = 'participant_home_list', 'participant_away_list'
    home_tmp, away_tmp = 'participant_home_list_smaller', 'participant_away_list_smaller'

    def wildcard_equal(column, wanted):
        # Equal values match, and 'other' on either side matches anything.
        return column.apply(lambda value: value == wanted or value == 'other' or wanted == 'other')

    def shares_word(column, words):
        # True where the cell has at least one word in common with `words`.
        return column.apply(lambda candidate: bool(set(candidate) & set(words)))

    def prefer_reduced(reduced_col, full_col):
        # Row function: take the shortened list when one was stored.
        return lambda event: event[reduced_col] if event[reduced_col] is not None else event[full_col]

    for bookmaker_name, frame in dataframes_dict.items():
        # Scratch columns; None means "no shortened list for this row".
        frame[home_tmp] = None
        frame[away_tmp] = None
        rows_to_remove = set()

        for label, event in frame.iterrows():
            is_lookalike = frame['sport_name'] == event['sport_name']
            is_lookalike = is_lookalike & wildcard_equal(frame['sport_detail'], event['sport_detail'])
            is_lookalike = is_lookalike & wildcard_equal(frame['country_name'], event['country_name'])
            is_lookalike = is_lookalike & (frame['event_startTime'] == event['event_startTime'])
            is_lookalike = is_lookalike & shares_word(frame[home_col], event[home_col])
            is_lookalike = is_lookalike & shares_word(frame[away_col], event[away_col])
            is_lookalike = is_lookalike & (frame['participants_gender'] == event['participants_gender'])
            is_lookalike = is_lookalike & (frame['participants_age'] == event['participants_age'])
            # A differing URL also keeps the event from matching itself.
            is_lookalike = is_lookalike & (frame['event_url'] != event['event_url'])

            lookalikes = frame[is_lookalike]
            if len(lookalikes) == 0:
                continue

            foreign_home = {word for words in lookalikes[home_col] for word in words}
            foreign_away = {word for words in lookalikes[away_col] for word in words}
            reduced_home = tuple(word for word in event[home_col] if word not in foreign_home)
            reduced_away = tuple(word for word in event[away_col] if word not in foreign_away)

            if reduced_home and reduced_away:
                frame.at[label, home_tmp] = reduced_home
                frame.at[label, away_tmp] = reduced_away
            else:
                # Nothing distinctive is left on at least one side.
                rows_to_remove.add(label)

        if printIndexDrops:
            print(bookmaker_name, rows_to_remove)

        frame.drop(index = rows_to_remove, inplace = True)
        frame[home_col] = frame.apply(prefer_reduced(home_tmp, home_col), axis=1)
        frame[away_col] = frame.apply(prefer_reduced(away_tmp, away_col), axis=1)
        frame.drop(columns=[home_tmp, away_tmp], inplace=True)
        frame.reset_index(drop = True, inplace = True)

    return dataframes_dict

def create_events_table(dataframes_dict, dropBets = True):
    """Merge the per-bookmaker event rows into one table of cross-bookmaker events.

    Every row of the concatenated input is visited in turn. If its URL is not
    yet recorded in any row of the result, the frames of the other bookmakers
    are searched for an event with the same sport, start time, gender and age,
    a compatible sport detail and country ('other' is compatible with
    anything) and at least one common word in both the home and the away
    participant list.

    * No match: that bookmaker is marked as searched.
    * Exactly one match: it is merged into the event being built (the
      participant word lists are unioned, 'other' detail/country is replaced
      by the match's value) and every bookmaker still without a URL is
      searched again with the widened lists.
    * Several matches: treated as ambiguous. Each candidate is appended to
      the result as its own row with ``events_id`` 'error', and the event
      being built ends up with ``bookmaker_list`` ``('error',)`` and the
      string 'error' as that bookmaker's URL.

    Args:
        dataframes_dict: Mapping of bookmaker name to its events DataFrame.
            The columns read here are sport_name, sport_detail, country_name,
            event_startTime, participant_home_list, participant_away_list,
            participants_gender, participants_age, bookmaker_name, bet_list,
            event_id and event_url.
        dropBets: When true, the ``bet_dict`` column is removed from the
            result and its index is reset.

    Returns:
        DataFrame with one row per merged event (plus the 'error' rows) and
        the columns events_id, bookmaker_list, sport_name, sport_detail,
        country_name, event_startTime, participant_home_list,
        participant_away_list, participants_gender, participants_age,
        event_id_dict, event_url_dict and, unless ``dropBets``, bet_dict.
        The ``*_dict`` cells are dicts keyed by bookmaker name.
    """
    # All bookmakers' rows in one frame; the loop below walks it in this order.
    df_all = pd.concat(dataframes_dict.values(), axis=0).reset_index(drop=True)
    bookmaker_names = dataframes_dict.keys()
    
    # Result table, grown one row at a time.
    events = pd.DataFrame(columns=[
        'events_id',
        'bookmaker_list',
        'sport_name',
        'sport_detail',
        'country_name',
        'event_startTime',
        'participant_home_list',
        'participant_away_list',
        'participants_gender',
        'participants_age',
        'bet_dict',
        'event_id_dict',
        'event_url_dict'
    ])
    # Diagnostic records for ambiguous matches; filled below but not returned.
    errors = []
        
    for idx, row in df_all.iterrows():

        # Rows of the result whose URL dict already contains this row's URL.
        url_events = events[events['event_url_dict'].apply(lambda x: row['event_url'] in x.values())]
        if url_events.empty: # this event's URL is not in the events table yet
            isError = False

            # Working values; refined as matches from other bookmakers are merged in.
            sport_detail = row['sport_detail']
            country_name = row['country_name']
            participant_home_list = row['participant_home_list']
            participant_away_list = row['participant_away_list']
            # One slot per bookmaker, pre-filled with "nothing found".
            bet_dict = {item: tuple() for item in bookmaker_names}
            bet_dict[row['bookmaker_name']] = row['bet_list']
            event_id_dict = {item: None for item in bookmaker_names}
            event_id_dict[row['bookmaker_name']] = row['event_id']
            event_url_dict = {item: None for item in bookmaker_names}
            event_url_dict[row['bookmaker_name']] = row['event_url']

            # True = this bookmaker does not need to be searched (for now).
            bookmaker_search_dict = {item: False for item in bookmaker_names}
            bookmaker_search_dict[row['bookmaker_name']] = True
            not_searched_bookmakers_list = [key for key, value in bookmaker_search_dict.items() if value is False]

            while not_searched_bookmakers_list:
                bookmaker_name_searching = not_searched_bookmakers_list[0]
                bookmaker_df_searching = dataframes_dict[bookmaker_name_searching]
                # Candidates in the searched frame; 'other' acts as a wildcard for
                # sport detail and country, and participants match on any shared word.
                match_df = bookmaker_df_searching[
                    (bookmaker_df_searching['sport_name'] == row['sport_name']) &
                    bookmaker_df_searching['sport_detail'].apply(lambda x: x == sport_detail or x == 'other' or sport_detail == 'other') &
                    bookmaker_df_searching['country_name'].apply(lambda x: x == country_name or x == 'other' or country_name == 'other') &
                    (bookmaker_df_searching['event_startTime'] == row['event_startTime']) &
                    bookmaker_df_searching['participant_home_list'].apply(lambda x: bool(set(x) & set(participant_home_list))) &
                    bookmaker_df_searching['participant_away_list'].apply(lambda x: bool(set(x) & set(participant_away_list))) &
                    (bookmaker_df_searching['participants_gender'] == row['participants_gender']) &
                    (bookmaker_df_searching['participants_age'] == row['participants_age'])
                ]
                if len(match_df) == 0: # no match in the searched frame
                    bookmaker_search_dict[bookmaker_name_searching] = True
                    not_searched_bookmakers_list = [key for key, value in bookmaker_search_dict.items() if value is False]
                elif len(match_df) == 1: # exactly one match in the searched frame
                    match = match_df.iloc[0]
                    if sport_detail == 'other':
                        sport_detail = match['sport_detail']
                    if country_name == 'other':
                        country_name = match['country_name']
                    # Union of the known words for each side (order is not preserved by set()).
                    participant_home_list = tuple(set(participant_home_list + match['participant_home_list']))
                    participant_away_list = tuple(set(participant_away_list + match['participant_away_list']))
                    bet_dict[bookmaker_name_searching] = match['bet_list']
                    event_id_dict[bookmaker_name_searching] = match['event_id']
                    event_url_dict[bookmaker_name_searching] = match['event_url']
                    # The search criteria may have widened, so every bookmaker that still
                    # has no URL is searched again, including ones already tried.
                    bookmaker_search_dict = {key: (value is not None) for key, value in event_url_dict.items()}
                    not_searched_bookmakers_list = [key for key, value in bookmaker_search_dict.items() if value is False]
                else: # more than one match in the searched frame -> error
                    # NOTE: the two dicts are stored by reference and keep changing afterwards.
                    error_dict = {
                        'sport_name': row['sport_name'],
                        'sport_detail': sport_detail,
                        'country_name': country_name,
                        'event_startTime': row['event_startTime'],
                        'participant_home_list': participant_home_list,
                        'participant_away_list': participant_away_list,
                        'participants_gender': row['participants_gender'],
                        'participants_age': row['participants_age'],
                        'event_id_dict': event_id_dict,
                        'event_url_dict': event_url_dict,
                        'match_df': match_df
                    }
                    errors.append(error_dict)

                    isError = True
                    # 'error' is not None, so a later re-search pass skips this bookmaker.
                    event_url_dict[bookmaker_name_searching] = 'error'
                    bookmaker_search_dict[bookmaker_name_searching] = True
                    not_searched_bookmakers_list = [key for key, value in bookmaker_search_dict.items() if value is False]
                    # Record every ambiguous candidate as its own 'error' row; its URL is
                    # then present in events, so the candidate is skipped when the outer
                    # loop reaches it.
                    for error_idx, error_row in match_df.iterrows():
                        new_error_row = {
                            'events_id': 'error',
                            'bookmaker_list': ('error',),
                            'sport_name': error_row['sport_name'],
                            'sport_detail': error_row['sport_detail'],
                            'country_name': error_row['country_name'],
                            'event_startTime': error_row['event_startTime'],
                            'participant_home_list': error_row['participant_home_list'],
                            'participant_away_list': error_row['participant_away_list'],
                            'participants_gender': error_row['participants_gender'],
                            'participants_age': error_row['participants_age'],
                            'bet_dict': {item: tuple() for item in bookmaker_names},
                            'event_id_dict': {item: None for item in bookmaker_names},
                            'event_url_dict': {item: None for item in bookmaker_names}
                        }
                        new_error_row['bet_dict'][bookmaker_name_searching] = error_row['bet_list']
                        new_error_row['event_id_dict'][bookmaker_name_searching] = error_row['event_id']
                        new_error_row['event_url_dict'][bookmaker_name_searching] = error_row['event_url']
                        new_error_row_df = pd.DataFrame([new_error_row])
                        events = pd.concat([events, new_error_row_df], ignore_index=True)
            if isError:
                bookmaker_list = ('error',)
            else:
                bookmaker_list = tuple([key for key, value in event_url_dict.items() if value is not None])
            # Combined id from the ids that were found; assumes event ids are strings.
            events_id = '_'.join([item for item in event_id_dict.values() if item is not None])
            new_row = {
                'events_id': events_id,
                'bookmaker_list': bookmaker_list,
                'sport_name': row['sport_name'],
                'sport_detail': sport_detail,
                'country_name': country_name,
                'event_startTime': row['event_startTime'],
                'participant_home_list': participant_home_list,
                'participant_away_list': participant_away_list,
                'participants_gender': row['participants_gender'],
                'participants_age': row['participants_age'],
                'bet_dict': bet_dict,
                'event_id_dict': event_id_dict,
                'event_url_dict': event_url_dict
            }
            new_row_df = pd.DataFrame([new_row])
            # events = pd.concat([events, new_row_df], ignore_index=True)
            # Leave out frames that are empty or entirely NA (events is empty on the first pass).
            dfs_to_concat = [df for df in [events, new_row_df] if not df.empty and not df.isna().all().all()]
            events = pd.concat(dfs_to_concat, ignore_index=True)
    
    if dropBets:
        events.drop('bet_dict', axis=1, inplace=True)
        events.reset_index(drop = True, inplace = True)
    return events

# Main

In [5]:
# import subprocess


# subprocess.run(["python", "crawler.py"])

In [6]:
# Load the original per-bookmaker event data into DataFrames.

bookmaker_names = ['betano', 'betx', 'forbet', 'fortuna', 'kingsbet', 'merkur', 'sazka', 'synottip', 'tipsport']
dataframes_dict = create_dataframes_dict(bookmaker_names)

In [7]:
# Keep only a limited number of days following the day the data was downloaded
# (0 keeps today's events only).

number_of_days = 0
dataframes_dict = filter_only_close_days(number_of_days, dataframes_dict)

In [8]:
# Strip surplus words from the lists that hold the individual team names.

printIndexDrops = False
dataframes_dict = clean_participant_names(dataframes_dict, printIndexDrops)

In [9]:
# vytvorit dataframe obsahujici vsechny puvodni eventy

# df_all = pd.concat(dataframes_dict.values(), axis=0).reset_index(drop=True)
# len(df_all)

In [10]:
# Build the merged table covering all original events.

dropBets = True
events = create_events_table(dataframes_dict, dropBets)

In [11]:
# Set aside the events that were flagged with an error.

# events_with_error = events[events['bookmaker_list'].apply(lambda x: 'error' in x)]
events = events[events['bookmaker_list'].apply(lambda x: 'error' not in x)]

In [12]:
# Set aside the events that did not link up with any other bookmaker.

# events_singlies = events[events['bookmaker_list'].apply(lambda x: len(x) == 1)]
events = events[events['bookmaker_list'].apply(lambda x: len(x) != 1)]

In [13]:
# Jobs for the detail spiders, grouped by bookmaker: every URL an event has
# at a bookmaker goes into that bookmaker's list together with the sport.
lists_for_detail_spiders_dict = {}
for bookmaker_name in bookmaker_names:
    lists_for_detail_spiders_dict.setdefault(bookmaker_name, [])

for sport_name, event_url_dict in zip(events['sport_name'], events['event_url_dict']):
    for bookmaker_name, event_url in event_url_dict.items():
        if event_url is None:
            # This bookmaker does not offer the event.
            continue
        lists_for_detail_spiders_dict[bookmaker_name].append(
            {'sport_name': sport_name, 'event_url': event_url}
        )

In [14]:
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from betscraper.spiders import spider_betano_detail, spider_betx_detail, spider_forbet_detail, spider_fortuna_detail, spider_kingsbet_detail, spider_merkur_detail, spider_sazka_detail, spider_synottip_detail, spider_tipsport_detail
# from betscraper.spiders import spider_livescore


# One crawler process configured from the project's settings module.
settings = Settings()
settings.setmodule('betscraper.settings', priority='project')
process = CrawlerProcess(settings)

# Detail spider per bookmaker, keyed by the name used in
# lists_for_detail_spiders_dict; scheduled in this order.
detail_spiders = {
    'betano': spider_betano_detail.SpiderBetanoDetailSpider,
    'betx': spider_betx_detail.SpiderBetxDetailSpider,
    'forbet': spider_forbet_detail.SpiderForbetDetailSpider,
    'fortuna': spider_fortuna_detail.SpiderFortunaDetailSpider,
    'kingsbet': spider_kingsbet_detail.SpiderKingsbetDetailSpider,
    'merkur': spider_merkur_detail.SpiderMerkurDetailSpider,
    'sazka': spider_sazka_detail.SpiderSazkaDetailSpider,
    'synottip': spider_synottip_detail.SpiderSynottipDetailSpider,
    'tipsport': spider_tipsport_detail.SpiderTipsportDetailSpider,
}
for spider_bookmaker, spider_class in detail_spiders.items():
    # Each spider receives its bookmaker's job list as the arg_data argument.
    process.crawl(spider_class, arg_data = lists_for_detail_spiders_dict[spider_bookmaker])

# NOTE(review): the recorded output of this cell shows
# "RuntimeError: This event loop is already running" while the crawl log
# continues afterwards. Presumably the notebook's own asyncio loop clashes
# with the asyncio-based Twisted reactor; confirm before relying on this
# cell outside a plain script.
process.start()

2024-12-19 01:26:55 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: betscraper)
2024-12-19 01:26:55 [scrapy.utils.log] INFO: Versions: lxml 5.2.2.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.3.0, Python 3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.1 4 Jun 2024), cryptography 43.0.0, Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
2024-12-19 01:26:55 [scrapy.addons] INFO: Enabled addons:
[]
2024-12-19 01:26:55 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2024-12-19 01:26:55 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop


2024-12-19 01:26:55 [scrapy.extensions.telnet] INFO: Telnet Password: 36d50b6a4103588a
2024-12-19 01:26:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2024-12-19 01:26:58 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'betscraper',
 'CONCURRENT_REQUESTS': 512,
 'CONCURRENT_REQUESTS_PER_DOMAIN': 512,
 'FEED_EXPORT_ENCODING': 'utf-8',
 'NEWSPIDER_MODULE': 'betscraper.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'SPIDER_MODULES': ['betscraper.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 '
               'Firefox/34.0'}
2024-12-19 01:26:59 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.Offsi

RuntimeError: This event loop is already running

2024-12-19 01:27:03 [scrapy-playwright] INFO: Starting download handler
2024-12-19 01:27:03 [scrapy-playwright] INFO: Starting download handler
2024-12-19 01:27:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.betano.cz/api/zapas-sance/sultanes-de-monterrey-naranjeros-de-hermosillo/60402963/?bt=99&req=la,t,s,stnf,c,mb,mbl> (referer: None)
2024-12-19 01:27:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.betano.cz/api/zapas-sance/tomateros-de-culiacan-mayos-de-navojoa/60330626/?bt=99&req=la,t,s,stnf,c,mb,mbl> (referer: None)
2024-12-19 01:27:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://sb2frontend-altenar2.biahosted.com/api/widget/GetEventDetails?culture=cs-CZ&timezoneOffset=-60&integration=kingsbet&deviceType=1&numFormat=en-GB&countryCode=CZ&eventId=10354195> (referer: None)
2024-12-19 01:27:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://sb2frontend-altenar2.biahosted.com/api/widget/GetEventDetails?culture=cs-CZ&timezoneOffset=-60&integ

: 

In [40]:
from collections import Counter

# Number of entries in each remaining event's bookmaker_list.
numbers = list(map(len, events['bookmaker_list']))

# How many events have exactly k bookmakers.
counts = Counter(numbers)

# Report k = 1..9; a Counter yields 0 for sizes that never occur.
for num in range(1, 10):
    print(f"Number {num}: {counts[num]}")

Number 1: 0
Number 2: 210
Number 3: 121
Number 4: 68
Number 5: 108
Number 6: 62
Number 7: 40
Number 8: 40
Number 9: 63
