In [6]:
%load_ext autoreload
%autoreload 1

import logging
import pickle
import sqlite3
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd

sys.path.append('..')
import src.data.interim.matches as matches
%aimport src.data.interim.matches

# Send INFO-level records to the notebook output. basicConfig only installs a
# handler when the root logger has none, so the level is also set explicitly
# on the root logger itself.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Load the Loteca matches and keep only rounds 366 and later.
loteca = pd.read_pickle('../data/pre/lotecas_matches.pkl')
loteca = loteca[loteca.roundno >= 366]  # exclude old rounds

# Load the BetExplorer matches. The connection is closed in a `finally`
# block so it is released even when the query raises.
conn = sqlite3.connect('../data/raw/betexplorer.sqlite3')
try:
    betexp = pd.read_sql_query('SELECT id, league_category, date, teamH, teamA, score, scoremod FROM matches', conn)
finally:
    conn.close()

betexp.date = pd.to_datetime(betexp.date, dayfirst=True)
betexp.score = betexp.score.str.strip()

# Split every "H:A" score once. Rows without a usable score (empty string, or a
# non-string such as None/NaN) get NaN goals instead of raising on `.split`.
score_parts = [
    score.split(':') if isinstance(score, str) and score else None
    for score in betexp.score
]
betexp['goalsH'] = [int(parts[0]) if parts else np.nan for parts in score_parts]
betexp['goalsA'] = [int(parts[1]) if parts else np.nan for parts in score_parts]

# Load the team-name dictionary.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
with open('../data/interim/teams_ltb.pkl', mode='rb') as fp:
    teamsd = pickle.load(fp)

In [8]:
# Hand-maintained name pairs that are merged into the loaded team dictionary.
# Each key maps to a set of names to union into teamsd's entry for that key.
MANUAL_TEAMS = {
    'ATLÉTICO MADRID/ESP': {'Atl. Madrid'},
}

for team_key, extra_names in MANUAL_TEAMS.items():
    teamsd[team_key] |= extra_names

In [9]:
# Partition the Loteca matches on the `happened` flag.
happened_mask = loteca.happened
loteca1 = loteca[happened_mask]   # matches flagged as happened
loteca0 = loteca[~happened_mask]  # matches flagged as not happened

In [10]:
ret = OrderedDict()

# Matching runs as a cascade: every stage only sees the Loteca matches that no
# earlier stage linked, and uses progressively looser criteria. Team names
# discovered by a stage are merged into `teamsd` before the next stage runs.
# Each entry holds the keyword arguments that differ between stages; the
# comments are the author's description of what each setting accepts.
LINK_STAGES = [
    # Link matches we are sure
    # (close date) (same score) (2 teams the same)*
    {'min_team_rigid_points': 2, 'min_team_flex_points': 2,
     'return_teams': False},

    # Discover some new teams
    # (close date) (same score) (1 team the same) (other team alike)*
    {'min_team_rigid_points': 1, 'min_team_flex_points': 2,
     'return_teams': True},

    # Discover some new teams
    # (close date) (same score) (1 team the same) (other team whatever)*
    {'min_team_rigid_points': 1, 'min_team_flex_points': 1,
     'return_teams': True},

    # Discover some new teams
    # (close date) (same score) (2 teams alike)*
    {'min_team_rigid_points': 0, 'min_team_flex_points': 2,
     'return_teams': True},

    # Discover some new scores
    # (close date) (different score) (2 teams the same)
    {'min_team_rigid_points': 2, 'min_team_flex_points': 2,
     'ignore_score': True, 'return_teams': True},

    # Discover some new scores
    # (close date) (different score) (2 teams alike)
    {'min_team_rigid_points': 0, 'min_team_flex_points': 2,
     'ignore_score': True, 'return_teams': True},

    # if we try only 1 team alike we get loads of bad results
]

df = loteca1
for stage_kwargs in LINK_STAGES:
    # Drop the matches already linked (a no-op before the first stage).
    df = df[~df.index.isin(list(ret))]

    outcome = matches.generate_matches_dict(df, betexp, teamsd, logger=logger,
                                            flexible_date=True,
                                            **stage_kwargs)

    # With return_teams=True the call returns (results, teams); otherwise it
    # returns only the results mapping.
    if stage_kwargs['return_teams']:
        results, teams = outcome
        for k, v in teams.items():
            teamsd[k] |= v
    else:
        results = outcome

    ret.update(results)

INFO:root:-----
INFO:root:Generating matches dict...
INFO:root:There are 5856 loteca matches to find
INFO:root:ignore_date=False
INFO:root:flexible_date=True
INFO:root:ignore_score=False
INFO:root:ignore_score_if_mod=False
INFO:root:min_team_rigid_points=2
INFO:root:min_team_flex_points=2
INFO:root:on match 1/5856
INFO:root:on match 501/5856
INFO:root:on match 1001/5856
INFO:root:on match 1501/5856
INFO:root:on match 2001/5856
INFO:root:on match 2501/5856
INFO:root:on match 3001/5856
INFO:root:on match 3501/5856
INFO:root:on match 4001/5856
INFO:root:on match 4501/5856
INFO:root:on match 5001/5856
INFO:root:on match 5501/5856
INFO:root:3772 matches linked
INFO:root:0 teams found
INFO:root:defaultdict(<class 'set'>, {})
INFO:root:-----
INFO:root:Generating matches dict...
INFO:root:There are 2084 loteca matches to find
INFO:root:ignore_date=False
INFO:root:flexible_date=True
INFO:root:ignore_score=False
INFO:root:ignore_score_if_mod=False
INFO:root:min_team_rigid_points=1
INFO:root:min_

In [11]:
# Inspect the most recent matching call: the teams it reported, then each
# linked pair (the Loteca row followed by the BetExplorer row with that id).
print(teams)
for loteca_idx, betexp_id in results.items():
    loteca_row = loteca.loc[loteca_idx]
    betexp_rows = betexp[betexp.id == betexp_id]
    display(loteca_row)
    display(betexp_rows)

defaultdict(<class 'set'>, {})


In [20]:
# Find cancelled matches.
# Only 2 are found, but that is okay.

# `scoremod` values used to select the BetExplorer candidates for this search.
cancelled_scoremods = ['ABN.', 'AWA.', 'CAN.', 'INT.', 'POSTP.', 'WO.']

df1 = loteca0
df2 = betexp[betexp.scoremod.isin(cancelled_scoremods)]

# Dates and scores are ignored here; only the team names are compared.
results, teams = matches.generate_matches_dict(
    df1, df2, teamsd,
    logger=logger,
    ignore_date=True,
    ignore_score=True,
    min_team_rigid_points=2,
    min_team_flex_points=2,
    return_teams=True,
)

# Merge any reported team names into the dictionary, then record the links.
for team_key, found_names in teams.items():
    teamsd[team_key] |= found_names
ret.update(results)

INFO:root:-----
INFO:root:Generating matches dict...
INFO:root:There are 52 loteca matches to find
INFO:root:ignore_date=True
INFO:root:flexible_date=False
INFO:root:ignore_score=True
INFO:root:min_team_rigid_points=2
INFO:root:min_team_flex_points=2
INFO:root:on match 1/52
INFO:root:2 matches linked
INFO:root:0 teams found:
INFO:root:defaultdict(<class 'set'>, {})
INFO:root:-----


In [15]:
# Show the teams reported by the latest matching call and, for every link,
# the Loteca row next to the BetExplorer row carrying the linked id.
print(teams)
for loteca_idx, betexp_id in results.items():
    display(loteca.loc[loteca_idx])
    display(betexp.loc[betexp.id == betexp_id])

defaultdict(<class 'set'>, {})


roundno                730
gameno                  14
date                   NaT
teamH       CHAPECOENSE/SC
goalsH                   0
teamA          ATLÉTICO/MG
goalsA                   1
happened             False
Name: 10219, dtype: object

Unnamed: 0,id,league_category,date,teamH,teamA,score,scoremod,goalsH,goalsA
71957,hhdByMIg,brazil,2016-12-11,Chapecoense-SC,Atletico-MG,,AWA.,,


roundno                748
gameno                  11
date                   NaT
teamH             CEARÁ/CE
goalsH                   0
teamA       FERROVIÁRIO/CE
goalsA                   1
happened             False
Name: 10468, dtype: object

Unnamed: 0,id,league_category,date,teamH,teamA,score,scoremod,goalsH,goalsA
79004,tSaSjsMf,brazil,2017-05-07,Ceara,Ferroviario,,CAN.,,


In [16]:
# Chapecoense vs Atlético would happen on another date, so drop that link.
# Guarded so the cell can be re-run: a bare `del` raises KeyError once the
# key has already been removed.
if 10219 in ret:
    del ret[10219]

In [19]:
# Row counts: first the matches flagged as not happened, then those that happened.
for frame in (loteca0, loteca1):
    print(len(frame))



52
5856


# Found names

- AMERICANO/RJ -> Americano FC
- ATHLETIC BILBAO/ESP -> Ath Bilbao
- ATLÉTICO/BA -> Alagoinhas
- BOA ESPORTE CLUBE/MG -> Boa
- BOA ESPORTE/MG -> Boa
- BOCA JÚNIOR/SE -> Boca Junior
- BOLOGNA FC/ITA -> Bologna
- BORUSSIA DORTMUND/DEU -> Dortmund
- BRAGANTINO/SP -> Bragantino
- BRASIL/RS -> Brasil de Pelotas
- CAGLIARI CALCIO/ITA -> Cagliari
- CAMBURIÚ/SC -> Camboriu
- CAXIAS/RS -> SER Caxias
- CELTA DE VIGO/ESP -> Celta Vigo
- CENTRAL/PE -> Central SC
- COLO COLO/BA -> Colo C.
- COLÔNIA/DEU -> FC Koln
- CRUZEIRO/RS -> EC Cruzeiro
- CUIABÁ/MT -> Cuiaba Esporte
- DEPORTI LA CORUNA/ESP -> Dep. La Coruna
- DEPORTIVO LA CORUÑA/ESP -> Dep. La Coruna
- ESPORTIVO/RS -> Bento Goncalves
- ESTANCIANO/SE -> Estanciano EC
- FC BARCELONA/ESP -> Barcelona
- FC BORDEAUX/FRA -> Bordeaux
- FERNANDÓPOLIS/SP SUB 20 -> Fernandopolis U20
- FLUMINENSE/BA -> Fluminense de Feira
- FORMOSA/GO -> Bosque Formosa
- GRÊMIO BARUERI/SP -> Barueri
- GRÊMIO PRUDENTE/SP -> Barueri
- GUARANI/CE -> Guarani de Juazeiro
- GUARANI/MG -> Guarani EC
- GUARANI/SC -> Guarani de Palhoca
- HERTHA BERLIM/DEU -> Hertha Berlin
- INTER DE MILÃO/ITA -> Inter
- IPATINGA/MG -> Betim
- ITUIUTABA/MG -> Boa
- JUVENTUS FC/ITA -> Juventus
- JUVENTUS/SC -> Gremio Juventus
- MILAN/ITA -> AC Milan
- MONCHENGLADBACH/DEU -> B. Monchengladbach
- MOTO CLUBE/MA -> Moto Club
- NEW CASTLE/ING -> Newcastle
- OLYMPIQUE DE NICE/FRA -> Nice
- OLYMPIQUE LYONNAIS/FRA -> Lyon
- PALMEIRA/RN -> Palmeira de Una
- PARIS SAINT-GERMAIN/FRA -> Paris SG
- PORTO/PE -> CA Porto
- PORTUGUESA DESPORTOS/SP -> Portuguesa
- RIO BRANCO/AC -> Rio Branco
- RIO BRANCO/AC SUB 20 -> Rio Branco AC U20
- RIO BRANCO/ES -> Rio Branco ES
- ROMA/ITA -> AS Roma
- SAINT-ÉTIENNE/FRA -> St Etienne
- SANTA CRUZ/RN -> Santa Cruz de Natal
- SCHALKE 04/DEU -> Schalke
- SOC DEPORT EIBAR/ESP -> Eibar
- SOCIETA SPO LAZIO/ITA -> Lazio
- SOROCABA/SP -> Atl. Sorocaba
- SPORT/PE -> Sport Recife
- STOKE CITY/ING -> Stoke
- SÃO  CAETANO/SP -> Sao Caetano
- SÃO JOSÉ (PA) /RS -> EC Sao Jose
- SÃO JOSÉ/RS -> EC Sao Jose
- TIGRES DO BRASIL/RJ -> Tigres Brasil
- TSG HOFFENHEIM/DEU -> Hoffenheim
- TUPÃ/SP SUB 20 -> Tupa U20
- VALENCIA CLUB/ESP -> Valencia
- VASCO DA GAMA/RJ -> Vasco
- VASCO/RJ -> Vasco
- VILA NOVA/GO -> Vila Nova FC
- VILA NOVA/MG -> Villa Nova MG
- VITORIA/PE -> Academica Vitoria
- WEST BROMWICH/ING -> West Brom
- XV DE JAÚ/SP SUB 20 -> XV de Jau U20
- XV PIRACICABA/SP -> Piracicaba
- YPIRANGA/RS -> Ypiranga FC
- ÁGUIA/PA -> Aguia De Maraba

# Some failed BetExplorer names

Not all BetExplorer team names are correct. Below are some of the wrong ones that this algorithm surfaced. These teams changed their name at some point in their history, which is why they show up here.

In [None]:
# Source page:
# http://www.betexplorer.com/soccer/brazil/serie-b-2009/america-rn-betim/zuc6o4ea/
# BetExplorer labels the team "Betim", but it is Ipatinga.
betexp.loc[22800, :]

In [None]:
# Source page:
# http://www.betexplorer.com/soccer/brazil/campeonato-paulista-2010/santo-andre-barueri/d4IyAGi2/
# BetExplorer labels the team "Barueri", but it is Grêmio Prudente.
betexp.loc[18934, :]

In [None]:
# Source page:
# http://www.betexplorer.com/soccer/brazil/serie-b-2011/barueri-portuguesa/4rHNNkEH/
# The name is right on this page, but that contradicts the Santo André vs
# Barueri example (campeonato-paulista-2010).
betexp.loc[32436, :]