Some of the Loteca matches come with wrong timestamps.

Let's examine this and try to come up with a way of retrieving the match dates without relying on the wrong timestamps!

In [106]:
import pickle

import numpy as np
import pandas as pd

# Raw round records, stored as a pickle under data/raw.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
# The `with` block makes sure the file handle is closed once loading is done.
with open('../data/raw/loteca_site.pkl', mode='rb') as raw_file:
    rounds = pickle.load(raw_file)
matches = pd.read_pickle('../data/pre/lotecas_matches.pkl')  # already preprocessed

In [107]:
# Show the first raw round record to see which fields are available.
rounds[0]

{'colunaDois': True,
 'colunaMeio': False,
 'colunaUm': False,
 'concurso': 1,
 'concursoAnterior': None,
 'concursosProgramacao': None,
 'de_observacao': 'Estimativa prêmio para o próximo concurso: 14 pontos: 130 MIL.',
 'dtApuracao': 1014001200000,
 'dtApuracaoStr': '18/02/2002',
 'dt_jogo': 1013914800000,
 'dt_proximo_concurso': None,
 'dt_proximo_concursoStr': None,
 'error': False,
 'forward': None,
 'ganhadoresPorUf': [{'coLoteria': '9',
   'concursoAnterior': None,
   'forward': None,
   'icCanalEletronico': None,
   'mensagens': [],
   'noCidade': None,
   'nuConcurso': '1',
   'proximoConcurso': None,
   'qtGanhadores': 1,
   'sgUf': 'BA',
   'total': None},
  {'coLoteria': '9',
   'concursoAnterior': None,
   'forward': None,
   'icCanalEletronico': None,
   'mensagens': [],
   'noCidade': None,
   'nuConcurso': '1',
   'proximoConcurso': None,
   'qtGanhadores': 1,
   'sgUf': 'SP',
   'total': None}],
 'icJogo': '1',
 'ic_sorteio': '0',
 'jogos': [{'colunaDois': True,
   'co

In [108]:
# Preview the first rows of the preprocessed matches table.
matches.head()

Unnamed: 0,roundno,gameno,date,teamH,goalsH,teamA,goalsA,happened
0,1,1,2002-02-17,FLAMENGO/RJ,2,S. PAULO/SP,4,True
1,1,2,2002-02-17,VASCO/RJ,3,AMERICANO/RJ,0,True
2,1,3,2002-02-17,SANTOS/SP,2,SÃO CAETANO/SP,1,True
3,1,4,2002-02-17,P. DESPORTOS/SP,1,CORINTHIANS/SP,4,True
4,1,5,2002-02-16,CRUZEIRO/MG,7,AMÉRICA/MG,0,True


In [109]:
# The draw date of each round is stored both as a millisecond timestamp
# ('dtApuracao') and as a day-first string ('dtApuracaoStr').
# Check that either one can be used, i.e. that they agree on the calendar day.

# The loop variable is deliberately not called `round`, so the builtin
# `round()` stays usable in the rest of the notebook.
for round_info in rounds:
    # normalize() drops the time-of-day part so only the dates are compared.
    date_from_ts = pd.to_datetime(round_info['dtApuracao'], unit='ms').normalize()
    date_from_str = pd.to_datetime(round_info['dtApuracaoStr'], dayfirst=True)

    # On failure, report which round disagrees and the two dates involved.
    assert date_from_ts == date_from_str, (
        round_info['concurso'], date_from_ts, date_from_str)

In [113]:
# Maps the Portuguese weekday names looked up from each match's 'diaDaSemana'
# field to weekday numbers, Monday=0 through Sunday=6. These numbers are
# compared against `date.dayofweek` in find_previous_weekday, so they must
# follow the same convention.
WEEKDAYS = {
    'Segunda-feira': 0,
    'Terça-feira': 1,
    'Quarta-feira': 2,
    'Quinta-feira': 3,
    'Sexta-feira': 4,
    'Sábado': 5,
    'Domingo': 6,
    '': np.nan  # Sorteio (Portuguese for "draw"): no weekday name, mapped to NaN
}

def find_previous_weekday(date, weekday):
    """Return the closest day strictly before `date` that falls on `weekday`.

    `weekday` uses the same numbering as `date.dayofweek`. If `date` itself
    falls on `weekday`, the result is exactly one week earlier.
    """
    gap = date.dayofweek - weekday
    # A non-positive gap means the weekday has not occurred yet in the current
    # week (or is today), so step back into the previous week.
    days_back = gap if gap > 0 else gap + 7
    return date - pd.Timedelta(days=days_back)



# For every match, derive its date from the weekday name (counting back from
# the round's draw date) and compare it with the date in the match's own
# timestamp. Print the matches where the two disagree.
#
# Nothing is accumulated here, so `matches` (the preprocessed DataFrame loaded
# earlier) is left untouched, and the loop variable is not called `round` so
# the builtin `round()` keeps working.
for round_info in rounds:
    roundno = round_info['concurso']
    round_date = pd.to_datetime(round_info['dtApuracaoStr'], dayfirst=True)  # easier

    for match in round_info['jogos']:
        match_weekday = WEEKDAYS[match['diaDaSemana']]
        if np.isnan(match_weekday):
            # No weekday name (marked "Sorteio" in WEEKDAYS): there is no
            # match date to check.
            continue

        matchno = match['icJogo']
        date_from_weekday = find_previous_weekday(round_date, match_weekday)
        date_from_ts = pd.to_datetime(match['dt_jogo'], unit='ms').normalize()

        # A difference of exactly 7 days is tolerated: find_previous_weekday
        # can only step back 1..7 days, so it picks the wrong week when the
        # match is on the draw day itself or more than a week before it.
        if (date_from_weekday != date_from_ts
                and abs(date_from_weekday - date_from_ts).days != 7):
            print(roundno, matchno)
            print(date_from_weekday, date_from_ts)
            print()

548 1
2013-03-03 00:00:00 2013-02-03 00:00:00

548 2
2013-03-02 00:00:00 2013-02-02 00:00:00

548 3
2013-03-03 00:00:00 2013-02-03 00:00:00

548 4
2013-03-02 00:00:00 2013-02-02 00:00:00

548 5
2013-03-03 00:00:00 2013-02-03 00:00:00

548 7
2013-03-03 00:00:00 2013-02-03 00:00:00

548 8
2013-03-02 00:00:00 2013-02-02 00:00:00

548 9
2013-03-03 00:00:00 2013-02-03 00:00:00

548 10
2013-03-02 00:00:00 2013-02-02 00:00:00

548 11
2013-03-03 00:00:00 2013-02-03 00:00:00

548 12
2013-03-03 00:00:00 2013-02-03 00:00:00

548 13
2013-03-03 00:00:00 2013-02-03 00:00:00

548 14
2013-03-03 00:00:00 2013-02-03 00:00:00



We found that:
- some matches happen on the day of the draw (example: round 18 matchno 3)
- some matches happen one week before the draw (example: round 563 matchno 4)

I will simply trust the timestamps of those matches that fall exactly one week later or one week earlier than the date computed from the weekday. This leaves us with round number 548: all of its matches happened in March, but the timestamps say they happened in February. Let's just fix this manually.

And we're done!