In [1]:
# Objective : create ETL process for Chess.com data

In [1]:
# Imports 
import chessdotcom as chess
import pandas as pd
import numpy as np
import re

# Let pandas display up to 200 columns before truncating wide DataFrames
pd.set_option('display.max_columns',200)

In [2]:
# Player profile whose games are analysed.
# Warning: the value is case sensitive - it is compared with == against the
# 'White' column and searched for inside 'Termination' in data_transformation.
player_profile = 'aledbs' 

In [3]:
def extract_pgn_values(pgn:str):
  '''
  Collect every double-quoted value found in the header section of a pgn string

      Parameters:
          pgn: A pgn string (coerced with str()); only the text before the
               first blank line is scanned

      Returns:
          header_values (list): Quoted values in order of appearance, without the quotes
  '''
  header_text, _, _ = str(pgn).partition('\n\n')
  quoted_value = re.compile('"([^"]*)"')
  header_values = [found.group(1) for found in quoted_value.finditer(header_text)]
  return header_values

def extract_pgn_columns(pgn:str):
  '''
  Convert a pgn string into a list of the tag names (keys) of its header

      Parameters:
          pgn: A pgn string. It is coerced with str(), so a missing value
               (None / NaN) yields an empty list instead of raising. Only the
               text before the first blank line is scanned.

      Returns:
          pgn_columns (list): Tag names in order of appearance. A tag is only
              recognised when its '[' opens a line (leading spaces/tabs allowed).
  '''
  header = str(pgn).split('\n\n')[0]
  # Raw string: '\[' and '\S' are not valid escapes in a plain string literal.
  # Anchoring on line starts (re.MULTILINE) stops a '[' that appears inside a
  # quoted value from being counted as an extra tag.
  pattern = r'^[ \t]*\[(\S+)'
  pgn_columns = re.findall(pattern, header, flags=re.MULTILINE)
  return pgn_columns

def _pgn_header_to_dict(pgn):
  '''Return {tag name: value} parsed from the header section of one pgn string.'''
  header = str(pgn).split('\n\n')[0]
  # A header line looks like: [TagName "value"]. Name and value are captured by
  # the same match so they cannot get out of step with each other.
  tag_line = r'^[ \t]*\[(\S+)[ \t]+"([^"]*)"'
  return dict(re.findall(tag_line, header, flags=re.MULTILINE))


def _pgn_moves(pgn):
  '''Return the second blank-line-separated section of a pgn string, or '' when there is none.'''
  sections = str(pgn).split('\n\n')
  return sections[1] if len(sections) > 1 else ''


def convert_pgn_to_columns(df:pd.DataFrame):
  '''
    Extract pgn information into new columns of existing DataFrame

        Parameters:
            df: DataFrame with games data including the columns 'rules' and 'pgn'.
                It is not modified.

        Returns:
            game_df (DataFrame): Copy of the input without the rows whose
                'rules' is 'bughouse', with one new column per pgn header tag
                and a 'Moves' column holding the text after the header.
                Each game is parsed on its own, so a tag that a game does not
                have is left as NaN for that game instead of shifting the
                other values into the wrong column.
    '''
  # .copy() so the assignments below never write into a view of the caller's frame
  game_df = df.loc[df['rules'] != 'bughouse'].copy()

  # One dict per game; building the frame once keeps the original row labels
  # so the column-wise concat lines up row by row.
  headers = [_pgn_header_to_dict(pgn) for pgn in game_df['pgn']]
  expanded_game_df = pd.DataFrame(headers, index=game_df.index)

  game_df = pd.concat([game_df, expanded_game_df], axis=1)

  game_df['Moves'] = game_df['pgn'].map(_pgn_moves)

  return game_df


In [4]:
# def parse_moves(move:str):

#     # Extract clock information
#     # pattern_clk = '{%clk\s(.*?)\}'
#     pattern_clk = r'\[(.*?)\]'
#     clk = re.findall(pattern_clk, move.replace('%clk ', ''))

#     # Extract move information
#     pattern_moves = '\d+\.\s*(.*?)\s*{'
#     moves = re.findall(pattern_moves, move)
#     for index, res in enumerate(moves):
#         moves[index] = res.split(' ')[-1]

#     # Create df with White moves, White Clock, Black moves, Black Clock
#     white_moves = [i for index, i in enumerate(moves) if index % 2 == 0]
#     white_clk = [i for index, i in enumerate(clk) if index % 2 == 0]

#     black_moves = [i for index, i in enumerate(moves) if index % 2 == 1]
#     black_clk = [i for index, i in enumerate(clk) if index % 2 == 1]

#     if len(black_moves) < len(white_moves):
#         black_moves.append('')
#         black_clk.append('')


#     game_moves = {'move': list(range(1, len(white_moves)+1)) , 
#                   'WhiteMoves': white_moves, 
#                   'WhiteClock': white_clk, 
#                   'BlackMoves': black_moves, 
#                   'BlackClock': black_clk}
    
#     return game_moves

In [11]:
def extract_time_between_moves_metrics(move:str):
    '''
    Summarise how much each side's clock drops between its consecutive moves

        Parameters:
            move: Moves text containing '[%clk h:mm:ss]' annotations (coerced
                  with str()). Clocks at even positions (0, 2, ...) are taken
                  as White's, odd positions as Black's.

        Returns:
            (avg_time_white, avg_time_black, std_time_white, std_time_black):
                mean and sample standard deviation, in seconds, of
                clock[i] - clock[i+1] for each side. A value is NaN when the
                side has too few clock readings to compute it.
    '''
    # Only %clk annotations are read: any other bracketed annotation in the
    # text would otherwise be fed to float() and raise.
    clock_texts = re.findall(r'\[%clk\s+([^\]]+)\]', str(move))

    def to_seconds(clock_text):
        # Weights are matched from the right so the last field is always seconds
        fields = clock_text.strip().split(':')
        return sum(weight * float(field) for weight, field in zip((1, 60, 3600), reversed(fields)))

    clocks = [to_seconds(clock_text) for clock_text in clock_texts]

    def clock_drop_stats(side_clocks):
        # diff(periods=-1) is clock[i] - clock[i+1]; the last entry has no
        # successor and becomes NaN, which mean()/std() skip.
        # dtype=float keeps an empty list numeric.
        drops = pd.Series(side_clocks, dtype=float).diff(periods=-1)
        return drops.mean(), drops.std()

    avg_time_white, std_time_white = clock_drop_stats(clocks[0::2])
    avg_time_black, std_time_black = clock_drop_stats(clocks[1::2])

    return avg_time_white, avg_time_black, std_time_white, std_time_black

In [12]:
def _pick_by_side(df, is_white, white_col, black_col):
    '''Return (player, opponent) Series taken from a White/Black column pair.'''
    player = df[white_col].where(is_white, df[black_col])
    opponent = df[black_col].where(is_white, df[white_col])
    return player, opponent


def _termination_type(termination):
    '''Return the part of a Termination text that follows ' won ' or 'drawn'.'''
    text = str(termination)
    if ' won ' in text:
        return text.split(' won ')[-1]
    if 'drawn' in text:
        return text.split('drawn')[-1].strip()
    return text


def data_transformation(game_df, player_profile, min_pgn_tags=21):
    '''
    Add player-centric columns to a games DataFrame

        Parameters:
            game_df: DataFrame holding at least the columns 'pgn', 'Moves',
                     'Date', 'UTCDate', 'EndDate', 'EndTime', 'StartTime',
                     'WhiteElo', 'BlackElo', 'White', 'Termination', 'ECOUrl',
                     'white_accuracie' and 'black_accuracie'. It is not modified.
            player_profile: Name of the analysed player (case sensitive)
            min_pgn_tags: Games whose pgn header yields fewer tag names than
                     this are dropped as malformed (default 21)

        Returns:
            df (DataFrame): Filtered copy with a fresh 0..n-1 index and the
                added columns TerminationStatus, PlayerSide, PlayerElo,
                OpponentElo, EloDiff, PlayerAccuracie, OpponentAccuracie,
                AccuracieDiff, Duration, TerminationType, Opening,
                AvgTimeWhite, AvgTimeBlack, StdTimeWhite, StdTimeBlack,
                PlayerAvgTime, OpponentAvgTime, PlayerStdTime,
                OpponentStdTime, AvgTimeDiff and StdTimeDiff.
    '''
    # Filter first and renumber afterwards: the index must be gap-free so that
    # frames built later line up with df row by row.
    tag_counts = game_df['pgn'].apply(extract_pgn_columns).apply(len)
    df = game_df.loc[(tag_counts >= min_pgn_tags).to_numpy()].reset_index(drop=True)

    date_cols = ['Date', 'UTCDate', 'EndDate', 'EndTime', 'StartTime']
    df[date_cols] = df[date_cols].apply(pd.to_datetime)

    df[['WhiteElo','BlackElo']] = df[['WhiteElo','BlackElo']].astype(int)

    is_white = df['White'] == player_profile

    # Result from the player's point of view. The name is matched as a whole
    # token (escaped, not as a regex) so another name that merely contains
    # player_profile is not counted as a win.
    termination = df['Termination']
    name_pattern = r'(?<![\w-])' + re.escape(player_profile) + r'(?![\w-])'
    mentions_player = termination.str.contains(name_pattern, regex=True, na=False)
    has_winner = termination.str.contains('won', regex=False, na=False)
    is_draw = termination.str.contains('drawn', regex=False, na=False)

    df['TerminationStatus'] = pd.Series(index=df.index, dtype=object)
    df.loc[mentions_player, 'TerminationStatus'] = 'Win'
    df.loc[has_winner & ~mentions_player, 'TerminationStatus'] = 'Lose'
    df.loc[is_draw, 'TerminationStatus'] = 'Draw'

    df['PlayerSide'] = np.where(is_white, 'White', 'Black')

    # Compute Elo
    df['PlayerElo'], df['OpponentElo'] = _pick_by_side(df, is_white, 'WhiteElo', 'BlackElo')
    df['EloDiff'] = (df['PlayerElo'] - df['OpponentElo']).astype(float)

    # Compute accuracy
    df['PlayerAccuracie'], df['OpponentAccuracie'] = _pick_by_side(
        df, is_white, 'white_accuracie', 'black_accuracie')
    df['AccuracieDiff'] = df['PlayerAccuracie'] - df['OpponentAccuracie']

    # NOTE(review): if StartTime/EndTime hold a time of day without a date,
    # a game that crosses midnight gets a negative Duration - confirm against the data.
    df['Duration'] = df['EndTime'] - df['StartTime']

    df['TerminationType'] = df['Termination'].apply(_termination_type)

    # regex=False: the prefix is a literal URL, not a pattern
    df['Opening'] = df['ECOUrl'].str.replace('https://www.chess.com/openings/', '', regex=False)

    # Compute and add time between moves metrics; index=df.index keeps every
    # metric on the row of the game it was computed from.
    time_metrics = df['Moves'].apply(extract_time_between_moves_metrics)
    time_metrics_df = pd.DataFrame(
        time_metrics.tolist(),
        index=df.index,
        columns=['AvgTimeWhite', 'AvgTimeBlack', 'StdTimeWhite', 'StdTimeBlack'],
    )
    df = pd.concat([df, time_metrics_df], axis=1)

    df['PlayerAvgTime'], df['OpponentAvgTime'] = _pick_by_side(
        df, is_white, 'AvgTimeWhite', 'AvgTimeBlack')
    df['PlayerStdTime'], df['OpponentStdTime'] = _pick_by_side(
        df, is_white, 'StdTimeWhite', 'StdTimeBlack')

    df['AvgTimeDiff'] = df['PlayerAvgTime'] - df['OpponentAvgTime']
    df['StdTimeDiff'] = df['PlayerStdTime'] - df['OpponentStdTime']

    return df


In [13]:
# Load the input games table from a local pickle file.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
game_df = pd.read_pickle('./data/raw_api_data.pkl')

In [14]:
# Expand the pgn header tags into columns and add the 'Moves' column (bughouse games are removed)
game_df = convert_pgn_to_columns(game_df)

In [15]:
# Add the columns computed from player_profile's point of view (result, side, Elo, accuracy, move timing)
game_df = data_transformation(game_df, player_profile)

  df['Opening'] = df['ECOUrl'].str.replace('https://www.chess.com/openings/','')


In [16]:
# Save the transformed games table to a pickle file
game_df.to_pickle("./data/full_games_history.pkl")