# **Setup**

### **Libraries**

In [121]:
import pandas as pd
from tqdm.auto import tqdm 
import re 

### **Data**

In [122]:
games_df = pd.read_csv('../data/processed_2019_06_df.csv')
games_df = games_df[['user_name', 'user_elo', 'opening_code', 'opening_name', 'pgn']]
games_df['opening_category'] = games_df['opening_code'].apply(lambda x: x[0])

print('Number of Rows :', len(games_df))

games_df.head(3)

Number of Rows : 97255


Unnamed: 0,user_name,user_elo,opening_code,opening_name,pgn,opening_category
0,bkrrhanife2,1772,D02,Queen's Pawn Game: London System,1. d4 { [%eval 0.25] [%clk 0:10:00] } 1... d5 ...,D
1,Chessbullets,1828,B08,"Pirc Defense: Classical Variation, Schlechter ...",1. e4 { [%eval 0.24] [%clk 0:10:00] } 1... g6 ...,B
2,Tweaker,1407,C21,Danish Gambit,1. e4 { [%eval 0.24] [%clk 0:00:30] } 1... e5 ...,C


# **Code**

### **Utils**

In [123]:
def get_substring(start_char, end_char, text):
    '''
    Helper function for unpack_game()
    '''

    search_string = start_char + '(.+?)' + end_char
    result = re.search(search_string, text)

    if result:
        return result.group(1)
    else:
        return None 

### **Data Exploration**

#### Unpacking PGN 

In [124]:
sample_pgn = games_df.iloc[2]['pgn']
moves = sample_pgn.split('. ')
moves[1:21]

['e4 { [%eval 0.24] [%clk 0:00:30] } 1..',
 'e5 { [%eval 0.37] [%clk 0:00:30] } 2',
 'd4 { [%eval -0.11] [%clk 0:00:33] } 2..',
 'exd4 { [%eval 0.0] [%clk 0:00:33] } 3',
 'c3 { [%eval -0.42] [%clk 0:00:37] } 3..',
 'Qe7 { [%eval -0.32] [%clk 0:00:36] } 4',
 'Bd3 { [%eval -0.52] [%clk 0:00:38] } 4..',
 'd5 { [%eval -0.37] [%clk 0:00:39] } 5',
 'Qa4+? { [%eval -2.13] [%clk 0:00:37] } 5..',
 'c6? { [%eval -0.78] [%clk 0:00:41] } 6',
 'Nf3?? { [%eval -5.75] [%clk 0:00:35] } 6..',
 'dxe4 { [%eval -5.68] [%clk 0:00:44] } 7',
 'Bg5 { [%eval -5.43] [%clk 0:00:28] } 7..',
 'Qe6? { [%eval -2.59] [%clk 0:00:44] } 8',
 'O-O { [%eval -2.49] [%clk 0:00:24] } 8..',
 'exf3 { [%eval -2.71] [%clk 0:00:43] } 9',
 'Bc4?? { [%eval -10.42] [%clk 0:00:13] } 9..',
 'Qg4 { [%eval -10.35] [%clk 0:00:42] } 10',
 'Re1+ { [%eval -10.26] [%clk 0:00:15] } 10..',
 'Be7 { [%eval -10.36] [%clk 0:00:44] } 11']

In [125]:
moves[-5:]

['Qg4 { [%eval -10.35] [%clk 0:00:42] } 10',
 'Re1+ { [%eval -10.26] [%clk 0:00:15] } 10..',
 'Be7 { [%eval -10.36] [%clk 0:00:44] } 11',
 'Bxe7?! { [%eval #-1] [%clk 0:00:16] } 11..',
 'Qxg2# { [%clk 0:00:42] } 0-1\n']

In [126]:
def unpack_moves(pgn_str):
    ''' 
    Returns a list of tuples, where each tuples is 
    (move, evaluation at move)
    '''
    moves = pgn_str.split('. ')[1:]
    unpacked_moves = []
    for move_str in moves:
        move = get_substring('', ' {', move_str)
        eval = get_substring('eval ', '] ', move_str)

        if eval == None:
            pass
        elif '#' in eval:
            eval = float(eval[1:])
        else:
            eval = float(eval)
        
        unpacked_moves.append((move, eval))

    return unpacked_moves

In [127]:
sample_unpacked_moves = unpack_moves(sample_pgn)
sample_unpacked_moves[:-5]

[('e4', 0.24),
 ('e5', 0.37),
 ('d4', -0.11),
 ('exd4', 0.0),
 ('c3', -0.42),
 ('Qe7', -0.32),
 ('Bd3', -0.52),
 ('d5', -0.37),
 ('Qa4+?', -2.13),
 ('c6?', -0.78),
 ('Nf3??', -5.75),
 ('dxe4', -5.68),
 ('Bg5', -5.43),
 ('Qe6?', -2.59),
 ('O-O', -2.49),
 ('exf3', -2.71),
 ('Bc4??', -10.42)]

#### Feature Extraction

For a single tuple

In [136]:
def get_square(move_str):
    for i, c in enumerate(move_str):
        if c.isdigit():
            return move_str[i-1:i+1]
    
    return '!' 

def get_move_attributes(move_tuple):
    move = move_tuple[0]
    if move == None:
        return ('!', 0, 0, 0)

    capture = int('x' in move)
    check = int(('+' in move) or ('#' in move))
    pawn_density = 0 

    piece = move[0]
    if piece not in ['K', 'Q', 'R', 'B', 'N']:
        piece = 'P'

    square = get_square(move)

    central_squares = ['c3', 'c4', 'c5', 'c6', 
                       'd3', 'd4', 'd5', 'd6',  
                       'e3', 'e4', 'e5', 'e6',
                       'f3', 'f4', 'f5', 'f6']

    if piece == 'P' and square in central_squares:
        pawn_density += 1

    return (piece, capture, check, pawn_density)

In [137]:
get_move_attributes(('exe4+', 0.1))

('P', 1, 1, 1)

For a list of tuples

In [140]:
def get_move_attributes_game(move_lst):
    piece_movements = {'K': 0, 'Q': 0, 'R': 0, 'B': 0, 'N': 0, 'P': 0}
    captures = 0
    checks = 0
    pawn_density = 0

    attributes_5 = [0,0,0,0,0,0,0,0,0]
    attributes_10 = [0,0,0,0,0,0,0,0,0]
    attributes_15 = [0,0,0,0,0,0,0,0,0]
    attributes_final = []

    if len(move_lst) < 19 and len(move_lst) % 2 == 0:
        last_3_evals = [move_lst[-6][1],move_lst[-4][1],move_lst[-2][1]]
    elif len(move_lst) < 19 and len(move_lst) % 2 != 0:
        last_3_evals = [move_lst[-5][1],move_lst[-3][1],move_lst[-1][1]]
    else:
        last_3_evals = [move_lst[14][1],move_lst[16][1],move_lst[18][1]]

    last_3_evals = [x for x in last_3_evals if x is not None]
    opening_eval = round(sum(last_3_evals) / len(last_3_evals), 2)
    
    for move_num, move in enumerate(move_lst):

        if move_num % 2 != 0:
            continue
        else:
            move_num = (move_num / 2) + 1

        piece, capture, check, pawn_density = get_move_attributes(move)

        if piece == '!':
            continue 

        piece_movements[piece] += 1
        captures += capture
        checks += check
        pawn_density += pawn_density

        if move_num == 5:
            attributes_5 = [piece_movements[piece] / 5 for piece in piece_movements.keys()]
            attributes_5 += [captures / 5, checks / 5, pawn_density / 5]
            piece_movements = {'K': 0, 'Q': 0, 'R': 0, 'B': 0, 'N': 0, 'P': 0}
            captures = 0
            checks = 0
            pawn_density = 0
        if move_num == 10:
            attributes_10 = [piece_movements[piece] / 10 for piece in piece_movements.keys()]
            attributes_10 += [captures / 10, checks / 10, pawn_density / 10]
            piece_movements = {'K': 0, 'Q': 0, 'R': 0, 'B': 0, 'N': 0, 'P': 0}
            captures = 0
            checks = 0
            pawn_density = 0
        if move_num == 15:
            attributes_15 = [piece_movements[piece] / 15 for piece in piece_movements.keys()]
            attributes_15 += [captures / 15, checks / 15, pawn_density / 15]
            piece_movements = {'K': 0, 'Q': 0, 'R': 0, 'B': 0, 'N': 0, 'P': 0}
            captures = 0
            checks = 0
            pawn_density = 0

    moves = int(len(move_lst) / 2)
    attributes_final = [piece_movements[piece] / moves for piece in piece_movements.keys()]
    attributes_final += [captures / moves, checks / moves, pawn_density / moves]
    attributes_final = [round(x, 2) for x in attributes_final]

    try:
        return attributes_5, attributes_10, attributes_15, attributes_final, opening_eval
    except:
        print(last_3_evals)
        print(move_lst)
        return attributes_5, attributes_10, attributes_15, attributes_final, opening_eval


In [141]:
get_move_attributes_game(sample_unpacked_moves)

([0.0, 0.2, 0.0, 0.2, 0.0, 0.6, 0.0, 0.2, 0.0],
 [0.0, 0.0, 0.1, 0.2, 0.1, 0.1, 0.0, 0.1, 0.0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0.0, 0.0, 0.0, 0.09, 0.0, 0.0, 0.09, 0.0, 0.0],
 -7.72)

#### Applying to Pandas DF 

Unpack Moves

In [142]:
games_df['unpacked_moves'] = games_df.apply(lambda row: unpack_moves(row['pgn']), axis = 1)
games_df[['unpacked_moves']].head(3)

Unnamed: 0,unpacked_moves
0,"[(d4, 0.25), (d5, 0.25), (Bf4, 0.0), (Nf6, 0.0..."
1,"[(e4, 0.24), (g6, 0.46), (d4, 0.28), (Bg7, 0.3..."
2,"[(e4, 0.24), (e5, 0.37), (d4, -0.11), (exd4, 0..."


Get move attributes

In [143]:
games_df['game_attributes'] = games_df.apply(lambda row: get_move_attributes_game(row['unpacked_moves']), axis = 1)
games_df[['game_attributes']].head(3)

Unnamed: 0,game_attributes
0,"([0.0, 0.0, 0.0, 0.4, 0.2, 0.4, 0.2, 0.0, 0.0]..."
1,"([0.0, 0.0, 0.0, 0.0, 0.4, 0.6, 0.0, 0.0, 0.0]..."
2,"([0.0, 0.2, 0.0, 0.2, 0.0, 0.6, 0.0, 0.2, 0.0]..."


#### Featurized Dataframe

In [144]:
games_df['move5'], games_df['move10'], games_df['move15'], games_df['final'], games_df['opening_eval'] = zip(*games_df['game_attributes'])
games_df['move5_K'], games_df['move5_Q'], games_df['move5_R'], games_df['move5_B'], games_df['move5_N'], games_df['move5_P'], games_df['move5_captures'], games_df['move5_checks'], games_df['move5_pawn_density'] = zip(*games_df['move5'])
games_df['move10_K'], games_df['move10_Q'], games_df['move10_R'], games_df['move10_B'], games_df['move10_N'], games_df['move10_P'], games_df['move10_captures'], games_df['move10_checks'], games_df['move10_pawn_density'] = zip(*games_df['move10'])
games_df['move15_K'], games_df['move15_Q'], games_df['move15_R'], games_df['move15_B'], games_df['move15_N'], games_df['move15_P'], games_df['move15_captures'], games_df['move15_checks'], games_df['move15_pawn_density'] = zip(*games_df['move15'])
games_df['final_K'], games_df['final_Q'], games_df['final_R'], games_df['final_B'], games_df['final_N'], games_df['final_P'], games_df['final_captures'], games_df['final_checks'], games_df['final_pawn_density'] = zip(*games_df['final'])

feature_df = games_df[['user_name', 'user_elo', 
                       'opening_code', 'opening_name', 'opening_category', 'opening_eval', 
                       'move5_K', 'move5_Q', 'move5_R', 'move5_B', 'move5_N', 'move5_P', 
                       'move5_captures', 'move5_checks', 'move5_pawn_density', 
                       'move10_K', 'move10_Q', 'move10_R', 'move10_B', 'move10_N', 'move10_P', 
                       'move10_captures', 'move10_checks', 'move10_pawn_density', 
                       'move15_K', 'move15_Q', 'move15_R', 'move15_B', 'move15_N', 'move15_P', 
                       'move15_captures', 'move15_checks', 'move15_pawn_density', 
                       'final_K', 'final_Q', 'final_R', 'final_B', 'final_N', 'final_P',
                       'final_captures', 'final_checks', 'final_pawn_density']]
feature_df.head(5)

Unnamed: 0,user_name,user_elo,opening_code,opening_name,opening_category,opening_eval,move5_K,move5_Q,move5_R,move5_B,...,move15_pawn_density,final_K,final_Q,final_R,final_B,final_N,final_P,final_captures,final_checks,final_pawn_density
0,bkrrhanife2,1772,D02,Queen's Pawn Game: London System,D,-3.98,0.0,0.0,0.0,0.4,...,0.0,0.07,0.0,0.07,0.0,0.14,0.0,0.14,0.0,0.0
1,Chessbullets,1828,B08,"Pirc Defense: Classical Variation, Schlechter ...",B,0.47,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.17,0.14,0.21,0.0,0.14,0.24,0.0
2,Tweaker,1407,C21,Danish Gambit,C,-7.72,0.0,0.2,0.0,0.2,...,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.09,0.0,0.0
3,monopunkt,1238,C46,Three Knights Opening #2,C,0.99,0.0,0.0,0.0,0.0,...,0.0,0.0,0.04,0.08,0.0,0.16,0.12,0.08,0.0,0.0
4,babar295,1064,C00,French Defense: Knight Variation,C,1.79,0.0,0.0,0.0,0.0,...,0.0,0.11,0.0,0.26,0.06,0.11,0.03,0.2,0.17,0.0


In [145]:
feature_df.columns

Index(['user_name', 'user_elo', 'opening_code', 'opening_name',
       'opening_category', 'opening_eval', 'move5_K', 'move5_Q', 'move5_R',
       'move5_B', 'move5_N', 'move5_P', 'move5_captures', 'move5_checks',
       'move5_pawn_density', 'move10_K', 'move10_Q', 'move10_R', 'move10_B',
       'move10_N', 'move10_P', 'move10_captures', 'move10_checks',
       'move10_pawn_density', 'move15_K', 'move15_Q', 'move15_R', 'move15_B',
       'move15_N', 'move15_P', 'move15_captures', 'move15_checks',
       'move15_pawn_density', 'final_K', 'final_Q', 'final_R', 'final_B',
       'final_N', 'final_P', 'final_captures', 'final_checks',
       'final_pawn_density'],
      dtype='object')

## **TO DO**

- Build recommender system

#### Write to CSV 

In [146]:
feature_df.to_csv('../data/feature_df.csv', index = False)