# **Setup**

### **Libraries**

In [15]:
import pandas as pd
from tqdm.auto import tqdm 
import re 

### **Data**

In [70]:
games_df = pd.read_csv('../data/processed_2019_06_df.csv')
games_df = games_df[['user_name', 'user_elo', 'opening_code', 'opening_name', 'pgn']]
games_df['opening_category'] = games_df['opening_code'].apply(lambda x: x[0])

print('Number of Rows :', len(games_df))

games_df.head(3)

Number of Rows : 97255


Unnamed: 0,user_name,user_elo,opening_code,opening_name,pgn,opening_category
0,bkrrhanife2,1772,D02,Queen's Pawn Game: London System,1. d4 { [%eval 0.25] [%clk 0:10:00] } 1... d5 ...,D
1,Chessbullets,1828,B08,"Pirc Defense: Classical Variation, Schlechter ...",1. e4 { [%eval 0.24] [%clk 0:10:00] } 1... g6 ...,B
2,Tweaker,1407,C21,Danish Gambit,1. e4 { [%eval 0.24] [%clk 0:00:30] } 1... e5 ...,C


# **Code**

### **Utils**

In [16]:
def get_substring(start_char, end_char, text):
    '''
    Helper function for unpack_game()
    '''

    search_string = start_char + '(.+?)' + end_char
    result = re.search(search_string, text)

    if result:
        return result.group(1)
    else:
        return None 

### **Data Exploration**

#### Unpacking PGN 

In [118]:
sample_pgn = games_df.iloc[2]['pgn']
moves = sample_pgn.split('. ')
moves[1:21]

['e4 { [%eval 0.24] [%clk 0:00:30] } 1..',
 'e5 { [%eval 0.37] [%clk 0:00:30] } 2',
 'd4 { [%eval -0.11] [%clk 0:00:33] } 2..',
 'exd4 { [%eval 0.0] [%clk 0:00:33] } 3',
 'c3 { [%eval -0.42] [%clk 0:00:37] } 3..',
 'Qe7 { [%eval -0.32] [%clk 0:00:36] } 4',
 'Bd3 { [%eval -0.52] [%clk 0:00:38] } 4..',
 'd5 { [%eval -0.37] [%clk 0:00:39] } 5',
 'Qa4+? { [%eval -2.13] [%clk 0:00:37] } 5..',
 'c6? { [%eval -0.78] [%clk 0:00:41] } 6',
 'Nf3?? { [%eval -5.75] [%clk 0:00:35] } 6..',
 'dxe4 { [%eval -5.68] [%clk 0:00:44] } 7',
 'Bg5 { [%eval -5.43] [%clk 0:00:28] } 7..',
 'Qe6? { [%eval -2.59] [%clk 0:00:44] } 8',
 'O-O { [%eval -2.49] [%clk 0:00:24] } 8..',
 'exf3 { [%eval -2.71] [%clk 0:00:43] } 9',
 'Bc4?? { [%eval -10.42] [%clk 0:00:13] } 9..',
 'Qg4 { [%eval -10.35] [%clk 0:00:42] } 10',
 'Re1+ { [%eval -10.26] [%clk 0:00:15] } 10..',
 'Be7 { [%eval -10.36] [%clk 0:00:44] } 11']

In [115]:
moves[-5:]

['Qg4 { [%eval -10.35] [%clk 0:00:42] } 10',
 'Re1+ { [%eval -10.26] [%clk 0:00:15] } 10..',
 'Be7 { [%eval -10.36] [%clk 0:00:44] } 11',
 'Bxe7?! { [%eval #-1] [%clk 0:00:16] } 11..',
 'Qxg2# { [%clk 0:00:42] } 0-1\n']

In [81]:
def unpack_moves(pgn_str):
    ''' 
    Returns a list of tuples, where each tuples is 
    (move, evaluation at move)
    '''
    moves = pgn_str.split('. ')[1:]
    unpacked_moves = []
    for move_str in moves:
        move = get_substring('', ' {', move_str)
        eval = get_substring('eval ', '] ', move_str)

        if eval == None:
            pass
        elif '#' in eval:
            eval = float(eval[1:])
        else:
            eval = float(eval)
        
        unpacked_moves.append((move, eval))

    return unpacked_moves

In [106]:
sample_unpacked_moves = unpack_moves(sample_pgn)
sample_unpacked_moves[:-5]

[('e4', 0.24),
 ('e5', 0.37),
 ('Nf3', 0.26),
 ('Nc6', 0.15),
 ('Bc4', 0.21),
 ('h6', 0.7),
 ('b3?', -0.4),
 ('Bc5?!', 0.21),
 ('Bb2', -0.08),
 ('d6', -0.17),
 ('h3', -0.29),
 ('Nf6', -0.64),
 ('d3', -0.15),
 ('Na5', -0.07),
 ('Qd2?!', -0.6),
 ('Nxc4', -0.5),
 ('dxc4?', -3.39),
 ('a6?', -0.95),
 ('Nc3', -0.86),
 ('c6?', 0.35),
 ('O-O-O', 0.39),
 ('Be6', 0.34),
 ('g4?!', -0.35),
 ('Bb4', -0.38),
 ('Qd3', -0.21),
 ('Qc7?!', 0.43),
 ('a3', 0.07),
 ('Bxc3?!', 0.58),
 ('Bxc3', 0.75),
 ('O-O-O', 0.89),
 ('Bb4?', -0.27),
 ('a5?!', 0.31),
 ('Bxd6', 0.09),
 ('Rxd6', 0.21),
 ('Qxd6', 0.19),
 ('Qxd6', 0.21),
 ('Rxd6', 0.21),
 ('Nxe4', 0.3),
 ('Rdd1?!', -0.64),
 ('Nxf2', -0.54),
 ('Nxe5', -0.4),
 ('Nxd1', -0.33),
 ('Rxd1', -0.35),
 ('f6', 0.0),
 ('Ng6', -0.1),
 ('Rd8', 0.0),
 ('Re1', 0.0),
 ('Re8', 0.0),
 ('Nf4', -0.29),
 ('Bf7', -0.09),
 ('Rd1', -0.34),
 ('Re4', -0.22),
 ('Nd3', -0.23),
 ('Re3', -0.17),
 ('h4', -0.58),
 ('Rg3', -0.41),
 ('b4?', -3.29),
 ('a4?', -1.65),
 ('b5?!', -2.5),
 ('cxb5', 

#### Feature Extraction

For a single tuple

In [83]:
def get_square(move_str):
    for i, c in enumerate(move_str):
        if c.isdigit():
            return move_str[i-1:i+1]
    
    return '!' 

def get_move_attributes(move_tuple):
    move = move_tuple[0]
    if move == None:
        return ('!', 0, 0, 0)

    capture = int('x' in move)
    check = int(('+' in move) or ('#' in move))
    pawn_density = 0 

    piece = move[0]
    if piece not in ['K', 'Q', 'R', 'B', 'N']:
        piece = 'P'

    square = get_square(move)

    central_squares = ['c3', 'c4', 'c5', 'c6', 
                       'd3', 'd4', 'd5', 'd6',  
                       'e3', 'e4', 'e5', 'e6',
                       'f3', 'f4', 'f5', 'f6']

    if piece == 'P' and square in central_squares:
        pawn_density += 1

    return (piece, capture, check, pawn_density)

In [84]:
get_move_attributes(('exe4+', 0.1))

('P', 1, 1, 1)

For a list of tuples

In [109]:
def get_move_attributes_game(move_lst):
    piece_movements = {'K': 0, 'Q': 0, 'R': 0, 'B': 0, 'N': 0, 'P': 0}
    captures = 0
    checks = 0
    pawn_density = 0

    attributes_5 = [0,0,0,0,0,0,0,0,0]
    attributes_10 = [0,0,0,0,0,0,0,0,0]
    attributes_15 = [0,0,0,0,0,0,0,0,0]
    attributes_final = []

    if len(move_lst) < 19 and len(move_lst) % 2 == 0:
        last_3_evals = [move_lst[-6][1],move_lst[-4][1],move_lst[-2][1]]
    elif len(move_lst) < 19 and len(move_lst) % 2 != 0:
        last_3_evals = [move_lst[-5][1],move_lst[-3][1],move_lst[-1][1]]
    else:
        last_3_evals = [move_lst[14][1],move_lst[16][1],move_lst[18][1]]

    last_3_evals = [x for x in last_3_evals if x is not None]
    opening_eval = round(sum(last_3_evals) / len(last_3_evals), 2)
    
    for move_num, move in enumerate(move_lst):

        if move_num % 2 != 0:
            continue
        else:
            move_num = (move_num / 2) + 1

        piece, capture, check, pawn_density = get_move_attributes(move)

        if piece == '!':
            continue 

        piece_movements[piece] += 1
        captures += capture
        checks += check
        pawn_density += pawn_density

        if move_num == 5:
            attributes_5 = [piece_movements[piece] for piece in piece_movements.keys()]
            attributes_5 += [captures, checks, pawn_density]
        if move_num == 10:
            attributes_10 = [piece_movements[piece] for piece in piece_movements.keys()]
            attributes_10 += [captures, checks, pawn_density]
        if move_num == 15:
            attributes_15 = [piece_movements[piece] for piece in piece_movements.keys()]
            attributes_15 += [captures, checks, pawn_density]

    attributes_final = [piece_movements[piece] for piece in piece_movements.keys()]
    attributes_final += [captures, checks, pawn_density]

    try:
        return attributes_5, attributes_10, attributes_15, attributes_final, opening_eval
    except:
        print(last_3_evals)
        print(move_lst)
        return attributes_5, attributes_10, attributes_15, attributes_final, opening_eval


In [110]:
get_move_attributes_game(sample_unpacked_moves)

([0, 0, 0, 2, 1, 2, 0, 0, 0],
 [0, 1, 0, 2, 2, 5, 1, 0, 0],
 [0, 2, 0, 3, 2, 8, 2, 0, 0],
 [1, 3, 8, 5, 10, 12, 11, 0, 0],
 -1.62)

#### Applying to Pandas DF 

Unpack Moves

In [111]:
games_df['unpacked_moves'] = games_df.apply(lambda row: unpack_moves(row['pgn']), axis = 1)
games_df[['unpacked_moves']].head(3)

Unnamed: 0,unpacked_moves
0,"[(d4, 0.25), (d5, 0.25), (Bf4, 0.0), (Nf6, 0.0..."
1,"[(e4, 0.24), (g6, 0.46), (d4, 0.28), (Bg7, 0.3..."
2,"[(e4, 0.24), (e5, 0.37), (d4, -0.11), (exd4, 0..."


Get move attributes

In [112]:
games_df['game_attributes'] = games_df.apply(lambda row: get_move_attributes_game(row['unpacked_moves']), axis = 1)
games_df[['game_attributes']].head(3)

Unnamed: 0,game_attributes
0,"([0, 0, 0, 2, 1, 2, 1, 0, 0], [0, 4, 0, 2, 1, ..."
1,"([0, 0, 0, 0, 2, 3, 0, 0, 0], [0, 1, 0, 1, 3, ..."
2,"([0, 1, 0, 1, 0, 3, 0, 1, 0], [0, 1, 1, 3, 1, ..."


#### Featurized Dataframe

In [119]:
games_df['move5'], games_df['move10'], games_df['move15'], games_df['final'], games_df['opening_eval'] = zip(*games_df['game_attributes'])
games_df['move5_K'], games_df['move5_Q'], games_df['move5_R'], games_df['move5_B'], games_df['move5_N'], games_df['move5_P'], games_df['move5_captures'], games_df['move5_checks'], games_df['move5_pawn_density'] = zip(*games_df['move5'])
games_df['move10_K'], games_df['move10_Q'], games_df['move10_R'], games_df['move10_B'], games_df['move10_N'], games_df['move10_P'], games_df['move10_captures'], games_df['move10_checks'], games_df['move10_pawn_density'] = zip(*games_df['move10'])
games_df['move15_K'], games_df['move15_Q'], games_df['move15_R'], games_df['move15_B'], games_df['move15_N'], games_df['move15_P'], games_df['move15_captures'], games_df['move15_checks'], games_df['move15_pawn_density'] = zip(*games_df['move15'])

feature_df = games_df[['user_name', 'user_elo', 'opening_code', 'opening_name', 'opening_category', 'opening_eval', 'move5_K', 'move5_Q', 'move5_R', 'move5_B', 'move5_N', 'move5_P', 'move5_captures', 'move5_checks', 'move5_pawn_density', 'move10_K', 'move10_Q', 'move10_R', 'move10_B', 'move10_N', 'move10_P', 'move10_captures', 'move10_checks', 'move10_pawn_density', 'move15_K', 'move15_Q', 'move15_R', 'move15_B', 'move15_N', 'move15_P', 'move15_captures', 'move15_checks', 'move15_pawn_density']]
feature_df.head(5)

Unnamed: 0,user_name,user_elo,opening_code,opening_name,opening_category,opening_eval,move5_K,move5_Q,move5_R,move5_B,...,move10_pawn_density,move15_K,move15_Q,move15_R,move15_B,move15_N,move15_P,move15_captures,move15_checks,move15_pawn_density
0,bkrrhanife2,1772,D02,Queen's Pawn Game: London System,D,-3.98,0,0,0,2,...,2,0,0,0,0,0,0,0,0,0
1,Chessbullets,1828,B08,"Pirc Defense: Classical Variation, Schlechter ...",B,0.47,0,0,0,0,...,0,0,1,1,4,4,5,3,0,0
2,Tweaker,1407,C21,Danish Gambit,C,-7.72,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,monopunkt,1238,C46,Three Knights Opening #2,C,0.99,0,0,0,0,...,2,0,1,0,3,3,8,4,0,0
4,babar295,1064,C00,French Defense: Knight Variation,C,1.79,0,0,0,0,...,0,1,2,2,2,3,5,5,0,0


In [105]:
feature_df.columns

Index(['user_name', 'user_elo', 'opening_code', 'opening_name',
       'opening_category', 'opening_eval', 'move5_K', 'move5_Q', 'move5_R',
       'move5_B', 'move5_N', 'move5_P', 'move5_captures', 'move5_checks',
       'move5_pawn_density', 'move10_K', 'move10_Q', 'move10_R', 'move10_B',
       'move10_N', 'move10_P', 'move10_captures', 'move10_checks',
       'move10_pawn_density', 'move15_K', 'move15_Q', 'move15_R', 'move15_B',
       'move15_N', 'move15_P', 'move15_captures', 'move15_checks',
       'move15_pawn_density'],
      dtype='object')

## **TO DO**

- Define opening success 
- Build recommender system

#### Write to CSV 

In [104]:
feature_df.to_csv('../data/feature_df.csv', index = False)