In [146]:
import ast
import datetime
import zipfile

import chess.pgn
import pandas as pd
import tqdm

pd.options.display.max_columns=999

# Process PGN into a Python-Friendlier Form

In [147]:
# Base name (without extension) shared by the PGN export and the CSV derived from it.
file_name = 'apendra_games'
# Lichess user name of the player whose games are analysed.
player_name = 'Apendra'

In [148]:
NUM_GAMES=1800  # from https://database.nikonoel.fr/ -- upper bound on games read; also lets tqdm show an estimate
rows=[]
# Read the PGN as UTF-8 explicitly: the platform default encoding can turn
# non-ASCII header text (e.g. opening names) into mojibake.
with open(f'../../testData/lichess_db_standard_rated_2013-04/{file_name}.pgn', encoding='utf-8') as pgn:
    for _ in tqdm.tqdm(range(NUM_GAMES)):
        game = chess.pgn.read_game(pgn)
        if game is None:
            # read_game returns None once the file is exhausted, so stop
            # instead of failing when NUM_GAMES over-estimates the file.
            break
        rows.append({
            # Raw header storage as a plain dict; later cells read its
            # '_tag_roster' and '_others' sections.
            'headers': game.headers.__dict__,
            # Main line only, one UCI string per half-move.
            'moves': [move.uci() for move in game.mainline_moves()],
        })
games=pd.DataFrame(rows)

100%|██████████| 1800/1800 [00:05<00:00, 343.58it/s]


In [107]:
# Legacy helper -- original author's note: "completely unnecessary" for this pipeline.
def split_chess_moves(moves_list):
    """Pull the individual move tokens out of numbered move text.

    Each string in ``moves_list`` is scanned for groups shaped like
    ``<move number>[.] <move> [<time>] [<reply>] [<time>]`` (for example
    ``"1. e4 e5"``). Move numbers and time annotations are discarded.

    Args:
        moves_list: iterable of strings containing numbered move text.

    Returns:
        A flat list of the move tokens found, in order of appearance.
    """
    # Imported locally: the notebook's import cell does not bring `re` into scope.
    import re

    pattern = r'\s*(\d{1,3})\.?\s*((?:(?:O-O(?:-O)?)|(?:[KQNBR][1-8a-h]?x?[a-h]x?[1-8])|(?:[a-h]x?[a-h]?[1-8]\=?[QRNB]?))\+?)(?:\s*\d+\.?\d+?m?s)?\.?\s*((?:(?:O-O(?:-O)?)|(?:[KQNBR][1-8a-h]?x?[a-h]x?[1-8])|(?:[a-h]x?[a-h]?[1-8]\=?[QRNB]?))\+?)?(?:\s*\d+\.?\d+?m?s)?'

    processed_moves = []
    for text in moves_list:
        # Every hit is a (move_number, first_move, second_move) tuple; the
        # second move is '' when the group holds a single move.
        for hit in re.findall(pattern, text):
            processed_moves.extend(token for token in hit[1:] if token)

    return processed_moves

# Deliberately NOT applied. games['moves'] already holds clean UCI strings
# (built with move.uci() when the PGN was parsed); running this helper over
# them would rewrite each entry to a partial token (e.g. 'e2e4' -> 'e4').
# games['moves'] = games['moves'].apply(split_chess_moves)

Loading a big PGN file is a little slow. There are more advanced architectures designed to handle this, since data is sometimes created faster than a single machine can process it; essentially, you would do stream processing with scalable worker nodes. Another optimization would be multi-threading, since there is a lot of I/O (input/output), though multi-processing may work well too. Since 20 minutes isn't the end of the world, we will just wait for now.

In [149]:
# Keep only the first MAX_PLIES entries of each game's move list
# (every entry is one half-move, so 30 entries = 15 moves per side).
MAX_PLIES = 30
games['moves'] = games['moves'].apply(lambda plies: plies[:MAX_PLIES])

In [150]:
# Persist the parsed games as CSV so they can be reloaded quickly without re-parsing the PGN.
csv_path = f"../../testData/{file_name}.csv"
games.to_csv(csv_path, index=False)


# Game Analysis
As mentioned in the intro, we want to run through these games to add extra analytics such as:
- position complexity (count blunders by player, average complexity)
- openings used
- win rates by Elo rating and openings

In [184]:
# Reload the games from the CSV and display the frame.
csv_path = f"../../testData/{file_name}.csv"
games = pd.read_csv(csv_path)
games

Unnamed: 0,headers,moves
0,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'e7e5', 'g1f3', 'b8c6', 'f1b5', 'a7a6..."
1,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['c2c4', 'e7e5', 'c4c5', 'f8c5', 'g2g3', 'g8f6..."
2,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'd7d5', 'e4d5', 'd8d5', 'b1c3', 'd5d8..."
3,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'e7e5', 'g1f3', 'b8c6', 'f1c4', 'f8c5..."
4,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'c7c5', 'g1f3', 'e7e6', 'c2c3', 'd7d5..."
...,...,...
1795,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'e7e5', 'f2f4', 'b8c6', 'g1f3', 'e5f4..."
1796,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['d2d4', 'd7d5', 'c2c4', 'd5c4', 'b1c3', 'g8f6..."
1797,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'c7c5', 'g1f3', 'd7d6', 'b1c3', 'b8c6..."
1798,"{'_tag_roster': {'Event': 'Rated Bullet game',...","['e2e4', 'e7e6', 'g1f3', 'a7a6', 'd2d4', 'h7h6..."


In [185]:
# The CSV round-trip stored the 'moves' lists and 'headers' dicts as their
# Python repr strings; turn them back into real objects.
#
# ast.literal_eval only accepts Python literals, so -- unlike eval -- it
# cannot execute code that might be embedded in the file.
def _parse_literal(text):
    """Parse a repr string into a Python object; return already-parsed values unchanged."""
    # The pass-through keeps this cell safe to re-run after the columns were converted.
    return ast.literal_eval(text) if isinstance(text, str) else text

games['moves'] = games['moves'].apply(_parse_literal)      # str -> list of move strings
games['headers'] = games['headers'].apply(_parse_literal)  # str -> dict

## Grab Interesting Variables
We're going to parse out specific data points from headers and moves. Anything interesting we will store as a top-level column on the dataframe so when we ultimately store this data it's ready to go for analysis.

If you just want everything from the headers, you can use `pd.json_normalize`.

In [186]:
def safe_convert_to_int(value):
    """Convert ``value`` to ``int``, returning ``None`` when that is not possible.

    Args:
        value: anything accepted by ``int()``, typically a numeric string.

    Returns:
        The integer value, or ``None`` if ``value`` is not a valid integer
        literal (e.g. ``'?'`` or ``''``) or is of an unsupported type
        (e.g. ``None``).
    """
    try:
        return int(value)
    except (ValueError, TypeError):
        # ValueError: malformed text; TypeError: non-convertible type such as None.
        return None  # or you can use np.nan or a placeholder like -1

In [187]:
# Name of the player whose games are being analysed.
# NOTE(review): this repeats the value assigned in the notebook's configuration
# cell -- keep the two in sync if the player changes.
player_name = 'Apendra'

def find_player_color(header, player=None):
    """Report which side a player had in a game.

    Args:
        header: header dict with a '_tag_roster' section holding the
            'White' and 'Black' player names.
        player: name to look for; defaults to the module-level
            ``player_name`` when omitted.

    Returns:
        'White' or 'Black' depending on where the name matches, or
        'Unknown' when the player appears on neither side.
    """
    if player is None:
        player = player_name
    roster = header.get("_tag_roster", {})
    if roster.get("White", "") == player:
        return 'White'
    if roster.get("Black", "") == player:
        return 'Black'
    return 'Unknown'  # In case the player is not found in either

def player_score(header, player_color):
    """Score a game from the analysed player's point of view.

    Args:
        header: header dict whose '_tag_roster' section carries 'Result'.
        player_color: 'White' or 'Black' (any other value never wins).

    Returns:
        0.5 for a draw, 1.0 when the result is a win for ``player_color``,
        and 0.0 otherwise (a loss, or a missing/unrecognised result).
    """
    outcome = header.get('_tag_roster', {}).get('Result', '')
    if outcome == "1/2-1/2":
        return 0.5
    # (result, colour) combinations that count as a win for the player.
    winning_pairs = (("1-0", 'White'), ("0-1", 'Black'))
    return 1.0 if (outcome, player_color) in winning_pairs else 0.0



In [144]:
# Inspect the header dict of the row labelled 0 to see which fields are available.
games.loc[0, 'headers']

{'_tag_roster': {'Event': 'Rated Bullet game',
  'Site': 'https://lichess.org/R5xvWQmB',
  'Date': '2024.03.14',
  'Round': '?',
  'White': 'Amir_122012',
  'Black': 'Apendra',
  'Result': '1-0'},
 '_others': {'UTCDate': '2024.03.14',
  'UTCTime': '10:05:06',
  'WhiteElo': '1393',
  'BlackElo': '1490',
  'WhiteRatingDiff': '+7',
  'BlackRatingDiff': '-7',
  'Variant': 'Standard',
  'TimeControl': '60+0',
  'ECO': 'C68',
  'Opening': 'Ruy Lopez: Exchange Variation, Keres Variation',
  'Termination': 'Time forfeit'}}

In [193]:
def _header_field(header, section, key, default=None):
    """Return header[section][key], or `default` when the section or key is absent."""
    return header.get(section, {}).get(key, default)

def _leading_elo(header, key):
    """Rating under `key` in '_others' as an int, using only the text before the first '-'."""
    rating_text = _header_field(header, "_others", key, "")
    return safe_convert_to_int(rating_text.split("-")[0])

header_col = games['headers']

# The 'Site' tag holds the game URL (e.g. https://lichess.org/R5xvWQmB).
games['lichess_id'] = header_col.apply(lambda h: _header_field(h, "_tag_roster", "Site", ""))

games['white_elo'] = header_col.apply(lambda h: _leading_elo(h, "WhiteElo"))
games['black_elo'] = header_col.apply(lambda h: _leading_elo(h, "BlackElo"))
games['ECO'] = header_col.apply(lambda h: _header_field(h, "_others", "ECO"))  # missing ECO stays None
games['opening_name'] = header_col.apply(lambda h: _header_field(h, "_others", "Opening", ""))
games['event'] = header_col.apply(lambda h: _header_field(h, "_tag_roster", "Event", ""))

# The score depends on the colour, so the colour column must be filled first.
games['player_color'] = header_col.apply(find_player_color)
games['score'] = games.apply(lambda row: player_score(row['headers'], row['player_color']), axis=1)

games.head()


Unnamed: 0,headers,moves,lichess_id,white_elo,black_elo,ECO,opening_name,event,player_color,score
0,"{'_tag_roster': {'Event': 'Rated Bullet game',...","[e2e4, e7e5, g1f3, b8c6, f1b5, a7a6, b5c6, d7c...",https://lichess.org/R5xvWQmB,1393.0,1490.0,C68,"Ruy Lopez: Exchange Variation, Keres Variation",Rated Bullet game,Black,0.0
1,"{'_tag_roster': {'Event': 'Rated Bullet game',...","[c2c4, e7e5, c4c5, f8c5, g2g3, g8f6, f1g2, d7d...",https://lichess.org/W4qIfyPj,1468.0,1485.0,A20,English Opening: King's English Variation,Rated Bullet game,Black,1.0
2,"{'_tag_roster': {'Event': 'Rated Bullet game',...","[e2e4, d7d5, e4d5, d8d5, b1c3, d5d8, g1f3, g7g...",https://lichess.org/uAyyykrW,1490.0,1499.0,B01,Scandinavian Defense: Valencian Variation,Rated Bullet game,White,0.0
3,"{'_tag_roster': {'Event': 'Rated Bullet game',...","[e2e4, e7e5, g1f3, b8c6, f1c4, f8c5, b2b4, c5b...",https://lichess.org/vBCaaIq2,1549.0,1484.0,C52,"Italian Game: Evans Gambit, Pierce Defense",Rated Bullet game,Black,1.0
4,"{'_tag_roster': {'Event': 'Rated Bullet game',...","[e2e4, c7c5, g1f3, e7e6, c2c3, d7d5, e4d5, e6d...",https://lichess.org/AAfUu8jz,1477.0,1514.0,B40,"Sicilian Defense: Delayed Alapin Variation, wi...",Rated Bullet game,White,1.0


Data looks mostly good but I noticed "Grünfeld Defense: Exchange Variation" showing up funky. We can use ftfy to fix this.

In [189]:
#If this import fails, install the pinned version first: pip install ftfy==6.1.1 (recently added to requirements.txt)
import ftfy
# Demonstrate the repair on one mis-decoded opening name.
mojibake_sample = "GrÃ¼nfeld Defense: Exchange Variation"
ftfy.fix_encoding(mojibake_sample)

'Grünfeld Defense: Exchange Variation'

In [26]:
#Run this cell if the ftfy import failed. Jupyter can run shell commands with !, but the
#%pip magic is used here because it installs into the environment of the running kernel.
%pip install ftfy==6.1.1

^C


In [190]:
# Repair mis-decoded characters in every opening name, then show the column to check the result.
repaired_names = games['opening_name'].apply(ftfy.fix_encoding)
games['opening_name'] = repaired_names
games['opening_name']

0          Ruy Lopez: Exchange Variation, Keres Variation
1               English Opening: King's English Variation
2               Scandinavian Defense: Valencian Variation
3              Italian Game: Evans Gambit, Pierce Defense
4       Sicilian Defense: Delayed Alapin Variation, wi...
                              ...                        
1795              King's Gambit Accepted: MacLeod Defense
1796                              Queen's Gambit Accepted
1797                  Sicilian Defense: Modern Variations
1798                     French Defense: Knight Variation
1799                            Ruy Lopez: Berlin Defense
Name: opening_name, Length: 1800, dtype: object

Opening name looks good now!

In [192]:
# Sanity check on the parsed scores: tally how many games fall under each score value.
score_counts = games['score'].value_counts()
score_counts

1.0    894
0.0    852
0.5     54
Name: score, dtype: int64

In [194]:
# Write the frame, including the newly derived columns, back to the CSV (row index omitted).
csv_path = f"../../testData/{file_name}.csv"
games.to_csv(csv_path, index=False)
