In [1]:
import chess.pgn
import csv
import pandas as pd
import os
import io

In [2]:
def extract_combined_moves_and_comments(pgn_text):
    # Convert the string to a StringIO object
    pgn = io.StringIO(pgn_text)
    
    # Read the game
    game = chess.pgn.read_game(pgn)
    
    combined_moves_comments = []
    move_number = 1
    
    node = game
    while not node.is_end():
        next_node = node.variation(0)
        board = str(node.board())
        
        move = node.board().uci(next_node.move)
        comment = next_node.comment
        
        player = "White" if move_number % 2 == 1 else "Black"
        combined_moves_comments.append((move_number, player, move,board,  comment))
        
        node = next_node
        move_number += 1
    
    return combined_moves_comments


In [3]:
def read_pgn_and_write_to_csv(pgn_file_path, csv_file_path,game_number=1):
    with open(pgn_file_path, 'r', encoding='utf-8') as pgn:
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            # Write the headers
            csvwriter.writerow(['Game Number', 'Move Number', 'Player', 'Move',"Board", 'Comment'])
            
            
            while True:
                game = chess.pgn.read_game(pgn)
                if game is None:
                    break  # End of file or no more games
                
                combined_moves_comments = extract_combined_moves_and_comments(str(game))
                
                # Write each move's data to the CSV, prepending the game number
                for move_number, player, move,board, comment in combined_moves_comments:
                    csvwriter.writerow([game_number, move_number, player, move, board, comment])
                
                game_number += 1
    return game_number

In [4]:
def process_pgn_files(directory):
    game_number = 1

    for filename in os.listdir(directory):
        if filename.endswith('.pgn'):
            pgn_file_path = os.path.join(directory, filename)
            csv_file_path = os.path.join(directory, os.path.splitext(filename)[0] + '.csv')
            print(filename)

            # Now read the PGN file and write to the corresponding CSV file
            game_number = read_pgn_and_write_to_csv(pgn_file_path, csv_file_path,game_number)
            print(f"Processed {filename} to {os.path.splitext(filename)[0] + '.csv'}")

In [5]:
directory = './'
process_pgn_files(directory)

linares_2002.pgn
Processed linares_2002.pgn to linares_2002.csv
middleg.pgn
Processed middleg.pgn to middleg.csv
russian_chess.pgn
Processed russian_chess.pgn to russian_chess.csv
hartwig.pgn
Processed hartwig.pgn to hartwig.csv
newyork1924.pgn
Processed newyork1924.pgn to newyork1924.csv


In [7]:
#Combine all csv files into one
all_filenames = [i for i in os.listdir(directory) if i.endswith('.csv')]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [6]:
combined = pd.read_csv('combined.csv')

In [13]:
combined.head()

Unnamed: 0,Game Number,Move Number,Player,Move,Board,Comment
0,576,1,White,b2b4,r n b q k b n r\np p p p p p p p\n. . . . . . ...,"WHAT?!?! In the true hypermodern style, Tartak..."
1,576,2,Black,e7e6,r n b q k b n r\np p p p p p p p\n. . . . . . ...,Maroczy (Black) makes an illogical move of his...
2,576,3,White,c1b2,r n b q k b n r\np p p p . p p p\n. . . . p . ...,Tartakower doesn't need to defend the pawn yet...
3,576,4,Black,g8f6,r n b q k b n r\np p p p . p p p\n. . . . p . ...,"Maroczy follows more conventional lines, devel..."
4,576,5,White,b4b5,r n b q k b . r\np p p p . p p p\n. . . . p n ...,Tartakower inhibits the development of Black's...


In [12]:
temp = combined.iloc[0]

In [16]:
chess.Board(temp['Board'])

ValueError: expected 'w' or 'b' for turn part of fen: 'r n b q k b n r\np p p p p p p p\n. . . . . . . .\n. . . . . . . .\n. . . . . . . .\n. . . . . . . .\nP P P P P P P P\nR N B Q K B N R'

In [11]:
#Check percentage of NaN in Comment column
print(combined['Comment'].isna().sum() / len(combined) * 100)

81.60295880834806


In [7]:
#read as df limit 1000
df = pd.read_csv('combined.csv')

In [9]:
print(df['Comment'].isna().sum() / len(df) * 100)

81.60295880834806


In [None]:
df['Comment'] = df['Comment'].fillna('This move is not enough for a comment')

In [10]:
print(df['Comment'].isna().sum() / len(df) * 100)

0.0


In [11]:
#save to csv
df.to_csv('combined.csv', index=False)