In [1]:
import chess
import pandas as pd
import json
import matplotlib.pyplot as plt
from stockfish import Stockfish
from concurrent.futures import ProcessPoolExecutor, as_completed
import numpy as np
import joblib

def load_evaluations_cache():
    file_path = "../../../data/indexed_positions/final_processed_index.json"
    with open(file_path, 'r') as file:
        cache = json.load(file)
    return cache

def initialize_stockfish(path, depth, skill_level):
    stockfish = Stockfish(path)
    stockfish.set_depth(depth)
    stockfish.set_skill_level(skill_level)
    return stockfish

def build_stored_game_analysis(game, move_number, stockfish, evaluations_cache):
    row = {}
    row['move_number'] = move_number
    board = chess.Board()

    for san in game.moves[:move_number]:
        move = board.parse_san(san)
        board.push(move)

    fen = board.fen()
    row['fen'] = fen
    row['cached'] = fen in evaluations_cache

    if fen in evaluations_cache:
        evaluation_value = evaluations_cache[fen]
    else:
        stockfish.set_fen_position(fen)
        evaluation = stockfish.get_evaluation()
        evaluation_value = evaluation['value']

    row['evaluation'] = evaluation_value

    return row

def analyze_games(chunk, stockfish_path, depth, skill_level):
    stockfish = initialize_stockfish(stockfish_path, depth, skill_level)
    evaluations_cache = load_evaluations_cache()

    all_game_analysis = []
    for game in chunk.itertuples(index=False):
        game_analysis = []
        move_number = 1

        while move_number <= len(game.moves):
            analysis_result = build_stored_game_analysis(game, move_number, stockfish, evaluations_cache)
            game_analysis.append(analysis_result)
            move_number += 1

        df = pd.DataFrame(game_analysis)
        df.set_index('move_number', inplace=True)
        all_game_analysis.append(df)

    return all_game_analysis

def parallel_game_analysis(games, stockfish_path, depth, skill_level, workers):
    chunk_size = len(games) // workers
    game_chunks = [games.iloc[i:i + chunk_size] for i in range(0, len(games), chunk_size)]
    args = [(chunk, stockfish_path, depth, skill_level) for chunk in game_chunks]

    results = []
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(analyze_games, *arg) for arg in args]
        for future in as_completed(futures):
            results.extend(future.result())
    return results

def get_analyzed_games(games):
    n_games = 100 # len(games)
    skill_level = 1
    depth = 1
    num_threads = 2  # Adjust the number of threads as necessary
    stockfish_path = "C:/Users/aober/Documents/Data_Science_Studium/4Semester/BigData/stockfish/stockfish-windows-x86-64-avx2.exe"

    games = games[:n_games]
    games.loc[:, 'moves'] = games['moves'].apply(lambda x: x.split() if isinstance(x, str) else x)

    results = parallel_game_analysis(games, stockfish_path, depth, skill_level, num_threads)
    return results

def plot_cached_positions_distribution(all_game_analysis):
    move_numbers = []
    cached_counts = []
    not_cached_counts = []

    for game_df in all_game_analysis:
        cached_positions = game_df['cached'].value_counts().get(True, 0)
        not_cached_positions = game_df['cached'].value_counts().get(False, 0)
        move_numbers.extend(game_df.index)

        cached_counts.extend([1 if cached else 0 for cached in game_df['cached']])
        not_cached_counts.extend([0 if cached else 1 for cached in game_df['cached']])

    df = pd.DataFrame({
        'move_number': move_numbers,
        'cached': cached_counts,
        'not_cached': not_cached_counts
    })

    # Aggregate counts by move number
    aggregated_df = df.groupby('move_number').sum()

    # Plotting the stacked bar chart
    plt.figure(figsize=(15, 6))
    aggregated_df.plot(kind='bar', stacked=True, color=['#003f5c', '#ffa600'], alpha=0.75, edgecolor='black')
    plt.title('Cached vs. Not Cached Positions by Move Number')
    plt.xlabel('Move Number')
    plt.ylabel('Count of Positions')
    plt.grid(True)
    plt.show()

if __name__ == "__main__":
    file_name = "../../data/pipeline_test/processed_games.json"
    with open(file_name, 'r') as file:
        games = json.load(file)

    games = pd.DataFrame(games)
    analyzed_games = get_analyzed_games(games)

    plot_cached_positions_distribution(analyzed_games)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games.loc[:, 'moves'] = games['moves'].apply(lambda x: x.split() if isinstance(x, str) else x)


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [3]:

    # Aggregate counts by move number
    aggregated_df = df.groupby('move_number').sum()

    # Plotting the stacked bar chart
    plt.figure(figsize=(15, 6))
    aggregated_df.plot(kind='bar', stacked=True, color=['#003f5c', '#ffa600'], alpha=0.75, edgecolor='black')
    plt.title('Cached vs. Not Cached Positions by Move Number')
    plt.xlabel('Move Number')
    plt.ylabel('Count of Positions')
    plt.grid(True)
    plt.show()



NameError: name 'df' is not defined