In [None]:
import pandas as pd
import tqdm

def process_games(file_path, start_game=0, end_game=None, batch_size=500000):
    """
    Expand the move list of every game in a CSV file into cumulative positions and
    count how often each position occurs.

    For a game with moves ``['e4', 'e5']`` the positions ``'e4'`` (move 1) and
    ``'e4 e5'`` (move 2) are recorded. A position is identified by the pair
    (position string, move number); when the same pair is seen again its count is
    incremented and the batch of its FIRST occurrence is kept.

    :param file_path: Path to the CSV file. It must contain a 'moves' column whose
        cells are Python list literals of move strings.
    :param start_game: Index of the first game to process (0-indexed).
    :param end_game: Index of the last game to process (exclusive, None processes all).
    :param batch_size: The number of games per batch. Batch numbers start at 1 and
        are derived from the game's position in the file (index // batch_size + 1).
    :return: DataFrame with the columns 'position', 'move_number', 'batch' and
        'count', one row per distinct (position, move_number) pair, in order of
        first appearance. The frame is empty (but has these columns) when no
        moves were found.
    """
    # Local import: keeps the function self-contained without editing the import cell.
    import ast

    columns = ['position', 'move_number', 'batch', 'count']

    games = pd.read_csv(file_path)

    # Slice the DataFrame to only include the specified range of games
    games_to_process = games.iloc[start_game:end_game]

    # (position, move_number) -> record dict that becomes one output row
    position_dict = {}

    # Only the 'moves' column is needed, so iterate over it directly instead of
    # materialising a Series per row with iterrows().
    progress = tqdm.tqdm(
        enumerate(games_to_process['moves'], start=start_game),
        total=games_to_process.shape[0],
    )
    for index, raw_moves in progress:
        # Empty cells are read back as NaN; skip those and empty strings
        if pd.isna(raw_moves) or not raw_moves:
            continue

        # The cell comes from a file, so parse it as a literal instead of running
        # it through eval(), which would execute arbitrary code found in the CSV.
        moves = ast.literal_eval(raw_moves)
        move_sequence = ''
        batch_number = (index // batch_size) + 1

        for move_number, move in enumerate(moves, start=1):
            move_sequence += move + ' '
            position = move_sequence.strip()

            # Use a composite key of position and move number
            key = (position, move_number)

            record = position_dict.get(key)
            if record is None:
                position_dict[key] = {
                    'position': position,
                    'move_number': move_number,
                    'batch': batch_number,
                    'count': 1,
                }
            else:
                record['count'] += 1

    if not position_dict:
        return pd.DataFrame(columns=columns)

    # Convert the dictionary to a DataFrame in a handful of chunks, releasing the
    # records of each chunk from the dictionary as soon as they are converted.
    # The key list is snapshotted ONCE: re-reading the keys after deletions while
    # advancing the offset would skip entries. max(1, ...) keeps the step valid
    # when there are fewer positions than chunks.
    keys = list(position_dict)
    chunk_size = max(1, -(-len(keys) // 5))  # ceiling division -> at most 5 chunks

    frames = []
    for start in range(0, len(keys), chunk_size):
        chunk_records = [position_dict.pop(key) for key in keys[start:start + chunk_size]]
        frames.append(pd.DataFrame(chunk_records, columns=columns))

    # Single concat at the end instead of growing a DataFrame inside the loop
    return pd.concat(frames, ignore_index=True)


In [None]:


# Build the position table from the first 1,000,000 games of the reference file.
# batch_size equals the game limit, so every processed game is assigned batch 1.
file_path = '../../testData/splitted_games/processed_moves2.csv'
processed_games_df = process_games(
    file_path,
    start_game=0,
    end_game=1000000,
    batch_size=1000000,
)


In [None]:
# Show the expanded positions table as this cell's output.
processed_games_df

In [None]:
# Persist the expanded positions table as CSV; index=False leaves the row index out of the file.
processed_games_df.to_csv('../../testData/splitted_games/processed_games_moves.csv', index=False)


In [None]:
# Report how many rows the expanded positions table holds.
num_rows = len(processed_games_df)
print(f"The number of rows in the DataFrame is: {num_rows}")


In [None]:
# processed_games_df already holds ONE row per (position, move_number) and stores the
# number of occurrences in its 'count' column. Occurrences therefore have to be summed
# from that column; counting rows with .size() would report 1 for every group.

# Occurrences of each first-move position, per batch
move_one_counts = (
    processed_games_df[processed_games_df['move_number'] == 1]
    .groupby(['batch', 'position'])['count']
    .sum()
    .reset_index(name='count')
)

# Occurrences of each position sequence at each move number, per batch
move_counts = (
    processed_games_df
    .groupby(['move_number', 'batch', 'position'])['count']
    .sum()
    .reset_index(name='count')
)


In [None]:
import matplotlib.pyplot as plt

# Number of distinct positions reached at each move number
unique_positions_per_move = processed_games_df.groupby('move_number')['position'].nunique()

# Total number of times any position was reached at each move number.
# The table has one row per distinct position, so the occurrences must be summed from
# the 'count' column; counting rows would just repeat the number of unique positions.
total_positions_per_move = processed_games_df.groupby('move_number')['count'].sum()

# Draw both series on one explicit Axes using the move numbers as x values.
# (A pandas bar plot places bars at 0..n-1 while a pandas line plot uses the index
# values, which would shift the line relative to the bars.)
fig, ax = plt.subplots(figsize=(10, 6))

# Bars for unique positions
ax.bar(
    unique_positions_per_move.index,
    unique_positions_per_move.values,
    alpha=0.5,
    label='Unique Positions',
)

# Line for the total count of positions
ax.plot(
    total_positions_per_move.index,
    total_positions_per_move.values,
    color='red',
    marker='o',
    label='Total Count of Positions',
)

ax.set_title('Unique Positions and Total Count of Positions per Move Sequence')
ax.set_xlabel('Move Number')
ax.set_ylabel('Count')
ax.set_xticks(list(unique_positions_per_move.index))  # one tick per move number
ax.tick_params(axis='x', rotation=0)  # Keep the x-axis labels readable
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Split the positions reached at move 3 into unique (seen once) and duplicate (seen more
# than once) and count each group. Uniqueness is derived from 'count' right here, because
# the 'is_unique' column is only added by a cell further down in the notebook.
move3_rows = processed_games_df[processed_games_df['move_number'] == 3]
move3_is_unique = move3_rows['count'] == 1

filtered_df_true = move3_rows[move3_is_unique].groupby('move_number').size()
filtered_df_false = move3_rows[~move3_is_unique].groupby('move_number').size()

# Print both results (the previous print referenced an undefined name)
print(filtered_df_true)
print(filtered_df_false)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Count the number of unique and duplicate positions per move number.
# Uniqueness is derived from 'count' here so the cell does not depend on the
# 'is_unique' column that a later cell adds.
is_unique_flag = processed_games_df['count'].eq(1).rename('is_unique')
count_df = (
    processed_games_df
    .groupby(['move_number', is_unique_flag])
    .size()
    .unstack(fill_value=0)
    # unstack orders the flag columns False, True; select them explicitly (and create a
    # missing one as 0) so each label is attached to the right data.
    .reindex(columns=[True, False], fill_value=0)
    .rename(columns={True: 'Unique', False: 'Not Unique'})
)

# Create the plot on an explicit Axes; DataFrame.plot without `ax` opens its own figure,
# which would leave an empty one behind.
fig, ax = plt.subplots(figsize=(12, 6))
count_df.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], ax=ax)  # colors follow the column order
ax.set_title('Count of Unique and Not Unique Positions per Move Sequence')
ax.set_xlabel('Move Number')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=0)  # Keep the x-axis labels readable
ax.legend(loc='upper right')

# Annotate bars with the count of unique positions.
# Stacked bars are drawn column by column, so the first len(count_df) patches are the
# 'Unique' segments, one per move number and in row order.
for rect, count in zip(ax.patches[:len(count_df)], count_df['Unique']):
    if count > 0:  # Only annotate non-zero bars
        ax.annotate(
            f'{int(count)}',
            (rect.get_x() + rect.get_width() / 2, rect.get_height()),
            xytext=(0, 3),  # small offset in points, independent of the data scale
            textcoords='offset points',
            ha='center',
            va='bottom',
            color='black',
            fontsize=9,
        )

fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `counts` was used here without being defined anywhere in the visible
# notebook. It is rebuilt below as the number of rows per (move_number, is_unique),
# which is the structure the original indexing expression expects - confirm that this
# matches the intended definition.
is_unique_flag = processed_games_df['count'].eq(1).rename('is_unique')
counts = processed_games_df.groupby(['move_number', is_unique_flag]).size()

# Filter out 'not unique' positions before plotting and keep move_number as the only
# index level, so the x tick labels are plain move numbers.
unique_mask = counts.index.get_level_values('is_unique').to_numpy(dtype=bool)
unique_counts = counts[unique_mask].droplevel('is_unique')

# Plot the data
fig, ax = plt.subplots(figsize=(12, 6))
unique_counts.plot(kind='bar', color='green', ax=ax)
ax.set_title('Count of Unique Positions per Move Number')
ax.set_xlabel('Move Number')
ax.set_ylabel('Count')
ax.legend(['Unique'], title='Position Uniqueness')
fig.tight_layout()

# Annotate the bars with the absolute amount (only for unique positions)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 0:  # Only annotate non-zero bars
        ax.annotate(f'{int(height)}', (x + width/2, y + height/2), ha='center', va='center')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create a column to indicate whether the position is unique (count == 1) or duplicate (count > 1)
processed_games_df['is_unique'] = processed_games_df['count'] == 1

# Group by move_number and calculate the percentage of unique and duplicate positions.
# The flag columns are selected and renamed by VALUE, so the labels stay correct (and the
# cell keeps working) even when only unique or only duplicate positions exist.
percentage_df = (
    processed_games_df
    .groupby('move_number')['is_unique']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=[False, True])
    .fillna(0)
    .mul(100)
    .rename(columns={False: 'Duplicate Positions', True: 'Unique Positions'})
)

# Create the plot on an explicit Axes; DataFrame.plot without `ax` opens its own figure,
# which would leave an empty one behind.
fig, ax = plt.subplots(figsize=(12, 6))
percentage_df.plot(kind='bar', stacked=True, color=['salmon', 'skyblue'], ax=ax)
ax.set_title('Percentage of Unique and Duplicate Positions per Move Sequence')
ax.set_xlabel('Move Number')
ax.set_ylabel('Percentage')
ax.tick_params(axis='x', rotation=0)  # Keep the x-axis labels readable
ax.legend(loc='upper right')

# Annotate bars with the percentage of duplicate positions.
# Stacked bars are drawn column by column, so the first len(percentage_df) patches are
# the duplicate segments, one per move number and in row order: patch i <-> row i.
duplicate_bars = ax.patches[:len(percentage_df)]
for rect, percentage in zip(duplicate_bars, percentage_df['Duplicate Positions']):
    if percentage <= 0:
        continue  # Avoid annotating zero-height bars
    ax.text(rect.get_x() + rect.get_width() / 2.0,
            rect.get_height() / 2,
            f'{percentage:.1f}%',
            ha='center',
            va='center',
            color='black',
            fontsize=9)

fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects processed_games_df to provide the 'move_number', 'batch' and 'count' columns.

# Long format: summed occurrences for every (move_number, batch) pair
counts_per_move_batch = (
    processed_games_df
    .groupby(['move_number', 'batch'])['count']
    .sum()
    .reset_index()
)

# Wide format for plotting: one row per move number, one column per batch,
# with 0 where a batch has no entry for a move number
pivot_df = counts_per_move_batch.pivot(
    index='move_number',
    columns='batch',
    values='count',
).fillna(0)

# Stacked bars: each bar is a move number, each segment a batch
fig, ax = plt.subplots(figsize=(12, 6))
pivot_df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Count of Positions per Move Number by Batch')
ax.set_xlabel('Move Number')
ax.set_ylabel('Count')
ax.legend(title='Batch', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Long format: summed occurrences for every (move_number, batch) pair
counts_per_move_batch = (
    processed_games_df
    .groupby(['move_number', 'batch'])['count']
    .sum()
    .reset_index()
)

# Wide table: rows are move numbers, columns are batches, missing combinations become 0
pivot_df = counts_per_move_batch.pivot(
    index='move_number',
    columns='batch',
    values='count',
).fillna(0)

# Row totals, i.e. all occurrences recorded for a move number across batches
total_counts_per_move = pivot_df.sum(axis=1)

# Share of each batch within its move number, in percent (row-wise division)
percentage_df = pivot_df.div(total_counts_per_move, axis=0) * 100

# Display the percentage DataFrame
print(percentage_df)


In [None]:
import matplotlib.pyplot as plt

# One line per batch: percentage contribution of that batch at every move number
fig, ax = plt.subplots(figsize=(12, 6))
for batch, batch_share in percentage_df.items():
    ax.plot(percentage_df.index, batch_share, marker='o', label=f'Batch {batch}')

ax.set_title('Percentage Contribution of Each Batch per Move Number')
ax.set_xlabel('Move Number')
ax.set_ylabel('Percentage Contribution')
ax.legend(title='Batch', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()


# Compare Player Data with Index

In [None]:


# Build the position table for the player file from its first 2,000 games.
# batch_size equals the game limit, so every processed game is assigned batch 1.
# Note: this rebinds `file_path`, which earlier pointed at the reference file.
file_path = '../../testData/splitted_games/processed_moves_apendra.csv'
processed_games_apendra_df = process_games(
    file_path,
    start_game=0,
    end_game=2000,
    batch_size=2000,
)


In [None]:
# Bind a second name to the same DataFrame object (no copy is made), so in-place
# changes made through either name are visible through both.
new_moves_df = processed_games_apendra_df


In [None]:
# Show the player's position table as this cell's output.
new_moves_df

In [None]:
# Move 'position' into the index of both tables so positions can be matched via the index.
# The guard makes the cell safe to re-run: once 'position' is the index it is no longer a
# column, and calling set_index('position') again would raise a KeyError.
for frame in (processed_games_df, new_moves_df):
    if 'position' in frame.columns:
        frame.set_index('position', inplace=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Precondition: both DataFrames use 'position' as their index.

# Flag every position of the new data that also occurs in the index of processed_games_df
new_moves_df['found'] = new_moves_df.index.isin(processed_games_df.index)

# Share of flagged rows in percent (the mean of a boolean column is the fraction of True
# values); the remainder is the share that was not found
percentage_found = new_moves_df['found'].mean() * 100
percentage_not_found = 100 - percentage_found

# Output results
print(f"Percentage found: {percentage_found:.2f}%")
print(f"Percentage not found: {percentage_not_found:.2f}%")

# Two-bar chart of both shares
data = {'Found': percentage_found, 'Not Found': percentage_not_found}
fig, ax = plt.subplots()
ax.bar(list(data.keys()), list(data.values()), color=['green', 'red'])
ax.set_title('Percentage of New Moves Found in Processed Data')
ax.set_ylabel('Percentage')
plt.show()


In [None]:
# Requires the 'found' and 'move_number' columns on new_moves_df.
# Drop the first-move rows that were not found in the processed data; all other rows stay.
# .copy() makes the result an independent DataFrame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning about writing to a slice.
new_moves_df = new_moves_df[
    ~((new_moves_df['move_number'] == 1) & ~new_moves_df['found'])
].copy()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Requires the 'move_number' and 'found' columns on new_moves_df.

# Percentage of found positions per move number (mean of the boolean flag * 100)
move_found_percentage = new_moves_df.groupby('move_number')['found'].mean() * 100
move_not_found_percentage = 100 - move_found_percentage

# Prepare the data for stacking in the plot
stacked_data = pd.DataFrame({'Found': move_found_percentage, 'Not Found': move_not_found_percentage})

# Plot on an explicit Axes; DataFrame.plot without `ax` opens its own figure, so a
# preceding plt.figure() call would leave an empty figure in the output.
fig, ax = plt.subplots(figsize=(12, 6))
stacked_data.plot(kind='bar', stacked=True, color=['green', 'red'], ax=ax)
ax.set_title('Percentage of Moves Found in Processed Data by Move Number')
ax.set_xlabel('Move Number')
ax.set_ylabel('Percentage')
ax.legend(title='Status')
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Add a column that tells whether the position is unique (count == 1) or duplicate (count > 1).
# assign() returns a new DataFrame, which avoids writing into a filtered slice and keeps
# the cell safe to re-run.
new_moves_df = new_moves_df.assign(is_unique=new_moves_df['count'] == 1)

# Group by move_number and calculate the percentage of unique and duplicate positions.
# The flag columns are selected and renamed by VALUE, so the labels stay correct (and the
# cell keeps working) even when only unique or only duplicate positions exist.
new_percentage_df = (
    new_moves_df
    .groupby('move_number')['is_unique']
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=[False, True])
    .fillna(0)
    .mul(100)
    .rename(columns={False: 'Duplicate Positions', True: 'Unique Positions'})
)

# Create the plot on an explicit Axes; DataFrame.plot without `ax` opens its own figure,
# which would leave an empty one behind.
fig, ax = plt.subplots(figsize=(12, 6))
new_percentage_df.plot(kind='bar', stacked=True, color=['salmon', 'skyblue'], ax=ax)
ax.set_title('Percentage of Unique and Duplicate Positions per Move Sequence')
ax.set_xlabel('Move Number')
ax.set_ylabel('Percentage')
ax.tick_params(axis='x', rotation=0)  # Keep the x-axis labels readable
ax.legend(loc='upper right')

# Annotate bars with the percentage of duplicate positions.
# Stacked bars are drawn column by column, so the first len(new_percentage_df) patches
# are the duplicate segments, one per move number and in row order: patch i <-> row i.
duplicate_bars = ax.patches[:len(new_percentage_df)]
for rect, percentage in zip(duplicate_bars, new_percentage_df['Duplicate Positions']):
    if percentage <= 0:
        continue  # Avoid annotating zero-height bars
    ax.text(rect.get_x() + rect.get_width() / 2.0,
            rect.get_height() / 2,
            f'{percentage:.1f}%',
            ha='center',
            va='center',
            color='black',
            fontsize=9)

fig.tight_layout()
plt.show()


In [None]:
# Show only the rows whose 'found' flag is True.
new_moves_df[new_moves_df['found']]

In [None]:

# Keep only the rows whose 'found' flag is False (the mask is negated), then count per
# move number how many of them are unique (is_unique == True) and how many are not.
# Combinations that do not occur are filled with 0.
count_df = (
    new_moves_df.loc[~new_moves_df['found']]
    .groupby('move_number')['is_unique']
    .value_counts()
    .unstack()
    .fillna(0)
)
count_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot the number of duplicate and unique positions per move number from `count_df`
# (computed in the previous cell, with the boolean 'is_unique' values as columns).
# The columns are renamed by VALUE and a missing class is added as 0, so the labels are
# always attached to the right data; running the cell again leaves the frame unchanged.
count_labels = {False: 'Duplicate Positions', True: 'Unique Positions'}
count_df = (
    count_df
    .rename(columns=count_labels)
    .reindex(columns=list(count_labels.values()), fill_value=0)
)

# Create the plot on an explicit Axes; DataFrame.plot without `ax` opens its own figure,
# which would leave an empty one behind.
fig, ax = plt.subplots(figsize=(12, 6))
count_df.plot(kind='bar', stacked=True, color=['salmon', 'skyblue'], ax=ax)
ax.set_title('Count of Unique and Duplicate Positions per Move Sequence')
ax.set_xlabel('Move Number')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=0)  # Keep the x-axis labels readable
ax.legend(loc='upper right')

# Annotate bars with the count of duplicate positions.
# Stacked bars are drawn column by column, so the first len(count_df) patches are the
# duplicate segments, one per move number and in row order: patch i <-> row i.
duplicate_bars = ax.patches[:len(count_df)]
for rect, count in zip(duplicate_bars, count_df['Duplicate Positions']):
    if count <= 0:
        continue  # Avoid annotating zero-height bars
    ax.annotate(
        f'{int(count)}',
        (rect.get_x() + rect.get_width() / 2.0, rect.get_height()),
        xytext=(0, 3),  # small offset in points, independent of the data scale
        textcoords='offset points',
        ha='center',
        va='bottom',
        color='black',
        fontsize=9,
    )

fig.tight_layout()
plt.show()
