Place this notebook in the data/ folder of chess_llm_interpretability to run various uniqueness checks on the datasets.

In [None]:
import pandas as pd

# Function to process chunks of the CSV file
def process_chunks(chunk_iter):
    """
    Gather the distinct 'transcript' values found across every chunk.

    Parameters:
    - chunk_iter: Iterable of chunks; each chunk is indexed with ['transcript']
      and the result is iterated value by value.

    Returns:
    - A set holding each transcript value exactly once (empty if there are no chunks).
    """
    # Chunks are consumed lazily, one at a time, so only the growing set is kept around
    return {
        transcript
        for chunk in chunk_iter
        for transcript in chunk['transcript']
    }

# Location of the large Lichess CSV dataset
file_path = "lichess_6gb.csv"

# Rows per chunk; tune this to trade memory use against read overhead
chunk_size = 100_000

# Read only the 'transcript' column, chunk by chunk, instead of one big DataFrame
chunk_iter = pd.read_csv(
    file_path,
    usecols=["transcript"],
    chunksize=chunk_size,
)

# Collapse all chunks into a single set of distinct transcripts
transcripts_set = process_chunks(chunk_iter)

print(f"Total unique transcripts: {len(transcripts_set)}")


In [None]:
testing_file_path = "lichess_100mb.csv"
df = pd.read_csv(testing_file_path, usecols=["transcript"])
print(f"Total rows in the DataFrame: {len(df)}")

# Number of rows whose transcript was already present in transcripts_set
overlap_count = 0

# Walk the test transcripts. Unseen ones are merged into transcripts_set, so a
# transcript repeated inside this file also counts as an overlap from its
# second occurrence onwards.
for transcript in df["transcript"]:
    if transcript not in transcripts_set:
        transcripts_set.add(transcript)
        continue
    overlap_count += 1

print(f"Total unique transcripts now: {len(transcripts_set)}")
print(f"Number of overlaps found: {overlap_count}")

In [None]:
import csv

# Running totals over all data rows of the CSV
total_rows = 0
total_characters = 0

# Specify the path to your CSV file
file_path = 'lichess_6gb.csv'

# The csv module expects files opened with newline='' so that it, not the text
# layer, decides how line endings inside quoted fields are handled.
with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # Skip the header; the default keeps an empty file from raising StopIteration
    next(reader, None)
    for row in reader:
        # csv.reader yields [] for blank lines; row[-1] would raise IndexError on them
        if not row:
            continue
        total_rows += 1
        # Assuming transcript is the last column
        transcript = row[-1]
        total_characters += len(transcript)

print(f"Total number of rows: {total_rows}")
print(f"Total number of characters in transcripts: {total_characters}")


In [None]:
import pandas as pd

file_path = 'lichess_6gb.csv'
# Only the 'transcript' column is read from this DataFrame by the uniqueness
# check, so skip the other columns of this large file to keep memory use down.
lichess_df = pd.read_csv(file_path, usecols=['transcript'])

In [None]:
llm_file_path = '8layer_llm_games.csv'
llm_df = pd.read_csv(llm_file_path)
# Count the games in the LLM file itself, before truncation. The previous code
# measured `df`, which is the Lichess test DataFrame from an unrelated cell
# (and a NameError if that cell has not been run).
total_games = len(llm_df)
# Keep only the first 100 games for the uniqueness comparison
llm_df = llm_df[:100]
print(f"Total number of games: {total_games}")
print(f"Total rows in the LLM DataFrame: {len(llm_df)}")

In [None]:
import re

# Matches a move number followed by a space ("12. ") so the space can be dropped
_MOVE_NUMBER_GAP = re.compile(r"(\d+\.) ")


def _leading_tokens(transcript: str, count: int) -> str:
    """Return the first `count` space-separated tokens of `transcript`, joined by single spaces."""
    return " ".join(transcript.split(' ', count)[:count])


def unique_transcripts_by_move_df(move_number: int, lichess_df: pd.DataFrame, llm_df: pd.DataFrame) -> int:
    """
    Find how many games in `llm_df` are unique by move number compared to games in `lichess_df`.

    Two games are treated as equal when their first `move_number` space-separated
    tokens match. LLM transcripts are normalised first: the section after the
    first blank line is taken as the move text (the whole transcript is used when
    there is no blank line), and the space after each move number is removed
    ("1. e4" -> "1.e4").

    NOTE(review): if transcripts look like "1.e4 e5 2.Nf3 ...", each token is a
    single ply, so `move_number` would count half-moves rather than full moves - confirm.

    Parameters:
    - move_number: The number of leading tokens to compare uniqueness by.
    - lichess_df: DataFrame containing the Lichess game transcripts in a 'transcript' column.
    - llm_df: DataFrame containing the LLM game transcripts in a 'transcript' column.

    Returns:
    - The number of rows in `llm_df` whose truncated transcript does not occur
      among the truncated Lichess transcripts.
    """
    # Build the lookup set once so each LLM game needs a single membership test
    lichess_prefixes = {
        _leading_tokens(transcript, move_number)
        for transcript in lichess_df['transcript']
    }

    unique_count = 0
    for transcript in llm_df['transcript']:
        sections = transcript.split("\n\n")
        # Section 1 follows the header block; fall back to the whole text when
        # there is no header instead of raising IndexError
        movetext = (sections[1] if len(sections) > 1 else sections[0]).strip()
        movetext = _MOVE_NUMBER_GAP.sub(r"\1", movetext)
        if _leading_tokens(movetext, move_number) not in lichess_prefixes:
            unique_count += 1

    return unique_count


In [None]:
# Compare games on their first 20 space-separated tokens
move_number = 20
unique_games_count = unique_transcripts_by_move_df(
    move_number=move_number,
    lichess_df=lichess_df,
    llm_df=llm_df,
)
print(f"Unique games by move {move_number}: {unique_games_count}")