In [1]:
import pandas as pd
import pyarrow.parquet as pq
from collections import Counter
import ast
import glob


In [2]:
# Gather every parquet chunk in the working directory, ordered by file name
parquet_files = glob.glob("*.parquet")  # adjust path
parquet_files.sort()


In [3]:
# Card statistics: flat lists that the processing loop appends to
all_winner_cards, all_cards = [], []

# Deck statistics: win / loss tallies keyed by deck
deck_wins, deck_losses = Counter(), Counter()


In [None]:
# Process every parquet chunk and accumulate card- and deck-level tallies.
#
# NOTE: this cell appends to all_winner_cards / all_cards / deck_wins /
# deck_losses created in the previous cell. Re-run that cell before re-running
# this one, otherwise every battle is counted twice.

# Bookkeeping columns that play no part in the statistics below
unused_cols = [
    'Unnamed: 0', 'battleTime', 'gameMode.id',
    'winner.clan.tag', 'winner.clan.badgeId',
    'loser.clan.tag', 'loser.clan.badgeId', 'tournamentTag',
]

# Tower hit-point columns whose nulls are replaced with 0
tower_cols = [
    'winner.princessTowersHitPoints',
    'loser.kingTowerHitPoints',
    'loser.princessTowersHitPoints',
]

for file in parquet_files:
    print(f"Processing {file}...")

    # errors='ignore': a chunk that lacks one of the throw-away columns
    # (e.g. the 'Unnamed: 0' index artefact) must not abort the whole run.
    # Duplicates are removed while the card lists are still in their raw
    # form; parsed Python lists are unhashable and cannot be de-duplicated.
    df = (
        pd.read_parquet(file)
        .drop(columns=unused_cols, errors='ignore')
        .drop_duplicates()
    )

    # -----------------------------
    # Clean tower HP nulls
    # -----------------------------
    df[tower_cols] = df[tower_cols].fillna(0)

    # -----------------------------
    # Ensure cards list is evaluated
    # -----------------------------
    # ast.literal_eval expects the card lists to be stored as text
    df['winner.cards.list'] = df['winner.cards.list'].apply(ast.literal_eval)
    df['loser.cards.list'] = df['loser.cards.list'].apply(ast.literal_eval)

    # -----------------------------
    # Flatten cards for card impact/frequency
    # -----------------------------
    winner_cards = [card for deck in df['winner.cards.list'] for card in deck]
    loser_cards = [card for deck in df['loser.cards.list'] for card in deck]

    all_winner_cards.extend(winner_cards)  # For card impact (wins / total)
    # For total frequency (all cards): winners first, then losers, without
    # building a temporary concatenated list
    all_cards.extend(winner_cards)
    all_cards.extend(loser_cards)

    # -----------------------------
    # Deck-level statistics
    # -----------------------------
    # Sorting makes the deck key independent of the order the cards are listed in
    df['winner_deck'] = df['winner.cards.list'].apply(lambda x: tuple(sorted(x)))
    df['loser_deck'] = df['loser.cards.list'].apply(lambda x: tuple(sorted(x)))

    deck_wins.update(df['winner_deck'])
    deck_losses.update(df['loser_deck'])


Processing clash_chunk_0.parquet...


In [None]:
import pickle

# Assemble the payload BEFORE opening the file. Opening with "wb" truncates an
# existing clash_full_data.pkl immediately, so if any name below is undefined
# the NameError must be raised first, leaving the previous file intact.
# deck_stats, card_impact_df and freq_df are created by cells further down in
# this notebook; run those before this cell.
payload = {
    # Raw processed data
    "all_cards": all_cards,
    "all_winner_cards": all_winner_cards,
    "deck_wins": deck_wins,
    "deck_losses": deck_losses,
    # Computed stats
    "deck_stats": deck_stats,
    "card_impact_df": card_impact_df,
    "freq_df": freq_df,
}

with open("clash_full_data.pkl", "wb") as f:
    pickle.dump(payload, f)

print("All data saved to clash_full_data.pkl")


In [None]:
import pickle

# Read the saved snapshot back in.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open("clash_full_data.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

# Raw processed data
all_cards = data["all_cards"]
all_winner_cards = data["all_winner_cards"]
deck_wins = data["deck_wins"]
deck_losses = data["deck_losses"]

# Computed stats
deck_stats = data["deck_stats"]
card_impact_df = data["card_impact_df"]
freq_df = data["freq_df"]

print("Data loaded successfully!")


In [None]:
# Show only a small sample; echoing the whole flattened list would flood the
# notebook output.
all_winner_cards[:10]

In [None]:
from collections import Counter
import pandas as pd

# Tally each card once for winning decks and once for all decks
winner_counts = Counter(all_winner_cards)
total_counts = Counter(all_cards)

# Impact = a card's winning appearances divided by all of its appearances
card_impact = {
    card: winner_counts[card] / appearances
    for card, appearances in total_counts.items()
}

# One row per card, highest impact first
card_impact_df = pd.DataFrame.from_dict(card_impact, orient='index', columns=['impact'])
card_impact_df = card_impact_df.sort_values('impact', ascending=False)

print("Top 10 cards by impact score:")
print(card_impact_df.head(10))

print("Bottom 10 cards by impact score:")
print(card_impact_df.tail(10))


In [None]:
# Number of appearances of every card in all_cards, most used first
card_frequency = Counter(all_cards)
freq_df = pd.DataFrame.from_dict(card_frequency, orient='index', columns=['count'])
freq_df = freq_df.sort_values('count', ascending=False)

print("Top 10 most used cards:")
print(freq_df.head(10))


In [None]:
# One row per distinct deck. A deck that appears in only one of the two
# counters gets NaN in the other column after alignment, which becomes 0.
deck_stats = (
    pd.DataFrame({'wins': pd.Series(deck_wins), 'losses': pd.Series(deck_losses)})
    .fillna(0)
    .assign(total=lambda d: d['wins'] + d['losses'])
    .assign(win_rate=lambda d: d['wins'] / d['total'])
)

# Rank only decks with at least 50 recorded matches
frequent_decks = deck_stats[deck_stats['total'] >= 50]

print("Top 10 decks by win rate (min 50 matches):")
print(frequent_decks.sort_values('win_rate', ascending=False).head(10))

# Note: some rows appear to list only 7 card IDs. That is because their first ID is identical to the one in the row above, so it is not repeated in the printed output.

In [None]:
# Count number of cards per deck.
# deck_stats is built from Series keyed by deck tuples, which pandas turns into
# a MultiIndex; tuples shorter than the longest one are padded with NaN. len()
# of an index entry would therefore be the same for every row, so count only
# the non-null entries.
deck_stats['deck_size'] = deck_stats.index.map(
    lambda deck: sum(pd.notna(card) for card in deck)
)

# See how many have less than 8 cards
invalid_decks = deck_stats[deck_stats['deck_size'] < 8]

print("Decks with fewer than 8 cards:")
print(invalid_decks.head(10))
print(f"Total incomplete decks: {len(invalid_decks)}")