<h3><strong>Import Libraries</strong></h3>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Change Pandas Display Options: no row/column truncation and a wider text output
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

<h3><strong>DO NOT RUN UNLESS YOU WANT A NEW SAMPLE</strong></h3>

In [None]:
# # Create a sample size we want
# sample_fraction = 500000 / 16000000

# # Read a sample of the csv
# df_sample = pd.read_csv('battles.csv', skiprows=lambda i: i > 0 and random.random() > sample_fraction)

# # Create new csv for sample
# df_sample.to_csv('battles_sample_500k.csv', index=False)

<h3><strong>Read and Verify CSV</strong></h3>

In [None]:
# Read CSV (the sample file written by the sampling cell)
sample_csv_path = 'battles_sample_500k.csv'
df = pd.read_csv(sample_csv_path)

# Print first five lines of CSV
first_rows = df.head(5)
print(first_rows)

<h3><strong>Data Cleaning & Initial Inspection</strong></h3>

In [None]:
# Shape of Dataframe as (rows, columns)
row_count, column_count = df.shape
print((row_count, column_count))

In [None]:
# All Columns in Dataframe, as a plain Python list
cols = list(df.columns)
print(cols)

In [None]:
# Print Numerical Columns ('number' selects the same dtypes as np.number)
numerical_cols = df.select_dtypes(include='number').columns
print(numerical_cols)

In [None]:
# Print Data Types of Each Column
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Null Values: number of missing entries per column
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Print Number of Unique Values per column
unique_counts = df.nunique()
print(unique_counts)

<h5>Handling Null Values & Dropping Uninfluential Columns</h5>

In [None]:
# Dropping Columns (columns with not much meaning/influence)
columns_to_drop = [
    'Unnamed: 0',
    'winner.tag',
    'winner.clan.tag',
    'winner.clan.badgeId',
    'loser.tag',
    'loser.clan.tag',
    'loser.clan.badgeId',
    'tournamentTag',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Resolving Null Values to 0
df['winner.princessTowersHitPoints'] = df['winner.princessTowersHitPoints'].fillna(0)
df['loser.kingTowerHitPoints'] = df['loser.kingTowerHitPoints'].fillna(0)
df['loser.princessTowersHitPoints'] = df['loser.princessTowersHitPoints'].fillna(0)

In [None]:
# Reason we are able to resolve null values to 0

# Only appears when loser crowns is 2 which means that both winner and loser princess towers are destroyed (0 hitpoints)
print('Winner Princess Tower Hit Points to Loser Crowns:')
print(df.groupby(df['winner.princessTowersHitPoints'].isna())['loser.crowns'].describe())

# Null values appear when winner crowns is 3, which means that the king tower was destroyed (0 hit points)
print('\nLoser King Tower Hit Points to Winner Crowns:')
print(df.groupby(df['loser.kingTowerHitPoints'].isna())['winner.crowns'].describe())

# # Null values appear only when both princess towers are destroyed (0 hit points)
print('\nLoser Princess Tower Hit Points to Winner Crowns:')
print(df.groupby(df['loser.princessTowersHitPoints'].isna())['winner.crowns'].describe())

In [None]:
# Check for null values again (after the fill step)
remaining_nulls = df.isna().sum()
print(remaining_nulls)

<h3><strong>Exploratory Data Analysis</strong></h3>

In [None]:
# Summary Statistics of the numeric columns
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Correlation Heatmap (pairwise correlation between the numeric columns)
# NOTE(review): if the card ID columns are still numeric when this cell runs,
# their correlations are not meaningful because the IDs are labels, not quantities.
co_mtx = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(co_mtx, cmap='coolwarm', fmt='.2f', annot=True, ax=ax)
plt.show()

In [None]:
# Map out card ID's to actual card names

# Lookup table: numeric card ID -> card name
card_map = {
    26000000: "Knight", 26000001: "Archers", 26000002: "Goblins", 26000003: "Giant",
    26000004: "P.E.K.K.A", 26000005: "Minions", 26000006: "Balloon", 26000007: "Witch",
    26000008: "Barbarians", 26000009: "Golem", 26000010: "Skeletons", 26000011: "Valkyrie",
    26000012: "Skeleton Army", 26000013: "Bomber", 26000014: "Musketeer", 26000015: "Baby Dragon",
    26000016: "Prince", 26000017: "Wizard", 26000018: "Mini P.E.K.K.A", 26000019: "Spear Goblins",
    26000020: "Giant Skeleton", 26000021: "Hog Rider", 26000022: "Minion Horde", 26000023: "Ice Wizard",
    26000024: "Royal Giant", 26000025: "Guards", 26000026: "Princess", 26000027: "Dark Prince",
    26000028: "Three Musketeers", 26000029: "Lava Hound", 26000030: "Ice Spirit", 26000031: "Fire Spirits",
    26000032: "Miner", 26000033: "Sparky", 26000034: "Bowler", 26000035: "Lumberjack", 26000036: "Battle Ram",
    26000037: "Inferno Dragon", 26000038: "Ice Golem", 26000039: "Mega Minion", 26000040: "Dart Goblin",
    26000041: "Goblin Gang", 26000042: "Electro Wizard", 26000043: "Elite Barbarians", 26000044: "Hunter",
    26000045: "Executioner", 26000046: "Bandit", 26000047: "Royal Recruits", 26000048: "Night Witch",
    26000049: "Bats", 26000050: "Royal Ghost", 26000051: "Ram Rider", 26000052: "Zappies",
    26000053: "Rascals", 26000054: "Cannon Cart", 26000055: "Mega Knight", 26000056: "Skeleton Barrel",
    26000057: "Flying Machine", 26000058: "Wall Breakers", 26000059: "Royal Hogs", 26000060: "Goblin Giant",
    26000061: "Fisherman", 26000062: "Magic Archer", 26000063: "Electro Dragon", 26000064: "Firecracker",
    26000067: "Elixir Golem", 26000068: "Battle Healer", 26000080: "Skeleton Dragons", 26000083: "Mother Witch",
    26000084: "Electro Spirit", 26000085: "Electro Giant", 27000000: "Cannon", 27000001: "Goblin Hut",
    27000002: "Mortar", 27000003: "Inferno Tower", 27000004: "Bomb Tower", 27000005: "Barbarian Hut",
    27000006: "Tesla", 27000007: "Elixir Collector", 27000008: "X-Bow", 27000009: "Tombstone",
    27000010: "Furnace", 27000012: "Goblin Cage", 28000000: "Fireball", 28000001: "Arrows",
    28000002: "Rage", 28000003: "Rocket", 28000004: "Goblin Barrel", 28000005: "Freeze",
    28000006: "Mirror", 28000007: "Lightning", 28000008: "Zap", 28000009: "Poison",
    28000010: "Graveyard", 28000011: "The Log", 28000012: "Tornado", 28000013: "Clone",
    28000014: "Earthquake", 28000015: "Barbarian Barrel", 28000016: "Heal Spirit",
    28000017: "Giant Snowball", 28000018: "Royal Delivery"
}

# The sixteen card-ID columns: winner slots 1-8 followed by loser slots 1-8
card_cols = [f'{side}.card{slot}.id' for side in ('winner', 'loser') for slot in range(1, 9)]

for col in card_cols:
    # Only translate columns that still hold numeric IDs. Without this guard,
    # re-running the cell would look up card NAMES in an ID-keyed dict and
    # replace every value with NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        # IDs that are missing from card_map become NaN
        df[col] = df[col].map(card_map)

# Check first few replacements
print(df[card_cols].head())


In [None]:
# Bar Plot of Cards used the most
# Pool every card slot of both players into one flat array and count occurrences
# (value_counts sorts by frequency, most used first, and skips NaN)
all_cards = df[card_cols].to_numpy().ravel()
card_counts = pd.Series(all_cards).value_counts()

fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(x=card_counts.index, y=card_counts.values, ax=ax)
ax.set_title("Frequency of Cards Used in All Matches")
# The card columns hold card names at this point (IDs were mapped to names), so label accordingly
ax.set_xlabel("Card")
ax.set_ylabel("Usage Count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Show the ten most used cards (card_counts is sorted by frequency, descending)
top10 = card_counts.head(10)

fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(x=top10.index, y=top10.values, ax=ax)
# Distinct title so this figure is not confused with the all-cards plot
ax.set_title("Top 10 Most Used Cards in All Matches")
ax.set_xlabel("Card")
ax.set_ylabel("Usage Count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Box plot of card levels
# Level columns for BOTH players: winner slots 1-8, then loser slots 1-8.
# (Previously the winner columns were listed twice and the loser levels were never plotted.)
card_levels = [f'{side}.card{i}.level' for side in ('winner', 'loser') for i in range(1, 9)]

plt.figure(figsize=(14, 10))
sns.boxplot(data=df[card_levels])
plt.title('Distribution of Card Levels')
plt.ylabel('Card Level')
# Sixteen long column names overlap when drawn horizontally
plt.xticks(rotation=90)
plt.show()

In [None]:
# Box plot of Winner Elixir Average vs Loser Elixir Average
elixir_cols = ['winner.elixir.average', 'loser.elixir.average']

fig, ax = plt.subplots(figsize=(14, 10))
sns.boxplot(data=df[elixir_cols], ax=ax)
ax.set_title("Elixir Average Distribution (Winner vs Loser)")
ax.set_ylabel("Elixir Average")
plt.show()

# Author's reading of the plot: elixir average alone doesn't determine match outcomes

<h3><strong>Data Prep for Model Building</strong></h3>

In [None]:
# Columns still available after cleaning, for reference while selecting features
available_columns = df.columns
print(available_columns)

<h5>Creating a new dataframe in which winning and losing decks are stored in separate rows, with a target column indicating whether the deck won (1) or lost (0)</h5>

In [None]:
# Create 2 dataframes, one with winning decks, another with losing decks and their outcomes

def _deck_columns(side):
    """Return the deck columns for one side ('winner' or 'loser'), in dataframe order.

    First the id/level pair of each of the eight card slots, then the deck summary columns.
    """
    per_card = [f'{side}.card{slot}.{field}' for slot in range(1, 9) for field in ('id', 'level')]
    summary_stats = (
        'totalcard.level', 'troop.count', 'structure.count', 'spell.count',
        'common.count', 'rare.count', 'epic.count', 'legendary.count',
        'elixir.average',
    )
    return per_card + [f'{side}.{stat}' for stat in summary_stats]


# Winning decks are labelled 1 ...
win_df = df[_deck_columns('winner')].copy()
win_df['target'] = 1

# ... and losing decks are labelled 0
lose_df = df[_deck_columns('loser')].copy()
lose_df['target'] = 0


In [None]:
# Transform win_df so that each card in the dataset gets its own column, holding
# the card's level if the card is in that deck and 0 otherwise

win_card_cols = [f'winner.card{i}.id' for i in range(1, 9)]
win_card_levels = [f'winner.card{i}.level' for i in range(1, 9)]

# Long format: one row per (deck, card slot). Row-major flattening keeps the
# eight slots of a deck adjacent, matching the repeated deck index.
win_cards_long = pd.DataFrame({
    'deck': win_df.index.repeat(len(win_card_cols)),
    'card_id': win_df[win_card_cols].to_numpy().ravel(),
    'card_level': win_df[win_card_levels].to_numpy().ravel(),
})

# Wide format: one row per deck, one column per card.
# pivot_table drops rows whose card_id is NaN, so a deck with no recognised card
# would vanish from the result; reindexing on the deck index keeps one row per
# deck so the positional concat below stays aligned.
win_cards_long = win_cards_long.pivot_table(
    index='deck',
    columns='card_id',
    values='card_level',
    fill_value=0
).reindex(win_df.index, fill_value=0)

# Deck-level summary features, renamed to side-neutral names
# (e.g. 'winner.troop.count' -> 'troop_count')
win_stat_cols = ['winner.totalcard.level', 'winner.troop.count', 'winner.structure.count',
                 'winner.spell.count', 'winner.common.count', 'winner.rare.count',
                 'winner.epic.count', 'winner.legendary.count', 'winner.elixir.average']

win_numeric_cols = win_df[win_stat_cols].reset_index(drop=True)
win_numeric_cols = win_numeric_cols.rename(
    columns={col: col[len('winner.'):].replace('.', '_') for col in win_stat_cols}
)

final_win_df = pd.concat([win_numeric_cols, win_cards_long.reset_index(drop=True)], axis=1)
final_win_df['target'] = win_df['target'].values

# Every deck must still be represented by exactly one row
assert len(final_win_df) == len(win_df), "row count changed while reshaping winner decks"

print(final_win_df.head())
print(final_win_df.shape)


In [None]:
# Transform lose_df so that each card in the dataset gets its own column, holding
# the card's level if the card is in that deck and 0 otherwise

lose_card_cols = [f'loser.card{i}.id' for i in range(1, 9)]
lose_card_levels = [f'loser.card{i}.level' for i in range(1, 9)]

# Long format: one row per (deck, card slot). Row-major flattening keeps the
# eight slots of a deck adjacent, matching the repeated deck index.
lose_cards_long = pd.DataFrame({
    'deck': lose_df.index.repeat(len(lose_card_cols)),
    'card_id': lose_df[lose_card_cols].to_numpy().ravel(),
    'card_level': lose_df[lose_card_levels].to_numpy().ravel(),
})

# Wide format: one row per deck, one column per card.
# pivot_table drops rows whose card_id is NaN, so a deck with no recognised card
# would vanish from the result; reindexing on the deck index keeps one row per
# deck so the positional concat below stays aligned.
lose_cards_long = lose_cards_long.pivot_table(
    index='deck',
    columns='card_id',
    values='card_level',
    fill_value=0
).reindex(lose_df.index, fill_value=0)

# Deck-level summary features, renamed to side-neutral names
# (e.g. 'loser.troop.count' -> 'troop_count')
lose_stat_cols = ['loser.totalcard.level', 'loser.troop.count', 'loser.structure.count',
                  'loser.spell.count', 'loser.common.count', 'loser.rare.count',
                  'loser.epic.count', 'loser.legendary.count', 'loser.elixir.average']

lose_numeric_cols = lose_df[lose_stat_cols].reset_index(drop=True)
lose_numeric_cols = lose_numeric_cols.rename(
    columns={col: col[len('loser.'):].replace('.', '_') for col in lose_stat_cols}
)

final_lose_df = pd.concat([lose_numeric_cols, lose_cards_long.reset_index(drop=True)], axis=1)
final_lose_df['target'] = lose_df['target'].values

# Every deck must still be represented by exactly one row
assert len(final_lose_df) == len(lose_df), "row count changed while reshaping loser decks"

print(final_lose_df.head())
print(final_lose_df.shape)

In [None]:
# Combine both lose_df and win_df

# A card that appears only in winning decks (or only in losing decks) has no
# column in the other frame, so concat would leave NaN there. Those decks simply
# do not contain the card, which this encoding represents as level 0.
one_sided_cols = final_lose_df.columns.symmetric_difference(final_win_df.columns, sort=False)

X1 = pd.concat([final_lose_df, final_win_df], axis=0).reset_index(drop=True)
if len(one_sided_cols) > 0:
    X1[one_sided_cols] = X1[one_sided_cols].fillna(0)

# Split the label off the feature matrix
y1 = X1.pop('target')

print(X1.head())
print(X1.shape)

# Garbage collect unused dataframes to save memory
del lose_df, lose_cards_long, lose_numeric_cols, final_lose_df, win_df, win_cards_long, win_numeric_cols, final_win_df
gc.collect()

<h5>Scale First Dataset</h5>

In [None]:
# Scale Data (standardise every feature column)
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the scaling; consider fitting on the
# training split only.
# NOTE(review): X1 is overwritten in place, so re-running this cell refits the
# scaler on already-scaled data.
scaler = StandardScaler()
cols = X1.columns
X1 = pd.DataFrame(scaler.fit_transform(X1), columns=cols)

print(X1.head())
print(X1.shape)


<h5>Creating a new dataframe in which the winning and losing decks from each match are stored on the same row, with a target column indicating which deck won (0 or 1)</h5>

<h5>Scale Second Dataset</h5>

<h5>Train Test Split</h5>

In [None]:
# Perform an 80/20 split on the first dataset (80% train, 20% test),
# shuffled with a fixed seed and stratified on the target

split_options = dict(test_size=0.2, shuffle=True, random_state=42, stratify=y1)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **split_options)


In [None]:
# Perform an 80/20 split on the second dataset (80% train, 20% test)

# X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, shuffle=True, random_state=42)

<h3><strong>Model Building</strong></h3>

<h5>Predicting overall win rate of one deck (Logistic Regression)</h5>

In [None]:
# Train Model (fit returns the fitted estimator itself)
lr_model = LogisticRegression(n_jobs=-1, random_state=42).fit(X1_train, y1_train)

# Mean accuracy on the held-out test split
score = lr_model.score(X1_test, y1_test)
print(score)

<h5>Predicting overall win rate of deck A vs deck B (Random Forest)</h5>

<h3><strong>User Functions to Predict Win Rate</strong></h3>

<h5>Predict Win Rate of a Deck</h5>

In [None]:
def predictWinRateDeck(deck_cards, deck_levels, model=lr_model, scaler=scaler, feature_columns=cols, deck_stats=None):
    """Estimate the probability that a single deck wins.

    Parameters
    ----------
    deck_cards : iterable of str
        Card names; each should match a card column in ``feature_columns``.
    deck_levels : iterable of numbers
        Level of each card, in the same order as ``deck_cards``.
    model : fitted classifier
        Must expose ``predict_proba``; column 1 is taken as the win probability.
    scaler : fitted scaler
        Must expose ``transform``. If it also exposes ``mean_`` (as
        ``StandardScaler`` does), those means are used as neutral defaults.
    feature_columns : sequence of labels
        Column labels, in the order the scaler and model were fitted on.
    deck_stats : dict, optional
        Explicit values for deck-summary features, e.g.
        ``{'elixir_average': 3.5, 'troop_count': 5}``. Summary features not
        listed here default to the training mean, which scales to 0, instead
        of a raw 0, which would scale to an extreme value.

    Returns
    -------
    float
        Predicted win probability, rounded to four decimals.

    Raises
    ------
    ValueError
        If ``deck_cards`` and ``deck_levels`` differ in length, or
        ``deck_stats`` names a feature that is not in ``feature_columns``.
    """
    import warnings

    deck_cards = list(deck_cards)
    deck_levels = list(deck_levels)
    if len(deck_cards) != len(deck_levels):
        # zip() would silently truncate to the shorter input
        raise ValueError(
            f'deck_cards has {len(deck_cards)} entries but deck_levels has {len(deck_levels)}'
        )

    feature_columns = pd.Index(feature_columns)

    # One all-zero float row: a card that is not in the deck has level 0
    deck_df = pd.DataFrame(0.0, index=[0], columns=feature_columns)

    # Deck-summary features produced by the data-prep cells
    summary_features = {
        'totalcard_level', 'troop_count', 'structure_count', 'spell_count',
        'common_count', 'rare_count', 'epic_count', 'legendary_count',
        'elixir_average',
    }

    # Neutral default for summary features: the mean seen during fitting
    fitted_means = getattr(scaler, 'mean_', None)
    if fitted_means is not None and len(fitted_means) == len(feature_columns):
        for feature_name, mean_value in zip(feature_columns, fitted_means):
            if feature_name in summary_features:
                deck_df.loc[0, feature_name] = mean_value

    # Caller-supplied summary values override the defaults
    for feature_name, value in (deck_stats or {}).items():
        if feature_name not in feature_columns:
            raise ValueError(f'Unknown feature in deck_stats: {feature_name!r}')
        deck_df.loc[0, feature_name] = value

    unknown_cards = []
    for card, level in zip(deck_cards, deck_levels):
        if card in deck_df.columns:
            deck_df.loc[0, card] = level
        else:
            unknown_cards.append(card)

    if unknown_cards:
        # Previously ignored without notice; the prediction is still returned
        warnings.warn(f'Cards not known to the model were ignored: {unknown_cards}')

    scaled_df = pd.DataFrame(scaler.transform(deck_df), columns=feature_columns)

    win_rate = model.predict_proba(scaled_df)[0][1]
    return round(float(win_rate), 4)

In [None]:
# Example deck: card name -> card level
example_deck = {
    'Archers': 13,
    'Baby Dragon': 13,
    'Fireball': 13,
    'Knight': 13,
    'Ice Spirit': 13,
    'Musketeer': 13,
    'Goblin Gang': 13,
    'Tesla': 13,
}
cards = list(example_deck.keys())
levels = list(example_deck.values())

win_rate = predictWinRateDeck(cards, levels)
print(f'Predicted win probability: {win_rate:.2%}')