In [3]:
import json
import numpy as np
import os
from pathlib import Path
import pandas as pd
from ast import literal_eval
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_extraction import FeatureHasher


In [14]:

import sc2reader
from sc2reader.engine.plugins import APMTracker, ContextLoader, SelectionTracker
from sc2reader.events import PlayerStatsEvent, UnitBornEvent, UnitDiedEvent, UnitDoneEvent, UnitTypeChangeEvent, UpgradeCompleteEvent


In [15]:
# Replay file to inspect.
replay_file = "test.SC2Replay"

# Assemble the game engine with its plugins first, then parse the replay with it.
engine_plugins = [ContextLoader(), APMTracker(), SelectionTracker()]
replay_engine = sc2reader.engine.GameEngine(plugins=engine_plugins)
replay = sc2reader.load_replay(replay_file, engine=replay_engine)

print("Replay successfully loaded.")

Replay successfully loaded.


In [16]:
# Print when the replay was played, the map name, and one result line per player.
date_line = "Date: %s" % replay.date
print(date_line)
map_line = "Map Name: " + replay.map_name
print(map_line)
for player in replay.players:
    result_line = "%s: %s" % (player.result, player)
    print(result_line)

Date: 2024-04-06 22:35:51
Map Name: Crimson Court LE
Loss: Player 1 - Kenobi (Terran)
Win: Player 2 - Jackal (Zerg)


In [17]:
# Establish some unit and building groups

# Named groups of unit / building type names used to categorise replay events.

# Gas-harvesting buildings.
VESPENE_UNITS = ["Assimilator", "Extractor", "Refinery"]

# Supply providers.
SUPPLY_UNITS = ["Overlord", "Overseer", "Pylon", "SupplyDepot"]

WORKER_UNITS = ["Drone", "Probe", "SCV", "MULE"]

BASE_UNITS = ["CommandCenter", "Nexus", "Hatchery", "Lair", "Hive", "PlanetaryFortress", "OrbitalCommand"]

# NOTE(review): "RoboticsFacility" is listed in both GROUND_UNITS and AIR_UNITS — confirm intended.
GROUND_UNITS = ["Barracks", "Factory", "GhostAcademy", "Armory", "RoboticsBay", "RoboticsFacility", "TemplarArchive",
                "DarkShrine", "WarpGate", "SpawningPool", "RoachWarren", "HydraliskDen", "BanelingNest", "UltraliskCavern",
                "LurkerDen", "InfestationPit"]

AIR_UNITS = ["Starport", "FusionCore", "RoboticsFacility", "Stargate", "FleetBeacon", "Spire", "GreaterSpire"]

TECH_UNITS = ["EngineeringBay", "Armory", "GhostAcademy", "TechLab", "FusionCore", "Forge", "CyberneticsCore",
              "TwilightCouncil", "RoboticsFacility", "RoboticsBay", "FleetBeacon", "TemplarArchive", "DarkShrine",
              "SpawningPool", "RoachWarren", "HydraliskDen", "BanelingNest", "UltraliskCavern", "LurkerDen", "Spire",
              "GreaterSpire", "EvolutionChamber", "InfestationPit"]

ARMY_UNITS = ["Marine", "Colossus", "InfestorTerran", "Baneling", "Mothership", "MothershipCore", "Changeling", "SiegeTank", "Viking", "Reaper",
              "Ghost", "Marauder", "Thor", "Hellion", "Hellbat", "Cyclone", "Liberator", "Medivac", "Banshee", "Raven", "Battlecruiser", "Nuke", "Zealot",
              "Stalker", "HighTemplar", "Disruptor", "DarkTemplar", "Sentry", "Phoenix", "Carrier", "Oracle", "VoidRay", "Tempest", "WarpPrism", "Observer",
              "Immortal", "Adept", "Zergling", "Overlord", "Hydralisk", "Mutalisk", "Ultralisk", "Roach", "Infestor", "Corruptor",
              "BroodLord", "Queen", "Overseer", "Archon", "Broodling", "InfestedTerran", "Ravager", "Viper", "SwarmHost"]

# Each air unit appears exactly once ("Observer" used to be listed twice, which
# skewed any length- or count-based use of this list).
# NOTE(review): "Overlord" is in ARMY_UNITS but not here (while "Overseer" is), so it
# ends up in ARMY_GROUND — confirm whether that is intended.
ARMY_AIR = ["Mothership", "MothershipCore", "Viking", "Liberator", "Medivac", "Banshee", "Raven", "Battlecruiser",
            "Viper", "Mutalisk", "Phoenix", "Oracle", "Carrier", "VoidRay", "Tempest", "Observer", "WarpPrism", "BroodLord",
            "Corruptor", "Overseer"]

# Every army unit that is not an air unit, in ARMY_UNITS order.
# A frozenset makes each membership check O(1) instead of scanning the list.
_ARMY_AIR_LOOKUP = frozenset(ARMY_AIR)
ARMY_GROUND = [k for k in ARMY_UNITS if k not in _ARMY_AIR_LOOKUP]

In [4]:
# Read the replay summary table from JSON and preview its first rows.
summaries_path = 'data-old/replay_summaries.json'
df = pd.read_json(summaries_path)
df.head(5)

Unnamed: 0,path,total_gameloops,gameloop,build,winner,map,player_1,player_2,player_1_units,player_2_units
0,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,24725,23381,82457,2.0,Romanticide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ..."
1,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,18078,16734,82457,2.0,Oxide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ..."
2,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,12568,11224,82457,1.0,Deathaura LE,Rogue,&lt;Ex0n&gt;<sp/>MaxPax,"[Hatchery, Larva, Larva, Larva, Drone, Drone, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
3,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,10201,8857,82457,2.0,Pillars of Gold LE,&lt;인투더&gt;<sp/>SpeCial,PartinG,"[CommandCenter, SCV, SCV, SCV, SCV, SCV, SCV, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
4,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,19992,18648,82457,1.0,Romanticide LE,&lt;인투더&gt;<sp/>SpeCial,PartinG,"[CommandCenter, SCV, SCV, SCV, SCV, SCV, SCV, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."


In [212]:
# from openai import OpenAI

# import os

# client = OpenAI()

# def get_embedding(text, model="text-embedding-3-small"):
#    text = text.replace("\n", " ")
#    return client.embeddings.create(input = [text], model=model).data[0].embedding

# df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
# # df.to_csv('output/embedded_1k_reviews.csv', index=False)


In [None]:
# Preview player 1's unit names from the row labelled 0 as one comma-separated string.
first_p1_units = df['player_1_units'][0]
','.join(first_p1_units)

'Nexus,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Adept,Probe,Probe,Probe,Probe,VoidRay,Adept,Probe,Probe,Probe,Probe,Probe,Oracle,Probe,Probe,Probe,Probe,Probe,Oracle,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,VoidRay,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Probe,Immortal,Probe,Immortal,ChangelingZealot,ChangelingZealot,Immortal,WarpPrism,ChangelingZealot,Observer,Probe,Probe,Probe,Colossus,Probe,ChangelingZealot,Probe,Probe,Probe,Probe,Colossus,Probe,Probe,Colossus,Probe,Probe,Observer,Probe,Probe,Probe,Disruptor,ChangelingZealot,ChangelingZealot,ChangelingZealot,Colossus,Probe,Probe,ChangelingZealot,Disruptor,Disruptor,Disruptor,Disruptor,ChangelingZealot,ChangelingZealot,Disruptor,ChangelingZealot,Disruptor,Disruptor,Probe,Disruptor,Disruptor,Oracle,Disruptor,VoidRay,Disruptor,ChangelingZealot,ChangelingZealot,ChangelingZealot,O

In [42]:
# Collapse each player's unit list into a single comma-separated string.
# Guarded so the cell is idempotent: `.str.join(',')` applied to a value that is
# already a string would insert a comma between every character, so values that
# are already strings are kept as they are when the cell is re-run.
for units_col in ('player_1_units', 'player_2_units'):
    already_joined = df[units_col].map(lambda units: isinstance(units, str))
    df[units_col] = df[units_col].where(already_joined, df[units_col].str.join(','))

In [None]:
# Smoke test: embed only the first player-1 unit sequence.
# Relies on `get_embedding` already existing in the kernel; in file order it is
# defined in a later cell.
for seq in df['player_1_units'].values[:1]:
    seq_embeddings = get_embedding(seq)

In [214]:
# df['p1_embedding'] = df['player_1_units'].apply(lambda x: get_embedding(x.join(','), model='text-embedding-3-small'))

In [7]:
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Load the MiniLM sentence-embedding model once; it is passed to get_embedding below.
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')



In [11]:
# Name of the model used when the caller does not supply one.
_DEFAULT_EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Holds the lazily-created default model so it is loaded at most once.
_embedding_model_cache = {}


def get_embedding(text, model=None):
    """Encode ``text`` with a sentence-embedding model.

    Parameters
    ----------
    text
        Input handed unchanged to ``model.encode``.
    model
        Any object exposing ``encode(text)``. When omitted (``None``), the
        default MiniLM model is created on first use and reused afterwards,
        instead of being instantiated as a side effect of defining this function.

    Returns
    -------
    Whatever ``model.encode(text)`` returns.
    """
    if model is None:
        if _DEFAULT_EMBEDDING_MODEL_NAME not in _embedding_model_cache:
            _embedding_model_cache[_DEFAULT_EMBEDDING_MODEL_NAME] = SentenceTransformer(
                _DEFAULT_EMBEDDING_MODEL_NAME
            )
        model = _embedding_model_cache[_DEFAULT_EMBEDDING_MODEL_NAME]
    return model.encode(text)

In [12]:
# TESTED - 90% Accuracy
def _units_as_text(units):
    """Return a comma-separated unit string from a list of names or an already-joined string."""
    # ','.join on a str would put a comma between every character; that happens
    # when the unit columns were already collapsed with .str.join(',') earlier.
    return units if isinstance(units, str) else ','.join(units)


# One embedding vector per replay and player, computed from the unit sequence text.
df['p1_embedding'] = df['player_1_units'].apply(lambda units: get_embedding(_units_as_text(units), model=model))
df['p2_embedding'] = df['player_2_units'].apply(lambda units: get_embedding(_units_as_text(units), model=model))

# df['p2_agg_embedding'] = df['agg_feature_p2'].apply(lambda x: get_embedding(x, model=model))
# df['p1_agg_embedding'] = df['agg_feature_p1'].apply(lambda x: get_embedding(x, model=model)
# df['unit_agg'] = df['unit_agg'].apply(lambda x: get_embedding(x, model=model))


In [160]:
# Row-wise difference between the two players' embedding vectors.
p1_vectors = df['p1_embedding']
p2_vectors = df['p2_embedding']
df['embed_dif'] = p1_vectors - p2_vectors

In [126]:
# For each possible winner, keep that player's embedding column (renamed to a
# common 'embeddings' label) together with the shared metadata columns.
shared_cols = ['build', 'total_gameloops', 'winner']
p1_df = (
    df.loc[df['winner'] == 1, ['p1_embedding'] + shared_cols]
    .rename(columns={'p1_embedding': 'embeddings'})
)
p2_df = (
    df.loc[df['winner'] == 2, ['p2_embedding'] + shared_cols]
    .rename(columns={'p2_embedding': 'embeddings'})
)

In [161]:
# final_df = pd.concat([p1_df, p2_df]).sort_index()

# final_df = df[['p1_embedding', 'build', 'total_gameloops', 'winner']].copy()
# final_df.columns = ['embeddings', 'build', 'total_gameloops', 'winner']
# final_df.dropna(inplace=True)
# Recompute the embedding difference, then keep it (as 'embeddings') alongside the label.
p1_vectors = df['p1_embedding']
df['embed_dif'] = p1_vectors - df['p2_embedding']

final_df = df[['embed_dif', 'winner']].rename(columns={'embed_dif': 'embeddings'})


In [210]:
# Write the first five rows of a few descriptive columns to test.csv (no index column).
export_cols = ['build', 'winner', 'player_1', 'player_2', 'player_1_units', 'player_2_units']
df[export_cols].head(5).to_csv('test.csv', index=False)

In [204]:
# temp_df = pd.DataFrame(df['p1_embedding'].tolist(), index= df.index)
# temp_df['winner'] = df['winner'].values
# temp_df
# pd.DataFrame(df['p1_embedding'].tolist(), index= df.index)

Unnamed: 0,0
0,[ 1.45169543e-02 -6.07941151e-02 -4.89254519e-...
1,[ 2.62572877e-02 -8.35469738e-02 -2.40886137e-...
2,[ 4.30721045e-02 -5.66701740e-02 -2.65929829e-...
3,[-2.03307923e-02 -6.20039031e-02 -7.17267394e-...
4,[-4.21783030e-02 -6.19712546e-02 -5.99170811e-...
...,...
2519,[ 4.45341021e-02 -5.91899455e-02 -2.57531796e-...
2520,[ 5.23572788e-02 -6.01047203e-02 -2.99987681e-...
2521,[ 4.11112905e-02 -5.73762543e-02 -2.59615872e-...
2522,[ 4.40342277e-02 -5.25588989e-02 -2.68702731e-...


In [None]:
# NOTE(review): in file order `train_df` is first assigned in a later cell, so this
# cell depends on state already present in the kernel.

# Remove rows containing any missing value before splitting.
train_df.dropna(inplace=True)

# Cast column labels to strings so every label has the same type.
train_df.columns = train_df.columns.astype(str)

# Hold out 10% of the rows as a final test set; seeded so the holdout (and every
# score computed from it) is reproducible across runs.
test_df = train_df.sample(int(train_df.shape[0]*0.1), random_state=42)

# Everything outside the holdout forms the modelling pool.
in_holdout = train_df.index.isin(test_df.index)
X = train_df[~in_holdout].drop('winner', axis=1)
y = train_df[~in_holdout]['winner'] - 1  # Adjusting target to 0-based

# 80/20 train/validation split of the modelling pool.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Features and 0-based labels of the untouched holdout.
X_test = test_df.drop('winner', axis=1)
y_test = test_df['winner'] - 1


In [273]:
  # Per-column count of missing values in df.

df.isnull().sum()

path               0
total_gameloops    0
gameloop           0
build              0
winner             6
map                0
player_1           0
player_2           0
player_1_units     0
player_2_units     0
p1_embedding       0
p2_embedding       0
embed_dif          0
dtype: int64

In [343]:
# Difference of the two players' embedding vectors, row by row.
p1_vectors = df['p1_embedding']
df['embed_dif'] = p1_vectors - df['p2_embedding']

# Keep the difference (renamed to 'embeddings') with the metadata and label columns.
final_df = (
    df[['embed_dif', 'map', 'build', 'total_gameloops', 'winner']]
    .rename(columns={'embed_dif': 'embeddings'})
)

# Expand each embedding into one column per dimension, row-aligned with final_df.
embedding_rows = final_df['embeddings'].tolist()
train_df = pd.DataFrame(embedding_rows, index=final_df.index)


In [344]:
# Swap the packed 'embeddings' column for the expanded per-dimension columns,
# drop the placeholder map rows, then drop rows with any missing value.
# Built as one chain that ends in a fresh frame, rather than calling
# dropna(inplace=True) on a boolean-filtered slice, so later column assignments
# on final_df do not operate on a flagged copy.
final_df = (
    pd.concat([final_df.drop('embeddings', axis=1), train_df], axis=1)
    .loc[lambda frame: frame['map'] != 'TEST__DOCUMENT']
    .dropna()
)

In [345]:
# train_df['winner'] = final_df['winner'].values
# train_df['map'] = df['map'].values
# train_df['total_gameloops'] = df['total_gameloops'].values
# train_df['build'] = df['build'].values
# Treat the build number as a categorical label rather than a quantity.
build_as_text = final_df['build'].astype(str)
final_df['build'] = build_as_text

# Features are every column except the label; labels are shifted from 1/2 to 0/1.
X = final_df.drop(columns='winner')
y = final_df['winner'] - 1

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [330]:
# Per-column count of missing values in the training features.
X_train.isnull().sum()

map                0
build              0
total_gameloops    0
0                  0
1                  0
                  ..
379                0
380                0
381                0
382                0
383                0
Length: 387, dtype: int64

In [331]:
# Show the training rows whose map value is missing.
map_is_missing = X_train['map'].isnull()
X_train.loc[map_is_missing]

Unnamed: 0,map,build,total_gameloops,0,1,2,3,4,5,6,...,374,375,376,377,378,379,380,381,382,383


In [346]:
# Columns to standardise: everything except the label and the two categorical columns.
non_numeric = ['winner', 'map', 'build']
cols = [str(col) for col in X_train.columns if col not in non_numeric]

# Use string column labels on both splits so they match the names in `cols`.
for split_frame in (X_train, X_test):
    split_frame.columns = split_frame.columns.astype(str)

# Fit the scaler on the training split only, then apply the same scaling to the test split.
scaler = StandardScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [347]:
from category_encoders import TargetEncoder, OneHotEncoder
# NOTE(review): this import rebinds `OneHotEncoder`, which the first cell imported
# from sklearn.preprocessing — confirm the shadowing is intended.

# Target-encode the categorical columns, fitting on the training split only.
target_encoding_cols = ['map', 'build']
encoder = TargetEncoder(cols=target_encoding_cols)
encoder.fit(X_train[target_encoding_cols], y_train.values)

# Apply the fitted encoder to both splits.
for split_frame in (X_train, X_test):
    split_frame[target_encoding_cols] = encoder.transform(split_frame[target_encoding_cols])

In [334]:
# Re-check for training rows whose map value is missing.
map_is_missing = X_train['map'].isnull()
X_train.loc[map_is_missing]

Unnamed: 0,map,build,total_gameloops,0,1,2,3,4,5,6,...,374,375,376,377,378,379,380,381,382,383


In [335]:
# List the distinct map names present in df.
df['map'].unique()

array(['Romanticide LE', 'Oxide LE', 'Deathaura LE', 'Pillars of Gold LE',
       'Lightshade LE', 'Jagannatha LE', 'Submarine LE', '데스오라 - 래더',
       '라이트쉐이드 - 래더', '로맨티사이드 - 래더', '서브머린 - 래더', '옥사이드 - 래더',
       '자가나타 - 래더', '필러스 오브 골드 - 래더', 'Beckett Industries LE',
       '2000 Atmospheres LE', '2000大氣壓力 - 天梯版', '羅曼死 - 天梯版', '札格納特 - 天梯版',
       'Blackburn LE', '紫晶浪漫-天梯版', '大气2000-天梯版', '黑色燃烧-天梯版', '贝克特工业-天梯版',
       '锈化山巅-天梯版', '世界主宰-天梯版', '光影交错-天梯版', 'Ephemeron LE',
       'Nightshade LE', 'Simulacrum LE', 'World of Sleepers LE', 'Zen LE',
       'Eternal Empire LE', 'Triton LE', '시뮬레이크럼 - 래더', '이페머론 - 래더',
       '나이트쉐이드 - 래더', '이터널 엠파이어 - 래더', '트라이튼 - 래더', '월드 오브 슬리퍼스 - 래더',
       '젠 - 래더', 'Nocny Mrok ER', 'Efemeryda ER', 'Triton EC',
       'Empire éternel EC', 'Domaine des dormeurs EC', '毒茄樹叢 - 天梯版',
       '休眠者之境 - 天梯版', '海神信使 - 天梯版', '永恆帝國 - 天梯版', 'Ewiges Imperium LE',
       'Welt der Schläfer LE', 'Ever Dream LE', 'Golden Wall LE',
       '에버 드림 - 래더', 'Purity and Ind

In [348]:
# Stack the train and test splits back into a single feature frame / label series.
# NOTE(review): the scaler and target encoder above were fit on X_train / y_train,
# so cross-validating on this recombined data reuses label information inside the
# folds — scores may be optimistic; confirm this is acceptable.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [341]:
# Number of missing labels.
y.isnull().sum()

4

In [349]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter with a fixed seed, shared by both candidates.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Score each candidate with the same folds and print mean (std) accuracy.
for model in (LogisticRegression(), RandomForestClassifier(n_estimators=100)):
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.606 (0.016)


Accuracy: 0.618 (0.011)


In [2]:
# Support-vector classifier scored with the `cv` splitter from the kernel state.
model = SVC(gamma='auto')
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
# Mean and spread of the per-fold accuracies.
accuracy_summary = (mean(scores), std(scores))
print('Accuracy: %.3f (%.3f)' % accuracy_summary)

NameError: name 'cross_val_score' is not defined

In [228]:
# Copy the map names onto train_df positionally (index labels are ignored).
map_values = df['map'].values
train_df['map'] = map_values

# ohe = OneHotEncoder(sparse_output=False)
# map_encoded = ohe.fit_transform(train_df[['map']])

# map_encoded_df = pd.DataFrame(map_encoded, columns=ohe.get_feature_names_out(['map']), index=train_df.index)
# train_df = pd.concat([train_df.drop('map', axis=1), map_encoded_df], axis=1)

  train_df['map'] = df['map'].values


In [231]:
# Display the current contents of train_df.
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,378,379,380,381,382,383,winner,total_gameloops,build,map
0,-0.490679,1.750192,-0.495058,1.036633,-0.682884,-1.622613,0.099752,0.862536,-0.927759,-0.780038,...,0.551306,-0.583378,0.279552,-0.354771,-0.391886,-0.672420,2.0,1.096844,0.131710,1.501754
1,0.042937,1.019797,0.213319,0.512174,0.652828,-1.451873,-0.431510,-0.244751,-0.386331,1.078594,...,0.983163,0.592824,1.236903,0.224464,-1.213572,-0.364286,2.0,0.226710,0.131710,1.498127
2,0.048653,1.052163,0.695771,-0.967585,0.278356,1.668354,-0.034601,0.504739,0.769443,2.653547,...,-0.382399,-0.540342,-1.074837,0.760264,0.278851,0.878152,1.0,-0.494583,0.131710,1.496933
3,-1.144252,0.479412,-1.395915,-1.572050,-1.306028,0.921969,-0.784882,1.323840,-0.890532,-0.212877,...,-2.193267,-1.326865,-0.539619,0.221812,1.041486,0.914849,2.0,-0.804438,0.131710,1.517007
4,-1.470412,1.290371,-0.495829,-1.832286,-1.192710,1.244555,-1.494436,1.734607,0.013478,-1.082828,...,-1.411573,-1.861036,-1.478285,0.311207,0.403000,0.827134,1.0,0.477265,0.131710,1.501754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,1.155758,0.402191,1.497073,0.389178,-0.578597,0.419794,1.798646,-1.696839,0.906561,1.342602,...,1.412227,0.951413,0.335720,-0.333501,0.786178,-0.069398,2.0,0.609218,0.294182,1.517007
2520,0.490209,-0.725072,-0.114175,-0.457738,1.197821,1.051277,0.232435,-1.033771,3.354834,1.290349,...,-1.185296,2.351648,-0.178291,0.143230,0.772057,0.397531,2.0,4.661288,0.294182,1.501754
2521,0.265173,0.514021,0.587258,-0.643408,0.595532,1.287039,0.229322,-0.094544,1.193985,0.547074,...,-0.257501,0.089830,-1.743256,1.761811,-0.410492,1.115826,1.0,-0.145194,0.294182,1.522388
2522,0.331396,-0.117938,0.199740,-0.417706,2.266569,1.971858,-0.607278,-0.300842,2.335385,1.869734,...,-0.503499,-0.261925,-0.251555,-0.127957,-0.426449,0.976525,1.0,-0.364986,0.294182,1.498127


In [229]:
# Display the map column of train_df.
train_df['map']

0           Romanticide LE
1                 Oxide LE
2             Deathaura LE
3       Pillars of Gold LE
4           Romanticide LE
               ...        
2519    Pillars of Gold LE
2520        Romanticide LE
2521         Jagannatha LE
2522              Oxide LE
2523         Lightshade LE
Name: map, Length: 2524, dtype: object

In [214]:
# Display the current contents of train_df.
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,map_시뮬레이크럼 - 래더,map_에버 드림 - 래더,map_옥사이드 - 래더,map_월드 오브 슬리퍼스 - 래더,map_이터널 엠파이어 - 래더,map_이페머론 - 래더,map_자가나타 - 래더,map_젠 - 래더,map_트라이튼 - 래더,map_필러스 오브 골드 - 래더
0,-0.005736,-0.078801,0.023953,-0.016981,0.029656,-0.072856,0.008278,-0.020970,-0.038879,-0.530037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.028649,-0.056277,-0.016154,-0.013596,0.026374,-0.031563,0.002494,-0.025447,0.025952,0.794390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.031764,-0.068439,-0.029010,0.186214,0.027172,0.025963,0.006615,-0.018449,0.132115,3.432483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.055471,-0.051367,0.121567,0.176901,0.019560,0.027177,-0.005501,-0.023116,0.012673,-0.228637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.096490,-0.070425,0.014591,0.426744,0.015771,0.031131,-0.007495,-0.020496,0.050307,-0.791174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,-0.402236,-0.050194,-0.038252,0.008293,0.032042,-0.012174,-0.264430,-0.020411,0.189326,0.821420,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2520,0.070721,0.005766,0.004883,0.049122,0.025552,0.022769,0.010133,-0.021542,-0.035273,0.878505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2521,0.049985,-0.054427,-0.027161,0.073265,0.026396,0.027258,0.010282,-0.016964,0.722721,0.199372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2522,0.056316,-0.031990,-0.014948,0.045729,0.024707,0.026827,0.001959,-0.014463,-0.073942,1.617246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [219]:
# Display the current contents of train_df.
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,map_시뮬레이크럼 - 래더,map_에버 드림 - 래더,map_옥사이드 - 래더,map_월드 오브 슬리퍼스 - 래더,map_이터널 엠파이어 - 래더,map_이페머론 - 래더,map_자가나타 - 래더,map_젠 - 래더,map_트라이튼 - 래더,map_필러스 오브 골드 - 래더
0,-0.005736,-0.078801,0.023953,-0.016981,0.029656,-0.072856,0.008278,-0.020970,-0.038879,-0.530037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.028649,-0.056277,-0.016154,-0.013596,0.026374,-0.031563,0.002494,-0.025447,0.025952,0.794390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.031764,-0.068439,-0.029010,0.186214,0.027172,0.025963,0.006615,-0.018449,0.132115,3.432483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.055471,-0.051367,0.121567,0.176901,0.019560,0.027177,-0.005501,-0.023116,0.012673,-0.228637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.096490,-0.070425,0.014591,0.426744,0.015771,0.031131,-0.007495,-0.020496,0.050307,-0.791174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,-0.402236,-0.050194,-0.038252,0.008293,0.032042,-0.012174,-0.264430,-0.020411,0.189326,0.821420,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2520,0.070721,0.005766,0.004883,0.049122,0.025552,0.022769,0.010133,-0.021542,-0.035273,0.878505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2521,0.049985,-0.054427,-0.027161,0.073265,0.026396,0.027258,0.010282,-0.016964,0.722721,0.199372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2522,0.056316,-0.031990,-0.014948,0.045729,0.024707,0.026827,0.001959,-0.014463,-0.073942,1.617246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
# Sum every non-label column per row, then summarise those sums for each winner value.
row_totals = train_df.drop(columns=['winner']).sum(axis=1)
temp_df = pd.DataFrame({'val': row_totals, 'prediction': train_df.winner})

summary_stats = ['mean', 'min', 'max', 'size']
temp_df.groupby('prediction').agg({'val': summary_stats})

Unnamed: 0_level_0,val,val,val,val
Unnamed: 0_level_1,mean,min,max,size
prediction,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1.0,-0.447045,-103.225227,93.653801,1238
2.0,0.479865,-86.829826,108.468056,1280


In [215]:
# Remove rows containing any missing value before splitting.
train_df.dropna(inplace=True)

# Cast column labels to strings so every label has the same type.
train_df.columns = train_df.columns.astype(str)

# Hold out 10% of the rows as a final test set; seeded so the holdout (and every
# score computed from it) is reproducible across runs.
test_df = train_df.sample(int(train_df.shape[0]*0.1), random_state=42)

# Everything outside the holdout forms the modelling pool.
in_holdout = train_df.index.isin(test_df.index)
X = train_df[~in_holdout].drop('winner', axis=1)
y = train_df[~in_holdout]['winner'] - 1  # Adjusting target to 0-based

# 80/20 train/validation split of the modelling pool.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Features and 0-based labels of the untouched holdout.
X_test = test_df.drop('winner', axis=1)
y_test = test_df['winner'] - 1


In [216]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 5-fold splitter with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Evaluate logistic regression first, then a 100-tree random forest, on the same folds.
candidates = (LogisticRegression(), RandomForestClassifier(n_estimators=100))
for model in candidates:
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    # Mean accuracy with its standard deviation across folds.
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.545 (0.023)


In [217]:
# Random forest (100 trees) scored by shuffled, seeded 5-fold cross-validation.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
fold_mean, fold_std = mean(scores), std(scores)
print('Accuracy: %.3f (%.3f)' % (fold_mean, fold_std))

Accuracy: 0.610 (0.011)


In [218]:
# XGBoost classifier scored by shuffled, seeded 5-fold cross-validation.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
xgb_settings = {'use_label_encoder': False, 'eval_metric': 'logloss'}
model = XGBClassifier(**xgb_settings)
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
fold_mean, fold_std = mean(scores), std(scores)
print('Accuracy: %.3f (%.3f)' % (fold_mean, fold_std))

Accuracy: 0.594 (0.014)


In [184]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [185]:
# AdaBoost with default settings, scored by shuffled, seeded 5-fold cross-validation.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
model = AdaBoostClassifier()
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
fold_mean, fold_std = mean(scores), std(scores)
print('Accuracy: %.3f (%.3f)' % (fold_mean, fold_std))

Accuracy: 0.591 (0.013)


In [176]:
# Gaussian-process classifier with default settings, scored by shuffled, seeded 5-fold CV.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
model = GaussianProcessClassifier()
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
fold_mean, fold_std = mean(scores), std(scores)
print('Accuracy: %.3f (%.3f)' % (fold_mean, fold_std))

Accuracy: 0.555 (0.012)


In [186]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with default settings, scored by shuffled, seeded 5-fold CV.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
model = MLPClassifier()
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
fold_mean, fold_std = mean(scores), std(scores)
print('Accuracy: %.3f (%.3f)' % (fold_mean, fold_std))

Accuracy: 0.601 (0.016)


In [205]:


from sklearn.metrics import classification_report

# Three candidate classifiers.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
fitted_in_order = (lr, rf, xgb)

# Fit each one on the training split.
for estimator in fitted_in_order:
    estimator.fit(X_train, y_train)

# Predict the validation split with each fitted model.
lr_predictions, rf_predictions, xgb_predictions = [
    estimator.predict(X_val) for estimator in fitted_in_order
]

# Per-class precision / recall / F1 on the validation split.
reports = (
    ("Logistic Regression:\n", lr_predictions),
    ("Random Forest:\n", rf_predictions),
    ("XGBoost:\n", xgb_predictions),
)
for heading, predictions in reports:
    print(heading, classification_report(y_val, predictions))


Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.54      0.55      0.55       216
         1.0       0.59      0.58      0.59       238

    accuracy                           0.57       454
   macro avg       0.57      0.57      0.57       454
weighted avg       0.57      0.57      0.57       454

Random Forest:
               precision    recall  f1-score   support

         0.0       0.55      0.59      0.57       216
         1.0       0.60      0.57      0.59       238

    accuracy                           0.58       454
   macro avg       0.58      0.58      0.58       454
weighted avg       0.58      0.58      0.58       454

XGBoost:
               precision    recall  f1-score   support

         0.0       0.55      0.59      0.57       216
         1.0       0.60      0.56      0.58       238

    accuracy                           0.57       454
   macro avg       0.57      0.57      0.57       454
weighted avg       0.57   

In [40]:
# Embed player 2's unit sequence from the row labelled 1030.
units_1030 = df.loc[1030, 'player_2_units']
# The unit columns may already hold comma-joined strings; joining a string again
# would insert a comma between every character, so only join real sequences.
data_embed = units_1030 if isinstance(units_1030, str) else ','.join(units_1030)
# Use get_embedding's default embedding model: the global name `model` is reassigned
# to classifiers by the cross-validation cells above, and those have no `encode` method.
data_embed = get_embedding(data_embed)

In [59]:
# Random-forest predictions on X_test, shifted by +1 (labels were reduced by 1 before fitting).
rf_preds = 1 + rf.predict(X_test)

In [60]:
# Store the random-forest predictions on test_df as a new 'rf_pred' column.
test_df['rf_pred'] = rf_preds

In [56]:
from sklearn.metrics import classification_report

# Score the fitted logistic regression on X_test / y_test and print the per-class report.
lr_predictions = lr.predict(X_test)
lr_report = classification_report(y_test, lr_predictions)
print("Logistic Regression:\n", lr_report)
# print("Random Forest:\n", classification_report(y_test, rf_predictions))
# print("XGBoost:\n", classification_report(y_test, xgb_predictions))

Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.63      0.29      0.40       136
         1.0       0.49      0.80      0.61       116

    accuracy                           0.53       252
   macro avg       0.56      0.55      0.51       252
weighted avg       0.57      0.53      0.50       252

