In [2]:
import json
import numpy as np
from numpy import mean
from numpy import std
import os
from pathlib import Path
import pandas as pd
from ast import literal_eval
from collections import Counter

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
from sklearn.feature_extraction import FeatureHasher
from sentence_transformers import SentenceTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from scipy.stats import randint, uniform, loguniform
import mlflow



import matplotlib.pyplot as plt
import math
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 8})

  from .autonotebook import tqdm as notebook_tqdm


### Helper Functions

In [3]:
def convert_array(arr):
    return np.array(arr)

def parse_and_count_units(unit_str):
    unit_counts = Counter(unit_str)
    return unit_counts

def get_metrics(y_test, y_pred):
    clf_rep = metrics.precision_recall_fscore_support(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
    out_dict = {
                "precision 1" :clf_rep[0][0].round(2)
                ,"precision 2" :clf_rep[0][1].round(2)
                ,"recall 1" : clf_rep[1][0].round(2)
                ,"recall 2" : clf_rep[1][1].round(2)
                ,"f1-score 1" : clf_rep[2][0].round(2)
                ,"f1-score 2" : clf_rep[2][1].round(2)
                , "accuracy" : round(accuracy, 2)
                }
    
    return out_dict

VESPENE_UNITS = ["Assimilator", "Extractor", "Refinery"]

SUPPLY_UNITS = ["Overlord", "Overseer", "Pylon", "SupplyDepot"]

WORKER_UNITS = ["Drone", "Probe", "SCV", "MULE"]

BASE_UNITS = ["CommandCenter", "Nexus", "Hatchery", "Lair", "Hive", "PlanetaryFortress", "OrbitalCommand"]

GROUND_UNITS = ["Barracks", "Factory", "GhostAcademy", "Armory", "RoboticsBay", "RoboticsFacility", "TemplarArchive",
                "DarkShrine", "WarpGate", "SpawningPool", "RoachWarren", "HydraliskDen", "BanelingNest", "UltraliskCavern",
                "LurkerDen", "InfestationPit"]

AIR_UNITS = ["Starport", "FusionCore", "RoboticsFacility", "Stargate", "FleetBeacon", "Spire", "GreaterSpire"]

TECH_UNITS = ["EngineeringBay", "Armory", "GhostAcademy", "TechLab", "FusionCore", "Forge", "CyberneticsCore",
              "TwilightCouncil", "RoboticsFacility", "RoboticsBay", "FleetBeacon", "TemplarArchive", "DarkShrine",
              "SpawningPool", "RoachWarren", "HydraliskDen", "BanelingNest", "UltraliskCavern", "LurkerDen", "Spire",
              "GreaterSpire", "EvolutionChamber", "InfestationPit"]

ARMY_UNITS = ["Marine", "Colossus", "InfestorTerran", "Baneling", "Mothership", "MothershipCore", "Changeling", "SiegeTank", "Viking", "Reaper",
              "Ghost", "Marauder", "Thor", "Hellion", "Hellbat", "Cyclone", "Liberator", "Medivac", "Banshee", "Raven", "Battlecruiser", "Nuke", "Zealot",
              "Stalker", "HighTemplar", "Disruptor", "DarkTemplar", "Sentry", "Phoenix", "Carrier", "Oracle", "VoidRay", "Tempest", "WarpPrism", "Observer",
              "Immortal", "Adept", "Zergling", "Overlord", "Hydralisk", "Mutalisk", "Ultralisk", "Roach", "Infestor", "Corruptor",
              "BroodLord", "Queen", "Overseer", "Archon", "Broodling", "InfestedTerran", "Ravager", "Viper", "SwarmHost"]

ARMY_AIR = ["Mothership", "MothershipCore", "Viking", "Liberator", "Medivac", "Banshee", "Raven", "Battlecruiser",
            "Viper", "Mutalisk", "Phoenix", "Oracle", "Carrier", "VoidRay", "Tempest", "Observer", "WarpPrism", "BroodLord",
            "Corruptor", "Observer", "Overseer"]


def count_unit_type(player_units):
    count_dict = {}
    unit_types = ['SUPPLY_UNITS','WORKER_UNITS','ARMY_UNITS','ARMY_AIR', 'VESPENE_UNITS', 'TECH_UNITS', 'GROUND_UNITS', 'AIR_UNITS']

    for unit_type in unit_types:
        count_dict[unit_type] = 0
        
    for unit in player_units.keys():
        if unit in VESPENE_UNITS: 
            count_dict['VESPENE_UNITS'] = count_dict['VESPENE_UNITS'] + player_units[unit]
        if unit in AIR_UNITS:
            count_dict['AIR_UNITS'] = count_dict['AIR_UNITS'] + player_units[unit]
        
        if unit in TECH_UNITS:
            count_dict['TECH_UNITS'] = count_dict['TECH_UNITS'] + player_units[unit]
            
        if unit in GROUND_UNITS:
            count_dict['GROUND_UNITS'] = count_dict['GROUND_UNITS'] + player_units[unit]

        if unit in SUPPLY_UNITS:
            count_dict['SUPPLY_UNITS'] = count_dict['SUPPLY_UNITS'] + player_units[unit]

        if unit in WORKER_UNITS:
            count_dict['WORKER_UNITS'] = count_dict['WORKER_UNITS'] + player_units[unit]

        if unit in ARMY_UNITS:
            count_dict['ARMY_UNITS'] = count_dict['ARMY_UNITS'] + player_units[unit]

        if unit in ARMY_AIR:
            count_dict['ARMY_AIR'] = count_dict['ARMY_AIR'] + player_units[unit]
            
    # total = sum(count_dict.values())
    # if count_dict['ARMY_UNITS'] != 0: count_dict['ARMY_UNITS'] = round(count_dict['ARMY_UNITS']/total, 2)
    # if count_dict['WORKER_UNITS'] != 0: count_dict['WORKER_UNITS'] = round(count_dict['WORKER_UNITS']/total, 2)

    # if count_dict['SUPPLY_UNITS'] != 0: count_dict['SUPPLY_UNITS'] =round(count_dict['SUPPLY_UNITS']/total, 2)
    # if count_dict['ARMY_AIR'] != 0: count_dict['ARMY_AIR'] = round(count_dict['ARMY_AIR']/total, 2)
    return count_dict


def map_filter(value):
    map_names = ['Romanticide LE', 'Oxide LE', 'Lightshade LE', '2000 Atmospheres LE',
       'Jagannatha LE', '[ESL] Data-C', '[ESL] Inside and Out', 'Deathaura LE',
       'Pillars of Gold LE', 'Blackburn LE', '[ESL] Cosmic Sapphire',
       'Nightshade LE', 'Eternal Empire LE', '[ESL] Moondance',
       '[ESL] Tropical Sacrifice', 'Data-C', 'Inside and Out', 'Simulacrum LE',
       '[ESL] Waterfall', 'Moondance', 'Ephemeron LE', 'Triton LE', 'Zen LE',
       'Ever Dream LE', 'Cosmic Sapphire', '[ESL] Stargazers',
       'Beckett Industries LE', 'Tropical Sacrifice', 'Submarine LE']
    
    if value in map_names: return value
    else: return 'others'


## Loading Data

In [5]:
# df = pd.read_json('data-old/replay_summaries.json')
df = pd.read_json('data/new_data.json')
df.head()

Unnamed: 0,map,player_1_units,player_1,player_2_units,player_2,winner,build,path,p1_embedding,p2_embedding
0,Oxide LE,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",Trap,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",DPGherO,2.0,84643,ASUS ROG Fall 2021 Replays\Group Stage\Group A...,"[0.0102319783, -0.0896995589, -0.0315713435, 0...","[0.0056394353000000005, -0.0848710686, -0.0358..."
1,2000 Atmospheres LE,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",DPGherO,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",Trap,2.0,84643,ASUS ROG Fall 2021 Replays\Group Stage\Group A...,"[-0.0059522321000000005, -0.0872044638, -0.033...","[0.0066874381, -0.0913076401, -0.029424306, 0...."
2,Romanticide LE,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",DPGherO,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",Trap,2.0,84643,ASUS ROG Fall 2021 Replays\Group Stage\Group A...,"[0.0299470052, -0.11222403500000001, -0.022741...","[0.0110502336, -0.10613112150000001, -0.018807..."
3,Oxide LE,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",HeroMarine,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",DPGZest,2.0,84643,ASUS ROG Fall 2021 Replays\Group Stage\Group A...,"[0.0054325555, -0.0752304047, -0.0400757939000...","[0.0397706293, -0.1149999201, -0.0086327605, 0..."
4,Romanticide LE,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",DPGZest,"[BeaconArmy, BeaconDefend, BeaconAttack, Beaco...",HeroMarine,1.0,84643,ASUS ROG Fall 2021 Replays\Group Stage\Group A...,"[0.0382621773, -0.1146185249, -0.0089894664, 0...","[0.0048489594, -0.0885644034, -0.0402806252, -..."


## Data Cleaning

In [6]:
df.dropna(inplace=True)
df = df[df['map'] != 'TEST__DOCUMENT']
df['player_1'] = df['player_1'].str.replace('&lt;', '<').str.replace('&gt;', '>').str.replace('<sp/>', '')
df['player_2'] = df['player_2'].str.replace('&lt;', '<').str.replace('&gt;', '>').str.replace('<sp/>', '')
# df.drop(['p1_embedding', 'p2_embedding'], axis=1, inplace=True)
df['p1_embedding'] = df['p1_embedding'].apply(lambda x: convert_array(x))
df['p2_embedding'] = df['p2_embedding'].apply(lambda x: convert_array(x))

## Feature Engineering

In [7]:
df['p1_unique_units'] = [len(set(x)) for x in df['player_1_units'].values]
df['p2_unique_units'] = [len(set(x)) for x in df['player_2_units'].values]
df['unique_diff'] = df['p1_unique_units'] - df['p2_unique_units']


df['p1_unit_count'] = [len(x) for x in df['player_1_units'].values]
df['p2_unit_count'] = [len(x) for x in df['player_2_units'].values]
df['unit_count_diff'] = df['p1_unit_count'] - df['p2_unit_count']

In [8]:
player_1_units_counts = df['player_1_units'].apply(parse_and_count_units)
player_2_units_counts = df['player_2_units'].apply(parse_and_count_units)

p1_unit_types = pd.json_normalize(player_1_units_counts.apply(count_unit_type))
p1_unit_types.columns = [f'p1_{col}' for col in p1_unit_types]

p2_unit_types = pd.json_normalize(player_2_units_counts.apply(count_unit_type))
p2_unit_types.columns = [f'p2_{col}' for col in p2_unit_types]

df[p1_unit_types.columns] = p1_unit_types.values
df[p2_unit_types.columns] = p2_unit_types.values


In [9]:
create_unit_cols = ['InvisibleTargetDummy', 'Larva', 'Zergling', 'BroodlingEscort', 'Drone', 'Broodling', 'Baneling', 'CreepTumorBurrowed']

player_1_units_counts = df['player_1_units'].apply(parse_and_count_units)
player_2_units_counts = df['player_2_units'].apply(parse_and_count_units)

p1_c = pd.json_normalize(player_1_units_counts).fillna(0)
p1_c = p1_c[[col for col in create_unit_cols if col in p1_c.columns]]
p1_c.columns = [f'p1_{col}' for col in p1_c.columns]

p2_c = pd.json_normalize(player_2_units_counts).fillna(0)
p2_c = p2_c[[col for col in create_unit_cols if col in p2_c.columns]]
p2_c.columns = [f'p2_{col}' for col in p2_c.columns]

p_unit_counts = pd.concat([p1_c, p2_c], axis=1).fillna(0)
df[p_unit_counts.columns] = p_unit_counts.values



In [10]:
df['map'] = df['map'].apply(map_filter)    

In [11]:
train_cols = ['map','winner', 'build' , 'p1_unique_units', 'p2_unique_units', 'unique_diff', 'p1_unit_count', 'p2_unit_count', 'unit_count_diff', 
'p1_SUPPLY_UNITS', 'p1_WORKER_UNITS', 'p1_ARMY_UNITS', 'p1_ARMY_AIR', 'p1_VESPENE_UNITS', 'p1_TECH_UNITS', 'p1_GROUND_UNITS', 'p1_AIR_UNITS', 
'p2_SUPPLY_UNITS', 'p2_WORKER_UNITS', 'p2_ARMY_UNITS', 'p2_ARMY_AIR', 'p2_VESPENE_UNITS','p2_TECH_UNITS', 'p2_GROUND_UNITS', 'p2_AIR_UNITS',
'p1_InvisibleTargetDummy', 'p1_Larva', 'p1_Zergling','p1_BroodlingEscort', 'p1_Drone', 'p1_Broodling', 'p1_Baneling','p1_CreepTumorBurrowed', 
'p2_InvisibleTargetDummy', 'p2_Larva', 'p2_Zergling', 'p2_BroodlingEscort', 'p2_Drone', 'p2_Broodling', 'p2_Baneling', 'p2_CreepTumorBurrowed']

df[train_cols]

Unnamed: 0,map,winner,build,p1_unique_units,p2_unique_units,unique_diff,p1_unit_count,p2_unit_count,unit_count_diff,p1_SUPPLY_UNITS,...,p1_Baneling,p1_CreepTumorBurrowed,p2_InvisibleTargetDummy,p2_Larva,p2_Zergling,p2_BroodlingEscort,p2_Drone,p2_Broodling,p2_Baneling,p2_CreepTumorBurrowed
0,Oxide LE,2.0,84643,29,25,4,77,80,-3,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2000 Atmospheres LE,2.0,84643,31,32,-1,110,125,-15,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Romanticide LE,2.0,84643,33,32,1,103,124,-21,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Oxide LE,2.0,84643,39,42,-3,292,257,35,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Romanticide LE,1.0,84643,32,35,-3,140,191,-51,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4440,Pillars of Gold LE,2.0,82893,9,6,3,1132,139,993,19,...,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4441,Romanticide LE,2.0,82893,18,8,10,2643,127,2516,39,...,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4442,Jagannatha LE,1.0,82893,10,8,2,555,77,478,28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4443,Oxide LE,1.0,82893,9,6,3,391,65,326,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
df['embed_dif'] = df['p1_embedding'] - df['p2_embedding']
embedding_df = pd.DataFrame(df['embed_dif'].tolist(), index= df.index)
embedding_df.columns = [str(col) for col in embedding_df.columns]
# df = pd.concat([df.drop('embeddings', axis=1), embedding_df], axis=1)

In [52]:
embedding_df.shape

(4439, 384)

In [12]:
final_df = df[train_cols].copy()
final_df = final_df.sample(frac=1).reset_index(drop=True)
# final_df = pd.concat([final_df, embedding_df], axis=1)

# final_df[embedding_df.columns] = embedding_df.values

# Modeling

## Train Test Split

In [87]:
X = final_df.drop(['winner'], axis=1)
y = final_df['winner'] - 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

selected_features = ['p2_unique_units', 'unique_diff', 'unit_count_diff', 'p1_SUPPLY_UNITS',
       'p1_ARMY_UNITS', 'p1_AIR_UNITS', 'p2_WORKER_UNITS', 'p2_ARMY_UNITS',
       'p2_ARMY_AIR', 'p2_VESPENE_UNITS', 'p2_GROUND_UNITS',
       'p1_InvisibleTargetDummy', 'p1_BroodlingEscort', 'p1_Drone',
       'p1_Broodling', 'p1_Baneling', 'p1_CreepTumorBurrowed', 'p2_Zergling',
       'p2_BroodlingEscort', 'p2_Broodling']

rf_features = ['map', 'build', 'p1_unique_units', 'p2_unique_units', 'unique_diff', 'p1_unit_count', 'p2_unit_count', 'unit_count_diff', 'p1_SUPPLY_UNITS', 'p1_WORKER_UNITS', 'p1_ARMY_UNITS', 'p1_ARMY_AIR', 'p1_VESPENE_UNITS', 'p1_TECH_UNITS', 'p1_GROUND_UNITS', 'p1_AIR_UNITS', 'p2_SUPPLY_UNITS', 'p2_WORKER_UNITS', 'p2_ARMY_UNITS', 'p2_ARMY_AIR']
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# X_train = X_train[rf_features]
# X_test = X_test[rf_features]

In [88]:
numerical_data = X_train.select_dtypes(include='number')
numerical_features=numerical_data.columns.tolist()

print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features, '\n')

#Selecting categoricalfeatures
categorical_data=X_train.select_dtypes(include= 'object')
categorical_features=categorical_data.columns.tolist()

print(f'There are {len(categorical_features)} numerical features:', '\n')
print(categorical_features)

There are 20 numerical features: 

['p2_unique_units', 'unique_diff', 'unit_count_diff', 'p1_SUPPLY_UNITS', 'p1_ARMY_UNITS', 'p1_AIR_UNITS', 'p2_WORKER_UNITS', 'p2_ARMY_UNITS', 'p2_ARMY_AIR', 'p2_VESPENE_UNITS', 'p2_GROUND_UNITS', 'p1_InvisibleTargetDummy', 'p1_BroodlingEscort', 'p1_Drone', 'p1_Broodling', 'p1_Baneling', 'p1_CreepTumorBurrowed', 'p2_Zergling', 'p2_BroodlingEscort', 'p2_Broodling'] 

There are 0 numerical features: 

[]


## Data Preprocessing

### Standard Scaling

In [89]:
scaler = StandardScaler()
remove = [str(i) for i in range(384)]
scale_cols = list(set(numerical_features) - set(remove))
scaler.fit(X_train[scale_cols])
X_train[scale_cols] = scaler.transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

In [229]:
print(scale_cols)

['p2_Zergling', 'p2_BroodlingEscort', 'p1_GROUND_UNITS', 'p1_unique_units', 'p2_ARMY_AIR', 'p2_unit_count', 'p1_BroodlingEscort', 'p1_AIR_UNITS', 'p1_unit_count', 'p1_Broodling', 'p2_AIR_UNITS', 'p2_VESPENE_UNITS', 'p2_unique_units', 'p2_GROUND_UNITS', 'p1_SUPPLY_UNITS', 'p2_WORKER_UNITS', 'p1_ARMY_UNITS', 'p2_CreepTumorBurrowed', 'p1_Baneling', 'p1_Zergling', 'build', 'p2_TECH_UNITS', 'p2_Larva', 'p1_InvisibleTargetDummy', 'unique_diff', 'p2_ARMY_UNITS', 'p1_ARMY_AIR', 'p2_InvisibleTargetDummy', 'unit_count_diff', 'p2_Drone', 'p2_Baneling', 'p1_CreepTumorBurrowed', 'p1_WORKER_UNITS', 'p1_TECH_UNITS', 'p1_Larva', 'p1_VESPENE_UNITS', 'p1_Drone', 'p2_Broodling', 'p2_SUPPLY_UNITS']


### Robust Scaling

In [167]:
scaler = RobustScaler()
remove = [str(i) for i in range(384)]
scale_cols = list(set(numerical_features) - set(remove))
scaler.fit(X_train[scale_cols])
X_train[scale_cols] = scaler.transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

### Ohe Hot Encoding

In [16]:

ohe = OneHotEncoder(sparse_output=False)
map_encoded = ohe.fit_transform(X_train[['map']])
map_encoded_df = pd.DataFrame(map_encoded, columns=ohe.get_feature_names_out(['map']), index=X_train.index)
X_train = pd.concat([X_train.drop('map', axis=1), map_encoded_df], axis=1)

map_encoded = ohe.transform(X_test[['map']])
map_encoded_df = pd.DataFrame(map_encoded, columns=ohe.get_feature_names_out(['map']), index=X_test.index)
X_test = pd.concat([X_test.drop('map', axis=1), map_encoded_df], axis=1)

In [90]:
encoder = TargetEncoder(cols = ['map'])
encoder.fit(X_train['map'], y_train.values)

X_train['map'] = encoder.transform(X_train['map'])
X_test['map'] = encoder.transform(X_test['map'])

KeyError: 'map'

# MLFLOW Experiment

In [24]:


param_distributions = {
    'LogisticRegression': {
        'C': loguniform(1e-4, 1e2),
        'penalty': ['l2', 'none'],
        'solver': ['lbfgs', 'sag', 'saga'],
        'l1_ratio': uniform(0, 1)  # Only used with 'saga' solver and 'elasticnet' penalty
    },
    'RandomForestClassifier': {
        'n_estimators': randint(50, 200),
        'max_depth': randint(3, 20),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20)
    },
    'SVC': {
        'C': loguniform(1e-2, 10),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    },
    'XGBClassifier': {
        'n_estimators': randint(50, 200),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3),
        'subsample': uniform(0.5, 1)
    },
    'DecisionTreeClassifier' : {
        'criterion': ['gini', 'entropy'],
        'max_depth': randint(3, 10)
    },
    'MLPClassifier': {
        # 'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  # Example layer sizes
        'hidden_layer_sizes': [ (100,), (200,), (300, )],  # Example layer sizes
        'activation': ['relu', 'tanh', 'logistic'],
        'alpha': loguniform(1e-4, 1e-2),
        'learning_rate_init': loguniform(1e-4, 1e-2)
    },
    'AdaBoostClassifier': {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 1),
        # Optionally, set base_estimator if you want to experiment with other than the default
    },
    'KNeighborsClassifier': {
        'n_neighbors': randint(1, 10),
        'weights': ['uniform', 'distance'],
    }
}

In [91]:

experiment_name = "Feature Selection - 20 Cols"


classfication_models = [XGBClassifier, RandomForestClassifier, LogisticRegression]
# classfication_models = [SVC, MLPClassifier]
# LogisticRegression, RandomForestClassifier, SVC, XGBClassifier,MLPClassifier, XGBClassifier

mlflow.set_tracking_uri('http://localhost:5555')
mlflow.set_experiment(experiment_name=experiment_name)


for model_class in classfication_models:
    model_name = model_class.__name__
    print(model_name)
    with mlflow.start_run(run_name = model_name):
        model = model_class()
        param_dict = param_distributions[model_name]
        
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dict, n_iter=10, cv=5, verbose=1, random_state=42, n_jobs=-1)
        
        random_search.fit(X_train, y_train)
        y_pred = random_search.predict(X_test)
        
        ml_metrics = get_metrics(y_test, y_pred)
        mlflow.log_metrics(ml_metrics)
        for param_name, param_value in random_search.best_params_.items():
            mlflow.log_param(f'{param_name}', param_value)
    

XGBClassifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomForestClassifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits
LogisticRegression
Fitting 5 folds for each of 10 candidates, totalling 50 fits


## Feature Importance

In [25]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


In [52]:
# step forward feature selection
sfs1 = SFS(RandomForestClassifier(), 
           k_features=20, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train), y_train)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   23.8s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   23.8s

[2024-04-17 03:31:25] Features: 1/20 -- score: 0.5329168387019626
[2024-04-17 03:31:53] Features: 2/20 -- score: 0.546434112549815
[2024-04-17 03:32:27] Features: 3/20 -- score: 0.5539399279068701
[2024-04-17 03:33:04] Features: 4/20 -- score: 0.5669574533210896
[2024-04-17 03:33:39] Features: 5/20 -- score: 0.5712139637759472
[2024-04-17 03:34:12] Features: 6/20 -- score: 0.5752190958802529
[2024-04-17 03:34:47] Features: 7/20 -- score: 0.5747159631457152
[2024-04-17 03:35:20] Features: 8/20 -- score: 0.5802224087348055
[2024-04-17 03:35:54] Features: 9/20 -- score: 0.5909861777630373
[2024-04-17 03:36:28] Features: 10/20 -- score: 0.5912384961971738
[2024-04-17 03:37:09] Features: 11/20 -- score: 0.6095124049669504
[2024-04-17 03:37:47] Features: 12/20 -- score: 0.6332910671753647
[2024-04-17 03:38:23] Features: 13/20 -- score: 0.633792319742733
[2024-04-17 03:38

In [53]:
X_train.columns[list(sfs1.k_feature_idx_)]

Index(['p2_unique_units', 'unique_diff', 'unit_count_diff', 'p1_SUPPLY_UNITS',
       'p1_ARMY_UNITS', 'p1_AIR_UNITS', 'p2_WORKER_UNITS', 'p2_ARMY_UNITS',
       'p2_ARMY_AIR', 'p2_VESPENE_UNITS', 'p2_GROUND_UNITS',
       'p1_InvisibleTargetDummy', 'p1_BroodlingEscort', 'p1_Drone',
       'p1_Broodling', 'p1_Baneling', 'p1_CreepTumorBurrowed', 'p2_Zergling',
       'p2_BroodlingEscort', 'p2_Broodling'],
      dtype='object')

In [134]:
model = MLPClassifier(activation='relu', alpha=0.00040596, hidden_layer_sizes=(100,), learning_rate_init=0.0001, )

model.fit(X_train, y_train)

Index(['unique_diff', 'p2_unit_count', 'unit_count_diff', 'p1_SUPPLY_UNITS',
       'p2_SUPPLY_UNITS', 'p2_ARMY_UNITS', 'p2_GROUND_UNITS', 'p1_Broodling',
       'p2_Broodling', 'map_Beckett Industries LE'],
      dtype='object')

In [67]:
X_train

Unnamed: 0,map,build,p1_unique_units,p2_unique_units,unique_diff,p1_unit_count,p2_unit_count,unit_count_diff,p1_SUPPLY_UNITS,p1_WORKER_UNITS,...,p1_Baneling,p1_CreepTumorBurrowed,p2_InvisibleTargetDummy,p2_Larva,p2_Zergling,p2_BroodlingEscort,p2_Drone,p2_Broodling,p2_Baneling,p2_CreepTumorBurrowed
1315,0.498846,0.057685,-0.767652,-0.925659,0.854285,-0.460748,-0.341108,-0.077716,-0.931487,-0.959637,...,-0.273518,-0.214145,-0.099795,-0.198968,-0.268146,-0.043843,0.279476,-0.056407,-0.217372,-0.238895
180,0.478271,1.324169,0.744789,0.610186,0.350838,-0.161110,-0.001926,-0.119905,0.289532,-0.323275,...,-0.273518,-0.092477,-0.099795,0.585971,0.314864,-0.043843,0.537856,-0.056407,-0.217372,-0.182958
432,0.480916,0.057685,0.942064,0.610186,1.106009,-0.363613,-0.198212,-0.117423,-0.367940,-0.868728,...,-0.273518,-0.214145,-0.099795,-0.331368,-0.268146,-0.043843,-0.573175,-0.056407,-0.217372,-0.238895
1187,0.559210,1.324169,0.547514,0.917355,-1.662951,-0.421235,-0.223337,-0.140999,-0.086166,-0.989940,...,-0.273518,-0.214145,-0.099795,-0.331368,-0.268146,-0.043843,-0.573175,-0.056407,-0.217372,-0.238895
2341,0.587143,-1.723010,-0.964927,-0.987093,0.350838,-0.259892,-0.353671,0.083593,0.195607,-1.020243,...,-0.273518,-0.214145,-0.099795,-0.331368,-0.268146,-0.043843,-0.573175,-0.056407,-0.217372,-0.238895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,0.379601,1.324169,1.533889,1.593126,-0.656057,-0.024462,1.252734,-1.008347,1.040928,0.434300,...,-0.273518,-0.214145,-0.099795,2.988073,2.440033,-0.043843,2.992459,0.225997,2.449339,2.669788
466,0.485507,0.057685,1.468130,1.101656,1.106009,0.401945,0.439325,-0.044213,1.604475,0.585814,...,-0.273518,-0.214145,-0.099795,-0.331368,-0.268146,-0.043843,-0.573175,-0.056407,-0.217372,-0.238895
3092,0.469029,-1.723010,-0.899169,-0.741357,-0.404333,-0.472273,-0.268875,-0.143481,-0.931487,-0.292972,...,-0.273518,-0.214145,-0.099795,-0.331368,-0.268146,-0.043843,-0.573175,-0.056407,-0.217372,-0.238895
3772,0.515337,1.137921,1.073580,0.794487,0.854285,-0.320807,0.467590,-0.611278,0.665230,-1.020243,...,-0.273518,-0.214145,-0.099795,1.597880,1.123557,-0.043843,1.261318,-0.056407,-0.217372,1.830745


In [63]:
model = RandomForestClassifier(max_depth=11, min_samples_split=19, min_samples_leaf=7, n_estimators=181, random_state=42)

model.fit(X_train, y_train)



In [65]:
model.feature_importances_

array([0.04672883, 0.01669553, 0.0334572 , 0.02528194, 0.05246091,
       0.046811  , 0.05804515, 0.08374101, 0.03279335, 0.04308015,
       0.04431207, 0.03132861, 0.02030932, 0.0162382 , 0.01993229,
       0.01201953, 0.03474457, 0.06166852, 0.05677463, 0.04093138,
       0.01578909, 0.00839006, 0.01855077, 0.0077735 , 0.00022648,
       0.01448256, 0.01303677, 0.00135208, 0.01333686, 0.02149395,
       0.0084417 , 0.00673144, 0.00090651, 0.01897409, 0.01507204,
       0.00150393, 0.02015397, 0.0178343 , 0.00841552, 0.01018015])

In [66]:
model.feature_names_in_

array(['map', 'build', 'p1_unique_units', 'p2_unique_units',
       'unique_diff', 'p1_unit_count', 'p2_unit_count', 'unit_count_diff',
       'p1_SUPPLY_UNITS', 'p1_WORKER_UNITS', 'p1_ARMY_UNITS',
       'p1_ARMY_AIR', 'p1_VESPENE_UNITS', 'p1_TECH_UNITS',
       'p1_GROUND_UNITS', 'p1_AIR_UNITS', 'p2_SUPPLY_UNITS',
       'p2_WORKER_UNITS', 'p2_ARMY_UNITS', 'p2_ARMY_AIR',
       'p2_VESPENE_UNITS', 'p2_TECH_UNITS', 'p2_GROUND_UNITS',
       'p2_AIR_UNITS', 'p1_InvisibleTargetDummy', 'p1_Larva',
       'p1_Zergling', 'p1_BroodlingEscort', 'p1_Drone', 'p1_Broodling',
       'p1_Baneling', 'p1_CreepTumorBurrowed', 'p2_InvisibleTargetDummy',
       'p2_Larva', 'p2_Zergling', 'p2_BroodlingEscort', 'p2_Drone',
       'p2_Broodling', 'p2_Baneling', 'p2_CreepTumorBurrowed'],
      dtype=object)

In [71]:
y_pred = model.predict(X_test)

In [72]:
get_metrics(y_pred, y_test.values)

{'precision 1': 0.77,
 'precision 2': 0.67,
 'recall 1': 0.68,
 'recall 2': 0.76,
 'f1-score 1': 0.73,
 'f1-score 2': 0.71,
 'accuracy': 0.72}

In [239]:



model_pkl_file = "models/model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)

model_pkl_file = "models/encoder.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(ohe, file)
    
model_pkl_file = "models/scaler.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(scaler, file)



In [16]:
def get_embedding(text, embeding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')):
    return embeding_model.encode(text)

# embeding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

embeding_model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
# df['p1_embedding'] = df['player_1_units'].apply(lambda x: get_embedding(','.join(x), embeding_model=embeding_model))
# df['p2_embedding'] = df['player_2_units'].apply(lambda x: get_embedding(','.join(x), embeding_model=embeding_model))


In [20]:

# df['p1_embedding'] = df['player_1_units'].apply(lambda x: get_embedding(' '.join(x), embeding_model=embeding_model))
df['p2_embedding'] = df['player_2_units'].apply(lambda x: get_embedding(','.join(x), embeding_model=embeding_model))

## DNN Experiment

In [227]:
import tensorflow as tf 
from tensorflow import keras
from sklearn import preprocessing, model_selection
import numpy as np 
import pandas as pd 

In [228]:
callback = keras.callbacks.EarlyStopping(
    monitor="loss",
    min_delta=0,
    patience=0,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
    start_from_epoch=0,
)

In [236]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(600,activation = tf.nn.relu))
model.add(tf.keras.layers.Dense(800,activation = tf.nn.tanh))
model.add(tf.keras.layers.Dense(300,activation = tf.nn.tanh))
model.add(tf.keras.layers.Dense(150,activation = tf.nn.tanh))
model.add(tf.keras.layers.Dense(100,activation = tf.nn.relu))
model.add(tf.keras.layers.Dense(2,activation = tf.nn.softmax))


model.compile(optimizer = tf.optimizers.RMSProp(),
       loss = 'sparse_categorical_crossentropy',
       metrics=['accuracy'])

model.fit(X_train,y_train, epochs = 100, batch_size = 5, shuffle=True)

AttributeError: module 'keras._tf_keras.keras.optimizers' has no attribute 'RMSProp'

In [None]:
tf.keras.

In [234]:
val_loss,val_acc = model.evaluate(X_test,y_test)
print("Loss % = {} , Accuracy % = {} ".format(val_loss*100,val_acc*100))

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5436 - loss: 2.1343  
Loss % = 196.27939462661743 , Accuracy % = 56.75675868988037 


## PREDICT FROM API

In [40]:
import requests
import json


PREDICT_URL = 'http://localhost:5000/predict'

In [41]:
df.columns

Index(['map', 'player_1_units', 'player_1', 'player_2_units', 'player_2',
       'winner', 'build', 'path', 'p1_embedding', 'p2_embedding',
       'p1_unique_units', 'p2_unique_units', 'unique_diff', 'p1_unit_count',
       'p2_unit_count', 'unit_count_diff', 'p1_SUPPLY_UNITS',
       'p1_WORKER_UNITS', 'p1_ARMY_UNITS', 'p1_ARMY_AIR', 'p1_VESPENE_UNITS',
       'p1_TECH_UNITS', 'p1_GROUND_UNITS', 'p1_AIR_UNITS', 'p2_SUPPLY_UNITS',
       'p2_WORKER_UNITS', 'p2_ARMY_UNITS', 'p2_ARMY_AIR', 'p2_VESPENE_UNITS',
       'p2_TECH_UNITS', 'p2_GROUND_UNITS', 'p2_AIR_UNITS',
       'p1_InvisibleTargetDummy', 'p1_Larva', 'p1_Zergling',
       'p1_BroodlingEscort', 'p1_Drone', 'p1_Broodling', 'p1_Baneling',
       'p1_CreepTumorBurrowed', 'p2_InvisibleTargetDummy', 'p2_Larva',
       'p2_Zergling', 'p2_BroodlingEscort', 'p2_Drone', 'p2_Broodling',
       'p2_Baneling', 'p2_CreepTumorBurrowed'],
      dtype='object')

In [42]:
import requests
response = requests.post(PREDICT_URL, json=df[df.index.isin(X_test.index)].iloc[:][['map', 'player_1_units', 'player_1', 'player_2_units', 'player_2',
       'build', 'path']].to_dict('records'))

In [47]:
y_pred = json.loads(response.content)

In [49]:
get_metrics(y_pred, y_test.values)

{'precision 1': 0.52,
 'precision 2': 0.47,
 'recall 1': 0.47,
 'recall 2': 0.51,
 'f1-score 1': 0.49,
 'f1-score 2': 0.49,
 'accuracy': 0.49}

In [None]:
''