In [56]:
import pandas as pd
import json
import numpy as np
import os
from pathlib import Path

In [13]:
import json
import requests

def train_model_with_json(json_file_path, url):
    """
    Sends a JSON file to the specified training endpoint URL.

    :param json_file_path: Path to the JSON file to be sent for training.
    :param url: URL of the training endpoint.
    """
    # Load JSON data from the file
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    # Send the data to the training endpoint
    response = requests.post(url, json=data)
    
    if response.status_code == 200:
        print("Training started successfully.")
        print(response.json())
    else:
        print(f"Failed to start training. Status code: {response.status_code}")
        print(response.text)

# Replace 'localhost:5000' with your Flask app's address and port
TRAIN_URL = 'http://localhost:5000/train'

# Path to your JSON file
JSON_FILE_PATH = 'data-old/replay_summaries.json'

train_model_with_json(JSON_FILE_PATH, TRAIN_URL)


Training started successfully.
{'message': 'Model trained and saved successfully.'}


In [16]:
TRAIN_URL = 'http://localhost:5000/update_model'
response = requests.get(TRAIN_URL)

In [63]:
df = pd.read_json('data/replay_summaries.json')

In [211]:
import sc2reader
replay = sc2reader.load_replay('replay_data/8fde74f1c54421fa2b059c9740142c9c.SC2Replay', load_map=True)
(list(replay.active_units.values()))
for tracker in replay.tracker_events:
    break

set([str(value).split(' [')[0] for value in list(replay.active_units.values())])
# replay = sc2reader.load_replay('replay_data/8fde74f1c54421fa2b059c9740142c9c.SC2Replay', load_level=3)

In [64]:
import s2protocol
import mpyq
archive = mpyq.MPQArchive('replay_data/8fde74f1c54421fa2b059c9740142c9c.SC2Replay')
contents = archive.header['user_data_header']['content']
from s2protocol import versions
header = versions.latest().decode_replay_header(contents)
header['m_replayCompatibilityHash']['m_data'].decode()

In [53]:
all_unit_1 = [] 
for unit in df['player_1_units']:
    all_unit_1.extend(unit)

In [58]:
pd.DataFrame(all_unit_1).value_counts()

0                 
Larva                 191419
Zergling              125132
Drone                  73384
Probe                  61363
SCV                    56124
                       ...  
Mothership                36
HellionTank               33
ChangelingZergling        19
Archon                    17
TL_GlobalCaster            2
Name: count, Length: 71, dtype: int64

In [2]:
import pandas as pd
from ast import literal_eval
from collections import Counter
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import classification_report


In [6]:
def load_and_preprocess_data(json_data):
    data = pd.read_json(json_data)
    data.dropna(inplace=True)
    data['winner'] = data['winner'].astype(int)
    data['player_1'] = data['player_1'].str.replace('&lt;', '<').str.replace('&gt;', '>').str.replace('<sp/>', '')
    data['player_2'] = data['player_2'].str.replace('&lt;', '<').str.replace('&gt;', '>').str.replace('<sp/>', '')
    return data

def get_column_types(data):
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    object_cols = data.select_dtypes(include='object').columns.tolist()
    return numeric_cols, object_cols

def parse_and_count_units(unit_str):
    unit_counts = Counter(unit_str)
    return unit_counts



def feature_engineering(df_cleaned):

    # Apply the function to both player_1_units and player_2_units columns
    player_1_units_counts = df_cleaned['player_1_units'].apply(parse_and_count_units)
    player_2_units_counts = df_cleaned['player_2_units'].apply(parse_and_count_units)

    # Combine all unit counts to identify all unique units in the dataset
    all_unit_counts = pd.concat([player_1_units_counts, player_2_units_counts])
    all_unique_units = set(unit for counts in all_unit_counts for unit in counts)

    player_1_cols = [f'player_1_{unit}' for unit in all_unique_units]
    player_2_cols = [f'player_2_{unit}' for unit in all_unique_units]
    new_cols_df = pd.DataFrame(0, index=df_cleaned.index, columns=player_1_cols + player_2_cols)
    df_cleaned = pd.concat([df_cleaned, new_cols_df], axis=1)

    # Populate the unit count columns for each player
    for index, row in df_cleaned.iterrows():
        for unit, count in player_1_units_counts.loc[index].items():
            df_cleaned.at[index, f'player_1_{unit}'] = count
        for unit, count in player_2_units_counts.loc[index].items():
            df_cleaned.at[index, f'player_2_{unit}'] = count

    # One-hot encode the 'map' variable
    ohe = OneHotEncoder(sparse_output=False)
    map_encoded = ohe.fit_transform(df_cleaned[['map']])
    map_encoded_df = pd.DataFrame(map_encoded, columns=ohe.get_feature_names_out(['map']), index=df_cleaned.index)
    # Drop the original 'map' column and concatenate the one-hot encoded map columns
    df_cleaned = pd.concat([df_cleaned.drop('map', axis=1), map_encoded_df], axis=1)

    # Scaling numerical features (excluding 'winner', 'build', and any string columns)
    numerical_cols = [col for col in df_cleaned.columns if df_cleaned[col].dtype in ['int64', 'float64'] and col not in ['winner', 'build']]
    scaler = StandardScaler()
    df_cleaned[numerical_cols] = scaler.fit_transform(df_cleaned[numerical_cols])

    # Feature hashing for 'player_1' and 'player_2' columns
    combined_players_list = df_cleaned[['player_1', 'player_2']].values.tolist()
    fh = FeatureHasher(n_features=10, input_type='string')
    hashed_features = fh.transform(combined_players_list).toarray()
    hashed_features_df = pd.DataFrame(hashed_features, columns=[f'player_hash_{i}' for i in range(10)], index=df_cleaned.index)
    df_cleaned_final = pd.concat([df_cleaned.drop(['player_1', 'player_2'], axis=1), hashed_features_df], axis=1)
    
    
    unit_columns = [col for col in df_cleaned_final.columns if 'player_1_' in col and col.replace('player_1_', 'player_2_') in df_cleaned_final.columns and col != 'player_1_units']
    for unit_col in unit_columns:
        
        player_2_col = unit_col.replace('player_1_', 'player_2_')
        differential_col = unit_col.replace('player_1_', 'diff_')
        df_cleaned_final[differential_col] = df_cleaned_final[unit_col] - df_cleaned_final[player_2_col]
    
    return df_cleaned_final


def train_model(X_train, y_train):
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    return lr

def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    print("Evaluation Report:\n", classification_report(y_test, predictions))


In [7]:
df = load_and_preprocess_data('data-old/replay_summaries.json')
df = feature_engineering(df)

In [1]:
import pickle


In [41]:
def get_column_types(data):
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    object_cols = data.select_dtypes(include='object').columns.tolist()
    return numeric_cols, object_cols

def parse_and_count_units(unit_str):
    unit_counts = Counter(unit_str)
    return unit_counts

class Sc2Classifier(object):
    def __init__(self):
        self.model = LogisticRegression(max_iter=1000, random_state=42)
        self.ohe = OneHotEncoder(sparse_output=False)
        self.scaler = StandardScaler()
        self.fh = FeatureHasher(n_features=10, input_type='string')
        self.model_dir = 'models'
        
    def encoder_fit(self, X):
        self.ohe.fit(X)
    
    def encoder_transform(self, X):
        return self.ohe.transform(X)
        
        
    def scaler_fit(self, X):
        self.scaler.fit(X)
    
    def scaler_transform(self, X):
        self.scaler.transform(X)
        
    def hasher_fit(self, X):
        self.fh.fit(X)
    
    def hasher_transform(self, X):
        self.fh.transform(X)
        
        
    def train(self, X, y):
        self.model.fit(X, y)
    
    def predict_proba(self, X):
        """Returns probability for the binary class '1' in a numpy array
        """
        y_proba = self.model.predict_proba(X)
        return y_proba[:, 1]
    
    
    def predict(self, X):
        """Returns the predicted class in an array
        """
        y_pred = self.model.predict(X)
        return y_pred
    
    def pickle_preprocessors(self, path='./lib/models/CountVectorizer.pkl'):
        """save the trained vectorizer
        """
        scaler_path = f'{self.model_dir}/StandardScaler.pkl'
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scaler, f)
            print("Scaler saved at {}".format(scaler_path))
        
        ohe_path = f'{self.model_dir}/OneHotEncoder.pkl'
        with open(ohe_path, 'wb') as f:
            pickle.dump(self.ohe, f)
            print("Encoder saved at {}".format(ohe_path))
        
        hasher_path = f'{self.model_dir}/FeatureHasher.pkl'
        with open(hasher_path, 'wb') as f:
            pickle.dump(self.fh, f)
            print("Hasher saved at {}".format(hasher_path))

    def pickle_clf(self):
        """
        saves the trained classifer models
        """
        
        model_path = f'{self.model_dir}/model.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(self.model, f)
            print("Classifer saved at {}".format(model_path))

    def classification_report(self, X, y, target_names):
        """get a classification report for metrics
        """
        classification_report(X, y, target_names = target_names)

In [182]:
def process_train_data(data: pd.DataFrame):
    data = data.copy()
    data = data.dropna()
    data['winner'] = data['winner'].astype(int)
    
    return data

model = Sc2Classifier()
    
with open('data-old/replay_summaries.json', 'r') as f:
    data = pd.read_json(f)
    
data = process_train_data(data)

In [185]:
pred_df = pd.read_json(StringIO(json.dumps(data.drop('winner', axis=1).loc[[0]].to_dict('records'))))
# df_cleaned = data.copy()
df_cleaned = pred_df

In [191]:
json.dumps(data.drop('winner', axis=1).loc[[0]].to_dict('records'))

'[{"path": "ASUS ROG Online 2020\\\\ASUS_ROG_Online_2020_replays\\\\1 - Group Stage\\\\Group A\\\\20-11-27 14_31_15 - [Ex0n]MaxPax vs Rogue - Romanticide LE.SC2Replay", "total_gameloops": 24725, "gameloop": 23381, "build": 82457, "map": "Romanticide LE", "player_1": "&lt;Ex0n&gt;<sp/>MaxPax", "player_2": "Rogue", "player_1_units": ["Nexus", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Adept", "Probe", "Probe", "Probe", "Probe", "VoidRay", "Adept", "Probe", "Probe", "Probe", "Probe", "Probe", "Oracle", "Probe", "Probe", "Probe", "Probe", "Probe", "Oracle", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "VoidRay", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Immortal", "Probe", "Immortal", "Chang

In [186]:
# df_cleaned = data.copy()
# Apply the function to both player_1_units and player_2_units columns
player_1_units_counts = df_cleaned['player_1_units'].apply(parse_and_count_units)
player_2_units_counts = df_cleaned['player_2_units'].apply(parse_and_count_units)

# Combine all unit counts to identify all unique units in the dataset
all_unit_counts = pd.concat([player_1_units_counts, player_2_units_counts])
all_unique_units = set(unit for counts in all_unit_counts for unit in counts)

player_1_cols = [f'player_1_{unit}' for unit in all_unique_units]
player_2_cols = [f'player_2_{unit}' for unit in all_unique_units]
new_cols_df = pd.DataFrame(0, index=df_cleaned.index, columns=player_1_cols + player_2_cols)
df_cleaned = pd.concat([df_cleaned, new_cols_df], axis=1)

# Populate the unit count columns for each player
for index, row in df_cleaned.iterrows():
    for unit, count in player_1_units_counts.loc[index].items():
        df_cleaned.at[index, f'player_1_{unit}'] = count
    for unit, count in player_2_units_counts.loc[index].items():
        df_cleaned.at[index, f'player_2_{unit}'] = count

In [187]:
df_cleaned

Unnamed: 0,path,total_gameloops,gameloop,build,map,player_1,player_2,player_1_units,player_2_units,player_1_Colossus,...,player_2_Immortal,player_2_Disruptor,player_2_WarpPrism,player_2_Overlord,player_2_Zergling,player_2_Nexus,player_2_Hydralisk,player_2_Larva,player_2_Viper,player_2_ChangelingZealot
0,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,24725,23381,82457,Romanticide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ...",4,...,0,0,0,23,6,0,26,42,3,0


In [189]:
test_cleaned.head(1)

Unnamed: 0,path,total_gameloops,gameloop,build,winner,map,player_1,player_2,player_1_units,player_2_units,...,player_2_Nexus,player_2_Stalker,player_2_Larva,player_2_Hydralisk,player_2_Phoenix,player_2_Baneling,player_2_Viper,player_2_Raven,player_2_Marine,player_2_ChangelingZealot
0,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,24725,23381,82457,2,Romanticide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ...",...,0,0,42,26,0,0,3,0,0,0


In [149]:
with open('all_cols.txt', 'r') as f:
    all_cols = json.loads(f.read())
    
missing_cols = [col for col in all_cols if col not in df_cleaned.columns]
missing_vals = [0] * 159
missing_df = pd.DataFrame([missing_vals], columns=missing_cols)
df_cleaned = pd.concat([df_cleaned, missing_df], axis=1)

In [118]:
missing_cols = [col for col in numerical_cols if col not in df_cleaned.columns]

Unnamed: 0,path,total_gameloops,gameloop,build,map,player_1,player_2,player_1_units,player_2_units,player_1_Colossus,...,map_시뮬레이크럼 - 래더,map_에버 드림 - 래더,map_옥사이드 - 래더,map_월드 오브 슬리퍼스 - 래더,map_이터널 엠파이어 - 래더,map_이페머론 - 래더,map_자가나타 - 래더,map_젠 - 래더,map_트라이튼 - 래더,map_필러스 오브 골드 - 래더
0,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,24725,23381,82457,Romanticide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ...",4,...,0,0,0,0,0,0,0,0,0,0


In [135]:
with open('all_cols.txt', 'w') as f:
    f.writelines(json.dumps(numerical_cols))

In [97]:
df_cleaned

Unnamed: 0,path,total_gameloops,gameloop,build,map,player_1,player_2,player_1_units,player_2_units,player_1_Colossus,...,map_시뮬레이크럼 - 래더,map_에버 드림 - 래더,map_옥사이드 - 래더,map_월드 오브 슬리퍼스 - 래더,map_이터널 엠파이어 - 래더,map_이페머론 - 래더,map_자가나타 - 래더,map_젠 - 래더,map_트라이튼 - 래더,map_필러스 오브 골드 - 래더
0,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,24725,23381,82457,Romanticide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ...",4,...,0,0,0,0,0,0,0,0,0,0


In [175]:
# map_encoded = model.ohe.fit(df_cleaned[['map']])
map_encoded = model.ohe.fit_transform(df_cleaned[['map']])
map_encoded_df = pd.DataFrame(map_encoded, columns=model.ohe.get_feature_names_out(['map']), index=df_cleaned.index)
df_cleaned = pd.concat([df_cleaned.drop('map', axis=1), map_encoded_df], axis=1)

numerical_cols = [col for col in df_cleaned.columns if df_cleaned[col].dtype in ['int64', 'float64'] and col not in ['winner', 'build']]
# num_scaled = model.scaler.fit(df_cleaned[numerical_cols])
num_scaled = model.scaler.fit_transform(df_cleaned[numerical_cols])
df_cleaned[numerical_cols] = num_scaled

combined_players_list = df_cleaned[['player_1', 'player_2']].values.tolist()
hashed_features = model.fh.fit_transform(combined_players_list).toarray()
hashed_features_df = pd.DataFrame(hashed_features, columns=[f'player_hash_{i}' for i in range(10)], index=df_cleaned.index)
df_cleaned_final = pd.concat([df_cleaned.drop(['player_1', 'player_2'], axis=1), hashed_features_df], axis=1)

In [178]:
map_encoded = model.ohe.transform(df_cleaned[['map']])
map_encoded_df = pd.DataFrame(map_encoded, columns=model.ohe.get_feature_names_out(['map']), index=df_cleaned.index)
df_cleaned = pd.concat([df_cleaned.drop('map', axis=1), map_encoded_df], axis=1)

In [180]:
numerical_cols = [col for col in df_cleaned.columns if str(df_cleaned[col].dtypes) in ['int64', 'float64'] and col not in ['winner', 'build']]
num_scaled = model.scaler.transform(df_cleaned[numerical_cols].values)
df_cleaned[numerical_cols] = num_scaled



ValueError: X has 103 features, but StandardScaler is expecting 201 features as input.

In [181]:

# combined_players_list = df_cleaned[['player_1', 'player_2']].values.tolist()
# hashed_features = model.fh.transform(combined_players_list).toarray()
# hashed_features_df = pd.DataFrame(hashed_features, columns=[f'player_hash_{i}' for i in range(10)], index=df_cleaned.index)
# df_cleaned_final = pd.concat([df_cleaned.drop(['player_1', 'player_2'], axis=1), hashed_features_df], axis=1)

In [68]:
json.dumps(data.loc[[0]].to_dict('records'))

'[{"path": "ASUS ROG Online 2020\\\\ASUS_ROG_Online_2020_replays\\\\1 - Group Stage\\\\Group A\\\\20-11-27 14_31_15 - [Ex0n]MaxPax vs Rogue - Romanticide LE.SC2Replay", "total_gameloops": 24725, "gameloop": 23381, "build": 82457, "winner": 2, "map": "Romanticide LE", "player_1": "&lt;Ex0n&gt;<sp/>MaxPax", "player_2": "Rogue", "player_1_units": ["Nexus", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Adept", "Probe", "Probe", "Probe", "Probe", "VoidRay", "Adept", "Probe", "Probe", "Probe", "Probe", "Probe", "Oracle", "Probe", "Probe", "Probe", "Probe", "Probe", "Oracle", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "VoidRay", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Immortal", "Probe", "Immo

In [74]:
from io import StringIO


In [70]:
pred_row = [{"path": "ASUS ROG Online 2020\\\\ASUS_ROG_Online_2020_replays\\\\1 - Group Stage\\\\Group A\\\\20-11-27 14_31_15 - [Ex0n]MaxPax vs Rogue - Romanticide LE.SC2Replay", "total_gameloops": 24725, "gameloop": 23381, "build": 82457, "winner": 2, "map": "Romanticide LE", "player_1": "&lt;Ex0n&gt;<sp/>MaxPax", "player_2": "Rogue", "player_1_units": ["Nexus", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Adept", "Probe", "Probe", "Probe", "Probe", "VoidRay", "Adept", "Probe", "Probe", "Probe", "Probe", "Probe", "Oracle", "Probe", "Probe", "Probe", "Probe", "Probe", "Oracle", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "VoidRay", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Probe", "Immortal", "Probe", "Immortal", "ChangelingZealot", "ChangelingZealot", "Immortal", "WarpPrism", "ChangelingZealot", "Observer", "Probe", "Probe", "Probe", "Colossus", "Probe", "ChangelingZealot", "Probe", "Probe", "Probe", "Probe", "Colossus", "Probe", "Probe", "Colossus", "Probe", "Probe", "Observer", "Probe", "Probe", "Probe", "Disruptor", "ChangelingZealot", "ChangelingZealot", "ChangelingZealot", "Colossus", "Probe", "Probe", "ChangelingZealot", "Disruptor", "Disruptor", "Disruptor", "Disruptor", "ChangelingZealot", "ChangelingZealot", "Disruptor", "ChangelingZealot", "Disruptor", "Disruptor", "Probe", "Disruptor", "Disruptor", "Oracle", "Disruptor", "VoidRay", "Disruptor", "ChangelingZealot", "ChangelingZealot", "ChangelingZealot", "Observer", "Disruptor"], "player_2_units": ["Hatchery", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Overlord", "Drone", "Drone", "Drone", "Overlord", "Drone", "Queen", "Overlord", "Overlord", "Drone", "Drone", "Queen", "Overlord", "Overlord", "Drone", "Queen", "Queen", "Drone", "Drone", "Drone", "Drone", "Drone", "Queen", "Drone", "Drone", "Drone", "Drone", "Overlord", "Drone", "Drone", "Drone", "Overlord", "Drone", "Drone", "Overlord", "Drone", "Drone", "Drone", "Roach", "Overlord", "Overlord", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Overlord", "Drone", "Overlord", "Overlord", "Overlord", "Overlord", "Overlord", "Drone", "Overlord", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Overlord", "Overlord", "Overlord", "Drone", "Overlord", "Hydralisk", "Hydralisk", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Drone", "Overlord", "Larva", "Larva", "Larva", "Hydralisk", "Hydralisk", "Larva", "Larva", "Larva", "Larva", "Larva", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Drone", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Viper", "Viper", "Viper", "Larva", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Drone", "Drone", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Hydralisk", "Drone", "Drone", "Drone", "Drone", "Larva", "Larva", "Larva", "Larva", "Drone", "Drone", "Drone", "Drone", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Larva", "Zergling", "Zergling", "Zergling", "Zergling", "Zergling", "Zergling", "Larva"]}]

In [77]:
pred_df = pd.read_json(StringIO(json.dumps(data.drop('winner', axis=1).loc[[0]].to_dict('records'))))

In [32]:
map_encoded_df

Unnamed: 0,map_2000 Atmospheres LE,map_2000大氣壓力 - 天梯版,map_Beckett Industries LE,map_Blackburn LE,map_Concord LE,map_Deathaura LE,map_Domaine des dormeurs EC,map_Efemeryda ER,map_Empire éternel EC,map_Ephemeron LE,...,map_시뮬레이크럼 - 래더,map_에버 드림 - 래더,map_옥사이드 - 래더,map_월드 오브 슬리퍼스 - 래더,map_이터널 엠파이어 - 래더,map_이페머론 - 래더,map_자가나타 - 래더,map_젠 - 래더,map_트라이튼 - 래더,map_필러스 오브 골드 - 래더
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,,,,,,,,,,,...,,,,,,,,,,
2520,,,,,,,,,,,...,,,,,,,,,,
2521,,,,,,,,,,,...,,,,,,,,,,
2522,,,,,,,,,,,...,,,,,,,,,,


In [30]:
model.ohe.get_feature_names_out(['map'])

array(['map_2000 Atmospheres LE', 'map_2000大氣壓力 - 天梯版',
       'map_Beckett Industries LE', 'map_Blackburn LE', 'map_Concord LE',
       'map_Deathaura LE', 'map_Domaine des dormeurs EC',
       'map_Efemeryda ER', 'map_Empire éternel EC', 'map_Ephemeron LE',
       'map_Eternal Empire LE', 'map_Ever Dream LE',
       'map_Ewiges Imperium LE', 'map_Golden Wall LE',
       'map_Goldene Säulen LE', 'map_Jagannatha', 'map_Jagannatha LE',
       'map_Lightshade LE', 'map_Nightshade LE', 'map_Nocny Mrok ER',
       'map_Oxide LE', 'map_Pillars of Gold LE',
       'map_Purity and Industry LE', 'map_Rhoskallian LE',
       'map_Romanticide LE', 'map_Simulacrum LE', 'map_Submarine LE',
       'map_Triton EC', 'map_Triton LE', 'map_Welt der Schläfer LE',
       'map_World of Sleepers LE', 'map_Zen LE', 'map_世界主宰-天梯版',
       'map_休眠者之境 - 天梯版', 'map_光影交错-天梯版', 'map_大气2000-天梯版',
       'map_札格納特 - 天梯版', 'map_毒茄樹叢 - 天梯版', 'map_永恆帝國 - 天梯版',
       'map_海神信使 - 天梯版', 'map_紫晶浪漫-天梯版', 'map_羅曼死 - 天梯版', 

In [25]:
ohe = OneHotEncoder(sparse_output=False)
map_encoded = ohe.fit_transform(df_cleaned[['map']])

In [26]:
ohe.get_feature_names_out(['map'])

array(['map_2000 Atmospheres LE', 'map_2000大氣壓力 - 天梯版',
       'map_Beckett Industries LE', 'map_Blackburn LE', 'map_Concord LE',
       'map_Deathaura LE', 'map_Domaine des dormeurs EC',
       'map_Efemeryda ER', 'map_Empire éternel EC', 'map_Ephemeron LE',
       'map_Eternal Empire LE', 'map_Ever Dream LE',
       'map_Ewiges Imperium LE', 'map_Golden Wall LE',
       'map_Goldene Säulen LE', 'map_Jagannatha', 'map_Jagannatha LE',
       'map_Lightshade LE', 'map_Nightshade LE', 'map_Nocny Mrok ER',
       'map_Oxide LE', 'map_Pillars of Gold LE',
       'map_Purity and Industry LE', 'map_Rhoskallian LE',
       'map_Romanticide LE', 'map_Simulacrum LE', 'map_Submarine LE',
       'map_Triton EC', 'map_Triton LE', 'map_Welt der Schläfer LE',
       'map_World of Sleepers LE', 'map_Zen LE', 'map_世界主宰-天梯版',
       'map_休眠者之境 - 天梯版', 'map_光影交错-天梯版', 'map_大气2000-天梯版',
       'map_札格納特 - 天梯版', 'map_毒茄樹叢 - 天梯版', 'map_永恆帝國 - 天梯版',
       'map_海神信使 - 天梯版', 'map_紫晶浪漫-天梯版', 'map_羅曼死 - 天梯版', 

In [33]:
# One-hot encode the 'map' variable
ohe = OneHotEncoder(sparse_output=False)
map_encoded = ohe.fit_transform(df_cleaned[['map']])
map_encoded_df = pd.DataFrame(map_encoded, columns=ohe.get_feature_names_out(['map']), index=df_cleaned.index)
# Drop the original 'map' column and concatenate the one-hot encoded map columns
# df_cleaned = pd.concat([df_cleaned.drop('map', axis=1), map_encoded_df], axis=1)

In [35]:
map_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [309]:
df['season'] = df['path'].str.split('\\').str[0]

In [None]:
json_data = pd.read_json('data-old/replay_summaries.json')


In [209]:
df['season'].value_counts()

season
DH SC2 Masters 2021 Summer                 791
IEM Katowice 2020                          438
IEM Katowice 2021                          261
DH SC2 Masters 2020 Winter                 190
StayAtHome Story Cup #3                    183
TeamLiquid StarLeague 5                    183
StayAtHome Story Cup #1                    172
TeamLiquid StarLeague 6                    121
ASUS ROG Online 2020                       107
Cheeseadelphia Winter Championship 2021     72
Name: count, dtype: int64

In [198]:
df['path'][0]

'ASUS ROG Online 2020\\ASUS_ROG_Online_2020_replays\\1 - Group Stage\\Group A\\20-11-27 14_31_15 - [Ex0n]MaxPax vs Rogue - Romanticide LE.SC2Replay'

In [338]:
X = df.drop(['path', 'winner', 'map', 'player_1_units', 'player_2_units'], axis=1)
y = df['winner'] - 1  # Adjusting target to 0-based

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [339]:

# Initialize the models
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the models
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)


In [340]:

# Predictions
lr_predictions = lr.predict(X_test)
rf_predictions = rf.predict(X_test)
xgb_predictions = xgb.predict(X_test)

# Evaluation
print("Logistic Regression:\n", classification_report(y_test, lr_predictions))
print("Random Forest:\n", classification_report(y_test, rf_predictions))
print("XGBoost:\n", classification_report(y_test, xgb_predictions))


Logistic Regression:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       240
           1       0.52      1.00      0.69       264

    accuracy                           0.52       504
   macro avg       0.26      0.50      0.34       504
weighted avg       0.27      0.52      0.36       504

Random Forest:
               precision    recall  f1-score   support

           0       0.67      0.63      0.65       240
           1       0.68      0.72      0.70       264

    accuracy                           0.68       504
   macro avg       0.68      0.68      0.68       504
weighted avg       0.68      0.68      0.68       504

XGBoost:
               precision    recall  f1-score   support

           0       0.66      0.64      0.65       240
           1       0.68      0.69      0.69       264

    accuracy                           0.67       504
   macro avg       0.67      0.67      0.67       504
weighted avg       0.67   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [93]:
from flask import Flask, request, jsonify
app = Flask(__name__)

# Assuming the model is trained and saved as 'model.pkl'
# with open('model.pkl', 'rb') as file:
#     model = pickle.load(file)

@app.route('/predict', methods=['POST'])
def predict():
    json_data = request.json
    data = load_and_preprocess_data(json_data)  # Use the preprocessing function
    # Perform feature engineering, scaling, etc.
    # Assume transformed_data is what you get after preprocessing and feature engineering
    prediction = model.predict(transformed_data)
    return jsonify({'prediction': prediction.tolist()})

if __name__ == '__main__':
    app.run(debug=True)


Unnamed: 0,path,map,player_1,player_2,player_1_units,player_2_units
count,2524,2524,2524,2524,2524,2524
unique,2524,62,234,228,2514,2508
top,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,Romanticide LE,Reynor,ShoWTimE,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
freq,1,286,74,78,2,3


In [94]:
data.describe().round()

Unnamed: 0,total_gameloops,gameloop,build,winner
count,2524.0,2524.0,2524.0,2518.0
mean,16346.0,15003.0,82104.0,2.0
std,7641.0,7639.0,2684.0,1.0
min,12.0,0.0,78285.0,1.0
25%,11585.0,10241.0,78285.0,1.0
50%,14974.0,13630.0,82893.0,2.0
75%,19368.0,18024.0,84643.0,2.0
max,92369.0,91025.0,84643.0,2.0


In [185]:
json_data

Unnamed: 0,path,total_gameloops,gameloop,build,winner,map,player_1,player_2,player_1_units,player_2_units
0,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,24725,23381,82457,2.0,Romanticide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ..."
1,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,18078,16734,82457,2.0,Oxide LE,&lt;Ex0n&gt;<sp/>MaxPax,Rogue,"[Nexus, Probe, Probe, Probe, Probe, Probe, Pro...","[Hatchery, Drone, Drone, Drone, Drone, Drone, ..."
2,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,12568,11224,82457,1.0,Deathaura LE,Rogue,&lt;Ex0n&gt;<sp/>MaxPax,"[Hatchery, Larva, Larva, Larva, Drone, Drone, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
3,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,10201,8857,82457,2.0,Pillars of Gold LE,&lt;인투더&gt;<sp/>SpeCial,PartinG,"[CommandCenter, SCV, SCV, SCV, SCV, SCV, SCV, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
4,ASUS ROG Online 2020\ASUS_ROG_Online_2020_repl...,19992,18648,82457,1.0,Romanticide LE,&lt;인투더&gt;<sp/>SpeCial,PartinG,"[CommandCenter, SCV, SCV, SCV, SCV, SCV, SCV, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
...,...,...,...,...,...,...,...,...,...,...
2519,TeamLiquid StarLeague 6\TSL6 Replay Pack\Upper...,21000,19656,82893,2.0,Pillars of Gold LE,&lt;xkom&gt;<sp/>AgoElazer,ByuN,"[Hatchery, Larva, Larva, Larva, Drone, Drone, ...","[CommandCenter, SCV, SCV, SCV, SCV, SCV, SCV, ..."
2520,TeamLiquid StarLeague 6\TSL6 Replay Pack\Upper...,51954,50610,82893,2.0,Romanticide LE,&lt;ENCE&gt;<sp/>Serral,ShoWTimE,"[Hatchery, Larva, Larva, Larva, Drone, Drone, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
2521,TeamLiquid StarLeague 6\TSL6 Replay Pack\Upper...,15237,13893,82893,1.0,Jagannatha LE,&lt;ENCE&gt;<sp/>Serral,ShoWTimE,"[Hatchery, Larva, Larva, Larva, Drone, Drone, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."
2522,TeamLiquid StarLeague 6\TSL6 Replay Pack\Upper...,13558,12214,82893,1.0,Oxide LE,&lt;ENCE&gt;<sp/>Serral,ShoWTimE,"[Hatchery, Larva, Larva, Larva, Drone, Drone, ...","[Nexus, Probe, Probe, Probe, Probe, Probe, Pro..."


In [101]:
data = pd.read_json(json_data)
data.dropna(inplace=True)

In [102]:
# prompt: python code to get numeric and object columns
data = pd.read_json(json_data)
data.dropna(inplace=True)
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
object_cols = data.select_dtypes(include='object').columns.tolist()

print("Numeric columns:", numeric_cols)
print("Object columns:", object_cols)


Numeric columns: ['total_gameloops', 'gameloop', 'build', 'winner']
Object columns: ['path', 'map', 'player_1', 'player_2', 'player_1_units', 'player_2_units']


# EXPLORE

In [103]:
# Remove rows with missing 'winner' values
df_cleaned = data.dropna(subset=['winner'])

# Convert 'winner' to int as we know it should be either 1 or 2
df_cleaned['winner'] = df_cleaned['winner'].astype(int)
df_cleaned['player_1'] = df_cleaned['player_1'].str.replace('&lt;', '<').str.replace('&gt;', '>').str.replace('<sp/>', '')
df_cleaned['player_2'] = df_cleaned['player_2'].str.replace('&lt;', '<').str.replace('&gt;', '>').str.replace('<sp/>', '')
# Initial Data Understanding:
# Let's get a summary of numerical fields and a brief overview of some categorical fields.

# Summary of numerical fields
numerical_summary = df_cleaned.describe()

# Unique values in some categorical fields
unique_maps = df_cleaned['map'].nunique()
unique_players = pd.concat([df_cleaned['player_1'], df_cleaned['player_2']]).nunique()

numerical_summary, unique_maps, unique_players


(       total_gameloops      gameloop         build      winner
 count      2518.000000   2518.000000   2518.000000  2518.00000
 mean      16328.741859  14985.270850  82105.328038     1.50834
 std        7586.949271   7585.857511   2684.937481     0.50003
 min          12.000000      0.000000  78285.000000     1.00000
 25%       11586.250000  10242.250000  78285.000000     1.00000
 50%       14978.000000  13634.000000  82893.000000     2.00000
 75%       19368.000000  18024.000000  84643.000000     2.00000
 max       92369.000000  91025.000000  84643.000000     2.00000,
 61,
 242)

In [108]:
# prompt: python code to get numeric and object columns
data = pd.read_json(json_data)
data.dropna(inplace=True)
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
object_cols = data.select_dtypes(include='object').columns.tolist()

print("Numeric columns:", numeric_cols)
print("Object columns:", object_cols)


# Function to parse the string representation of lists and count unit types
def parse_and_count_units(unit_str):
    # Convert the string representation of the list into an actual list
    # units_list = literal_eval(unit_str)
    # Count the occurrence of each unit type
    unit_counts = Counter(unit_str)
    return unit_counts

# Apply the function to both player_1_units and player_2_units columns
player_1_units_counts = df_cleaned['player_1_units'].apply(parse_and_count_units)
player_2_units_counts = df_cleaned['player_2_units'].apply(parse_and_count_units)


# Combine all unit counts to identify all unique units in the dataset
all_unit_counts = pd.concat([player_1_units_counts, player_2_units_counts])
all_unique_units = set(unit for counts in all_unit_counts for unit in counts)

print(f"Unique Units: {len(all_unique_units)}")

player_1_cols = [f'player_1_{unit}' for unit in all_unique_units]
player_2_cols = [f'player_2_{unit}' for unit in all_unique_units]
new_cols_df = pd.DataFrame(0, index=df_cleaned.index, columns=player_1_cols + player_2_cols)
df_cleaned = pd.concat([df_cleaned, new_cols_df], axis=1)

# Populate the unit count columns for each player
for index, row in df_cleaned.iterrows():
    for unit, count in player_1_units_counts[index].items():
        df_cleaned.loc[index, f'player_1_{unit}'] = count
    for unit, count in player_2_units_counts[index].items():
        df_cleaned.loc[index, f'player_2_{unit}'] = count

Unique Units: 69


In [112]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [175]:
unit_columns = [col for col in df_prepared_final_corrected.columns if 'player_1_' in col and col.replace('player_1_', 'player_2_') in df_prepared_final_corrected.columns and col != 'player_1_units']
for unit_col in unit_columns:
    print(unit_col)
    player_2_col = unit_col.replace('player_1_', 'player_2_')
    differential_col = unit_col.replace('player_1_', 'diff_')
    df_prepared_final_corrected[differential_col] = df_prepared_final_corrected[unit_col] - df_prepared_final_corrected[player_2_col]
    break

player_1_Hatchery


In [None]:
df_prepared_final_corrected[player_2_col]

In [156]:
# Assuming 'df_prepared_final_corrected' contains all preprocessed data including unit counts
unit_columns = [col for col in df_prepared_final_corrected.columns if 'player_1_' in col and col.replace('player_1_', 'player_2_') in df_prepared_final_corrected.columns]

for unit_col in unit_columns:
    player_2_col = unit_col.replace('player_1_', 'player_2_')
    differential_col = unit_col.replace('player_1_', 'diff_')
    df_prepared_final_corrected[differential_col] = df_prepared_final_corrected[unit_col] - df_prepared_final_corrected[player_2_col]

# Update X with new differential features for retraining
X = df_prepared_final_corrected.drop(['path', 'winner', 'player_1_units', 'player_2_units'], axis=1)


TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [117]:
importances = rf.feature_importances_
features = X_train.columns
indices = np.argsort(importances)[::-1]


In [133]:
'player_hash_0' in features 

True

In [135]:
importances = pd.DataFrame([indices], columns=[features]).T.sort_values(by=0, ascending=True).reset_index()

In [137]:
importances[importances['level_0'].isin(hashed_features_df_corrected.columns)] 

Unnamed: 0,level_0,0
98,player_hash_5,98
101,player_hash_0,101
115,player_hash_8,115
128,player_hash_1,128
135,player_hash_3,135
153,player_hash_7,153
156,player_hash_2,156
160,player_hash_4,160
168,player_hash_6,168
170,player_hash_9,170


In [143]:
data['path'].str.split('\\').str[0].value_counts()

path
DH SC2 Masters 2021 Summer                 791
IEM Katowice 2020                          438
IEM Katowice 2021                          261
DH SC2 Masters 2020 Winter                 190
StayAtHome Story Cup #3                    183
TeamLiquid StarLeague 5                    183
StayAtHome Story Cup #1                    172
TeamLiquid StarLeague 6                    121
ASUS ROG Online 2020                       107
Cheeseadelphia Winter Championship 2021     72
Name: count, dtype: int64

In [155]:
data['path'].str.split('\\').str[7].value_counts()

path
20210527_-_GAME_1_-_XY_vs_Firefly_-_T_vs_P_-_Romanticide_LE.SC2Replay          1
20210527_-_GAME_2_-_XY_vs_Firefly_-_T_vs_P_-_2000_Atmospheres_LE.SC2Replay     1
20210526 - GAME 1 - XY vs Dragon - T vs T - Blackburn.SC2Replay                1
20210526 - GAME 2 - XY vs Dragon - T vs T - Beckett Industries.SC2Replay       1
20210526 - GAME 3 - XY vs Dragon - T vs T - Oxide.SC2Replay                    1
20210526_-_GAME_1_-_Firefly_vs_TIME_-_P_vs_T_-_2000_Atmospheres.SC2Replay      1
20210526_-_GAME_2_-_Firefly_vs_TIME_-_P_vs_T_-_Jagannatha.SC2Replay            1
20210527_-_GAME_1_-_Cyan_vs_Jieshi_-_P_vs_P_-_Oxide_LE.SC2Replay               1
20210527_-_GAME_2_-_Cyan_vs_Jieshi_-_P_vs_P_-_2000_Atmospheres_LE.SC2Replay    1
20210527_-_GAME_1_-_Jieshi_vs_Coffee_-_P_vs_T_-_Oxide_LE.SC2Replay             1
20210527_-_GAME_2_-_Jieshi_vs_Coffee_-_P_vs_T_-_Romanticide_LE.SC2Replay       1
Name: count, dtype: int64

In [154]:
data['path'].str.split('\\').str[2].value_counts()

path
4 - Europe                                          351
Ro76                                                254
2 - North America                                   172
Upper Bracket                                       150
Lower Bracket                                       148
                                                   ... 
Namshar, SKillous Romanticide LE.SC2Replay            1
Namshar, uwuThermal Pillars of Gold LE.SC2Replay      1
SKillous, Dream Pillars of Gold LE.SC2Replay          1
uwuThermal, Dream Lightshade LE.SC2Replay             1
goblin, Zoun Jagannatha LE.SC2Replay                  1
Name: count, Length: 75, dtype: int64