In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Load the training dataset
train_data = pd.read_csv('train_data.csv', delimiter=';')

# Drop unnecessary columns
train_data = train_data.drop(['PlayerURL', 'PlayerName'], axis=1)

In [3]:
def count_moves(row, counts, index):
    total_moves = 0
    for i in range(1, 2564):
        move = row["Move "+ str(i)]
        # count the number of s's
        if move == 's':
            counts[10][index] += 1
        # count the number of Base's
        elif move == 'Base':
            counts[11][index] += 1
        # count the number of SingleMineral's
        elif move == 'SingleMineral':
            counts[12][index] += 1
        # count the hotkeys
        elif isinstance(move, str):
            for j in range(10):
                if move.startswith(f"hotkey{j}"):
                    counts[j][index] += 1

        total_moves += 1  
    # Save the total moves count
    counts[13][index] = total_moves


def count_move_per_time(row, counts, row_index, time_interval, ti_index):
    base_index = ti_index * 14
    total_moves = 0

    for i in range(1, 2564):
        move = row["Move " + str(i)]

        # Count actions for the given time interval
        if move == 's':
            counts[base_index + 10][row_index] += 1
        elif move == 'Base':
            counts[base_index + 11][row_index] += 1
        elif move == 'SingleMineral':
            counts[base_index + 12][row_index] += 1
        elif isinstance(move, str):
            for j in range(10):
                if move.startswith(f"hotkey{j}"):
                    counts[base_index + j][row_index] += 1

        total_moves += 1

        # Continue counting actions after the specified time interval
        if move == f't{time_interval}':
            break

    counts[base_index + 13][row_index] = total_moves


def mapRaces(races, row_index):
    race = train_data['Race'][row_index]

    if race == "Protoss":
        races[0][row_index] = 1
    elif race == "Terran":
        races[1][row_index] = 1
    elif race == "Zerg":
        races[2][row_index] = 1

In [4]:
# Create new table that only contains the first column (PlayerId) of train_data
# Keep only the first column but all rows
train_data_new = train_data.iloc[:, :1]


# Specify the target time intervals
#time_intervals = [20, 60, 100, 200]
time_intervals = [5, 20, 60, 100, 200, 270, 340, 550]

calc_column = len(time_intervals)* 14 + 14

# New lists of counts
counts = [[0] * 3052 for _ in range(calc_column)]
# New lists of races
races = [[0] * 3052 for _ in range(3)]


# Go through the rows using the functions to count the actions, map the races
for row_index, row in train_data.iterrows():
    count_moves(row, counts, row_index)
    mapRaces(races, row_index)

    for ti_index, time_interval in enumerate(time_intervals):
        count_move_per_time(row, counts, row_index, time_interval, ti_index+1)
        

for i in range(calc_column):
    locals()[f'count_{i}'] = counts[i]

for i in range(10):
    train_data_new[f'hk{i}Frequency'] = [count / counts[13][index] if counts[13][index] != 0 else 0 for index, count in enumerate(counts[i])]

train_data_new['sFrequency'] = [count / counts[13][index] if counts[13][index] != 0 else 0 for index, count in enumerate(counts[10])]
train_data_new['baseFrequency'] = [count / counts[13][index] if counts[13][index] != 0 else 0 for index, count in enumerate(counts[11])]
train_data_new['singleMineralFrequency'] = [count / counts[13][index] if counts[13][index] != 0 else 0 for index, count in enumerate(counts[12])]

# Adding new columns for the count of moves per interval
for ti_index, time_interval in enumerate(time_intervals):
    base_index = (ti_index + 1) * 14
    for j in range(10):
        column_name = f'hk{j}_t{time_interval}_Frequency'
        train_data_new[column_name] = [count / counts[base_index + 13][index] if counts[base_index + 13][index] != 0 else 0 for index, count in enumerate(counts[base_index + j])]

    train_data_new[f's_t{time_interval}_Frequency'] = [count / counts[base_index + 13][index] if counts[base_index + 13][index] != 0 else 0 for index, count in enumerate(counts[base_index + 10])]
    train_data_new[f'base_t{time_interval}_Frequency'] = [count / counts[base_index + 13][index] if counts[base_index + 13][index] != 0 else 0 for index, count in enumerate(counts[base_index + 11])]
    train_data_new[f'singleMineral_t{time_interval}_Frequency'] = [count / counts[base_index + 13][index] if counts[base_index + 13][index] != 0 else 0 for index, count in enumerate(counts[base_index + 12])]



# Adding new columns for the races
train_data_new['race_Protoss'] = races[0]
train_data_new['race_Terran'] = races[1]
train_data_new['race_Zerg'] = races[2]


train_data_new.head

<bound method NDFrame.head of       PlayerID  hk0Frequency  hk1Frequency  hk2Frequency  hk3Frequency  \
0      1021189      0.000000      0.158018      0.107296      0.032384   
1      1021189      0.000000      0.149044      0.062037      0.055014   
2      1021189      0.000000      0.130316      0.086617      0.018728   
3      1021189      0.000000      0.166602      0.120562      0.033554   
4      1021189      0.000000      0.154506      0.077253      0.022630   
...        ...           ...           ...           ...           ...   
3047       415      0.000000      0.047600      0.069840      0.015607   
3048       415      0.000000      0.074132      0.094030      0.030823   
3049       415      0.000000      0.105345      0.065158      0.021849   
3050       415      0.001561      0.090129      0.101053      0.017558   
3051       415      0.000000      0.062817      0.078424      0.024581   

      hk4Frequency  hk5Frequency  hk6Frequency  hk7Frequency  hk8Frequency  \
0  

In [5]:
# Target
labels = train_data_new['PlayerID']

# Keep only the columns we need as features
features = train_data_new.drop(['PlayerID'], axis=1)

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train a model with all features
model = RandomForestClassifier(random_state=42, n_estimators=500)
model.fit(X_train, y_train)

# Get feature importances
feature_importances = model.feature_importances_


In [6]:
# ... (your existing code)

# Train a model with all features
model = RandomForestClassifier(random_state=42, n_estimators=500)
model.fit(X_train, y_train)

# Initialize variables
best_accuracy = 0.0
best_feature_subset = None
current_percentage = 0.75
step_size = 0.05  # Change the step size as needed
eliminated_features = {}  # Dictionary to store eliminated features and their percentages

# Initialize sorted_features outside the loop
sorted_features = model.feature_importances_.argsort()[::-1]

while current_percentage >= 0.1:
    # Calculate the number of features to keep
    num_features_to_keep = int(len(features.columns) * current_percentage)

    # Select the top N features based on importance
    selected_features = sorted_features[:num_features_to_keep]
    eliminated_features[current_percentage] = list(set(range(len(features.columns))) - set(selected_features))

    X_train_subset = X_train.iloc[:, selected_features]
    X_val_subset = X_val.iloc[:, selected_features]

    # Train the model with the selected features
    model.fit(X_train_subset, y_train)

    # Make predictions on the validation set
    predictions = model.predict(X_val_subset)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, predictions)

    # Check if this subset of features gives a better accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_feature_subset = selected_features.copy()

    # Update current percentage
    current_percentage -= step_size

# Check if the best accuracy is above 93%, if not, use the best accuracy found
if best_accuracy < 0.93:
    best_feature_subset = sorted_features[:int(len(features.columns) * 0.1)].copy()

# Print the best accuracy and selected features
print("Best Accuracy:", best_accuracy)
print("Number of Features Selected:", len(best_feature_subset))

# Print eliminated features and their percentages
for percentage, eliminated in eliminated_features.items():
    print(f"Features Eliminated at {percentage * 100}%: {eliminated}")

# Select the best feature subset
final_features = X_train.iloc[:, best_feature_subset]

# Save the final feature set to a file
final_features.to_csv('final_features.csv', index=False)


Best Accuracy: 0.9279869067103109
Number of Features Selected: 12
Features Eliminated at 75.0%: [10, 12, 13, 19, 20, 21, 24, 25, 26, 33, 34, 35, 37, 38, 46, 47, 51, 60, 64, 74, 77, 85, 87, 90, 98, 100, 103, 116, 118, 119]
Features Eliminated at 70.0%: [7, 10, 12, 13, 14, 18, 19, 20, 21, 24, 25, 26, 32, 33, 34, 35, 37, 38, 46, 47, 51, 60, 64, 72, 73, 74, 77, 85, 87, 90, 98, 100, 103, 116, 118, 119]
Features Eliminated at 64.99999999999999%: [7, 9, 10, 12, 13, 14, 16, 18, 19, 20, 21, 24, 25, 26, 32, 33, 34, 35, 37, 38, 46, 47, 48, 51, 60, 64, 72, 73, 74, 77, 85, 86, 87, 90, 98, 100, 103, 111, 113, 116, 117, 118, 119]
Features Eliminated at 59.999999999999986%: [7, 9, 10, 12, 13, 14, 16, 18, 19, 20, 21, 22, 24, 25, 26, 32, 33, 34, 35, 37, 38, 46, 47, 48, 51, 59, 60, 64, 72, 73, 74, 77, 85, 86, 87, 90, 98, 99, 100, 102, 103, 111, 112, 113, 115, 116, 117, 118, 119]
Features Eliminated at 54.999999999999986%: [7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21, 22, 24, 25, 26, 32, 33, 34, 35, 3