In [22]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib  # Import joblib for model saving

In [23]:
# Load the training dataset
train_data = pd.read_csv('train_data.csv', delimiter=';')
train_data.columns = ['PlayerURL', 'PlayerID', 'PlayerName', 'Race'] + [f'Move_{i}' for i in range(1, 2564)]

In [24]:
# Drop unnecessary columns
train_data = train_data.drop(['PlayerURL', 'PlayerName'], axis=1)
train_data.head()

Unnamed: 0,PlayerID,Race,Move_1,Move_2,Move_3,Move_4,Move_5,Move_6,Move_7,Move_8,...,Move_2554,Move_2555,Move_2556,Move_2557,Move_2558,Move_2559,Move_2560,Move_2561,Move_2562,Move_2563
0,1021189,Terran,s,hotkey11,hotkey21,hotkey31,hotkey41,hotkey51,hotkey61,s,...,,,,,,,,,,
1,1021189,Terran,s,s,hotkey11,hotkey21,hotkey31,hotkey41,hotkey51,hotkey61,...,,,,,,,,,,
2,1021189,Terran,s,hotkey11,hotkey21,hotkey31,hotkey41,hotkey51,hotkey61,hotkey71,...,,,,,,,,,,
3,1021189,Terran,s,hotkey11,hotkey21,hotkey31,hotkey41,hotkey51,hotkey61,t5,...,,,,,,,,,,
4,1021189,Terran,s,hotkey11,hotkey21,hotkey31,hotkey41,hotkey51,hotkey71,hotkey61,...,,,,,,,,,,


In [25]:
# Map race to numeric values
race_mapping = {'Protoss': 0, 'Zerg': 1, 'Terran': 2}
train_data['Race'] = train_data['Race'].map(race_mapping)

# Map actions to numeric values
action_mapping = {'s': 0, 'Base': 1, 'SingleMineral': 2}
for i in range(10):
    for j in range(3):
        action_mapping[f'hotkey{i}{j}'] = 3 + i * 3 + j

# Convert action sequences to numerical values
#if tXX it converts it to 100 otherwise -1
for i in range(1, 2564):
    train_data[f'Move_{i}'] = train_data[f'Move_{i}'].map(lambda x: 100 if pd.notna(x) and isinstance(x, str) and x.startswith('t') else action_mapping.get(x, -1))

train_data.head()

Unnamed: 0,PlayerID,Race,Move_1,Move_2,Move_3,Move_4,Move_5,Move_6,Move_7,Move_8,...,Move_2554,Move_2555,Move_2556,Move_2557,Move_2558,Move_2559,Move_2560,Move_2561,Move_2562,Move_2563
0,1021189,2,0,7,10,13,16,19,22,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1021189,2,0,0,7,10,13,16,19,22,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1021189,2,0,7,10,13,16,19,22,25,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1021189,2,0,7,10,13,16,19,22,100,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1021189,2,0,7,10,13,16,19,25,22,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [26]:
# Create an empty list to store counts for each row
row_action_counts = []

# Iterate through each row of the dataframe
for _, row in train_data.iterrows():
    # Initialize a counter for each time window
    action_count_before_time = 0

    # Initialize a dictionary to store counts for each time window
    counts_before_100 = {}

    count_100 = 1

    # Iterate through each 'Move_XX' column for the current row
    for col in train_data.columns[3:]: 
        # Check if the value is different from -1
        if row[col] != -1:
            action_count_before_time += 1

            # Check if the value is 100
            if row[col] == 100:
                timestamp = count_100 * 5 
                counts_before_100[f't{timestamp}'] = action_count_before_time
                action_count_before_time = -1 
                count_100 += 1

    #If no action is found, set count to 0
    if not action_count_before_time:
        action_count_before_time = 0

    # Append the counts for the current row to the list
    row_action_counts.append(counts_before_100)

In [27]:
# Create a csv from the results
result_df = pd.DataFrame(row_action_counts)

# Add the 'PlayerID' and 'Race' column
result_df.insert(0, 'PlayerID', train_data['PlayerID'])
result_df.insert(1, 'Race', train_data['Race'])

# Save the DataFrame to a CSV file
result_df.to_csv('move_count.csv', index=False)

In [28]:
td = pd.read_csv('move_count.csv')

In [29]:
td.head()

Unnamed: 0,PlayerID,Race,t5,t10,t15,t20,t25,t30,t35,t40,...,t2125,t2130,t2135,t2140,t2145,t2150,t2155,t2160,t2165,t2170
0,1021189,2,11.0,8.0,7.0,12.0,7.0,1.0,7.0,7.0,...,,,,,,,,,,
1,1021189,2,8.0,9.0,12.0,1.0,11.0,9.0,21.0,12.0,...,,,,,,,,,,
2,1021189,2,8.0,16.0,5.0,7.0,9.0,12.0,11.0,10.0,...,,,,,,,,,,
3,1021189,2,7.0,15.0,20.0,10.0,15.0,9.0,7.0,10.0,...,,,,,,,,,,
4,1021189,2,9.0,12.0,8.0,6.0,15.0,15.0,13.0,6.0,...,,,,,,,,,,


In [31]:
import pandas as pd

train_data.to_csv('train_data_1.csv', index=False)

# Load the two CSV files into two dataframes
df1 = pd.read_csv('train_data_1.csv')
df2 = pd.read_csv('move_count.csv')

# Merge the two dataframes based on a common column
merged_df = pd.merge(df1, df2, on=['PlayerID', 'Race'])

# Save the merged dataframe to a new CSV file
merged_df.to_csv('merged_file.csv', index=False)

mg = pd.read_csv('merged_file.csv', )

mg.head()

In [32]:
mg.head()

Unnamed: 0,PlayerID,Race,Move_1,Move_2,Move_3,Move_4,Move_5,Move_6,Move_7,Move_8,...,t2125,t2130,t2135,t2140,t2145,t2150,t2155,t2160,t2165,t2170
0,1021189,2,0,7,10,13,16,19,22,0,...,,,,,,,,,,
1,1021189,2,0,7,10,13,16,19,22,0,...,,,,,,,,,,
2,1021189,2,0,7,10,13,16,19,22,0,...,,,,,,,,,,
3,1021189,2,0,7,10,13,16,19,22,0,...,,,,,,,,,,
4,1021189,2,0,7,10,13,16,19,22,0,...,,,,,,,,,,


In [36]:
# get all the player IDs that exist in one list

player = train_data['PlayerID']

player_ids = []

for pl in player:
    if pl not in player_ids:
        player_ids.append(pl)

print(player_ids)

#check the Race and collect the 



[1021189, 1058669, 1139573, 1143713, 1173786, 1430346, 2048063, 2101268, 2115876, 219517, 2222468, 2383348, 2385865, 239335, 2452136, 250458, 251061, 2526293, 2639840, 2685580, 284466, 2896854, 2898004, 2910724, 2978202, 3010872, 3074362, 3090974, 315813, 326029, 3311551, 3368730, 3401218, 3434150, 3437681, 349650, 3533538, 3538115, 3585588, 3611718, 3746323, 377576, 377903, 3880481, 3886231, 3971497, 3973341, 4064798, 4085747, 4149248, 4234852, 4341883, 4854541, 4860568, 4863808, 498165, 788178, 884897, 950504, 1106851, 1126978, 1178662, 2240191, 2332264, 2333068, 2333312, 2334176, 2334376, 2340350, 2341466, 2341467, 2341492, 2341545, 2341575, 2341700, 2341765, 2341920, 2342120, 2342147, 2342269, 2342294, 2342299, 2342343, 2342419, 2342491, 2342671, 2342766, 2342785, 2342789, 2342794, 2342923, 2342983, 2343012, 2343067, 2343183, 2343240, 2343263, 2343479, 2343484, 2343531, 2343559, 2343607, 2343733, 2343852, 2343882, 2343910, 2343979, 2344031, 2344079, 2344081, 2344151, 2344207, 23442