In [43]:
# Dependencies
import pandas as pd
import json
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [22]:
# Load all CSV files
Fall2022   = pd.read_csv('csv/per_match/Fall2022.csv', delimiter=';')
Spring2024 = pd.read_csv('csv/per_match/Spring2024.csv', delimiter=';')

seasons = [Spring2024]

In [39]:
# Data Wrangling

# We will store all our matches here
matches_list = []

# Load JSON data from file into a Python dictionary
with open('json/username_mapping.json', 'r') as file:
    username_mapping = json.load(file)

for i, season in enumerate(seasons):
    # Drop unnecessary columns
    season = season.drop(['replay id', 'map', 'date', 'team name', 'opposing team name', 'car id', 'car name'], axis=1)
    
    # Make all player names lowercase
    season['player name'] = season['player name'].str.lower()
        
    # Replace mapped names with their dictionary value
    season['player name'] = season['player name'].map(username_mapping).fillna(season['player name'])
    
    # Convert 'result' to binary
    season['result'] = season['result'].map({'loss': 0, 'win': 1})
    
    # Group by replay name
    grouped_matches = season.groupby('replay title')
    season_matches_list = [match for _,match in grouped_matches]
    
    # And append to our result array
    matches_list.extend(season_matches_list)

In [40]:
# Display capijack's favorite number
print(matches_list[34])

            replay title  result        player name  score  goals  assists  \
1144  23366_Game2.replay       0  front flip freddy    365      0        1   
1145  23366_Game2.replay       0              tipsy    310      1        0   
1146  23366_Game2.replay       1               peak    825      5        1   
1147  23366_Game2.replay       1   renshirokamazaki    214      1        1   
1148  23366_Game2.replay       1               uday    297      0        2   

      saves  shots  shots conceded  goals conceded  ...  time neutral third  \
1144      3      2              12               6  ...              104.98   
1145      2      2              12               6  ...              123.39   
1146      0      8               4               1  ...              121.45   
1147      0      2               4               1  ...              126.31   
1148      1      2               4               1  ...              124.90   

      percentage neutral third  time offensive third  \


In [44]:
# machine learning time
all_matches = pd.concat(matches_list)

# Select features and target
X = all_matches[['goals', 'assists', 'saves', 'shots', 'shots conceded', 'goals conceded', 'demos inflicted', 'demos taken']]  # Add other relevant features if any
y = all_matches['result']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

           replay title  result                 player name  score  goals  \
522  11023_Game1.replay       1                        leon    684      2   
523  11023_Game1.replay       1                  tophatbear    384      2   
524  11023_Game1.replay       1  remy lacroix hula hoop gif    485      1   
525  11023_Game1.replay       0            senor brightside    560      2   
526  11023_Game1.replay       0                     vpr.vnm    346      0   

     assists  saves  shots  shots conceded  goals conceded  ...  \
522        1      3      4              11               3  ...   
523        0      1      3              11               3  ...   
524        3      1      5              11               3  ...   
525        1      2      3              12               5  ...   
526        2      2      5              12               5  ...   

     time neutral third  percentage neutral third  time offensive third  \
522              127.78                     33.56          

In [45]:
# Get the coefficients and intercept
coefficients = model.coef_[0]
intercept = model.intercept_[0]

# Print the coefficients and intercept
print("Intercept:", intercept)
print("Coefficients:", coefficients)

Intercept: 2.608710793539584
Coefficients: [ 2.36223923  2.23985388 -0.1758892  -0.18261044  0.069408   -2.29141381
  0.10673934 -0.14814408]


In [46]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100.0:.2f}%')

# Feature importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})
print("\nFeature Importance:\n", feature_importance)

Accuracy: 89.21%

Feature Importance:
            Feature  Coefficient
0            goals     2.362239
1          assists     2.239854
2            saves    -0.175889
3            shots    -0.182610
4   shots conceded     0.069408
5   goals conceded    -2.291414
6  demos inflicted     0.106739
7      demos taken    -0.148144
