In [None]:
import pandas as pd
import numpy as np

In [None]:
# Connect to the MySQL database that holds the match stats.
#
# Connection settings are read from environment variables so real credentials
# do not have to be committed with the notebook. The fallbacks are the values
# this notebook originally hard-coded, so a local setup keeps working unchanged.

import os
import mysql.connector
import time
import warnings

# Silences every UserWarning for the rest of the session.
# NOTE(review): presumably added to hide pandas' warning about being handed a
# raw DBAPI connection in read_sql — confirm nothing important is being hidden.
warnings.simplefilter(action='ignore', category=UserWarning)

mydb = mysql.connector.connect(
    host=os.environ.get("VALORANT_DB_HOST", "localhost"),
    user=os.environ.get("VALORANT_DB_USER", "root"),
    password=os.environ.get("VALORANT_DB_PASSWORD", "password"),
    database=os.environ.get("VALORANT_DB_NAME", "valorant_tracker"),
)
cursor = mydb.cursor()

In [None]:
# Pull the whole `stats` table into memory through the open connection and
# report how many rows came back.
df = pd.read_sql(sql="SELECT * FROM stats", con=mydb)
print("Number of samples: " + str(len(df)))

In [None]:
# Everything after the first 36 characters of match_player_id becomes the user id.
# NOTE(review): assumes the leading 36 characters are a fixed-width match id
# (e.g. a UUID) — confirm against the scraper that fills this table.
df['user_id'] = df['match_player_id'].str.slice(start=36)


In [None]:
# List the available columns (including the derived user_id) before choosing features.
print(df.columns)

In [None]:
# Identifier, label and match-context columns that are not per-match stats.
non_stat_cols = [
    "match_player_id",
    "user_id",
    "date",
    "rounds",
    "map",
    "result",
    "agent",
    "smurf",
    "current_rank",
]

# Should eventually take into account ability casts for each agent
ability_cast_cols = [
    "grenade_casts",
    "ability_2_casts",
    "ability_1_casts",
    "ultimate_casts",
]

# Same two-step drop as before, expressed with the `columns=` keyword.
data = df.drop(columns=non_stat_cols).drop(columns=ability_cast_cols)


Working with the raw data is difficult since it is not very Gaussian. Instead, we will average all stats per user and work with a dataset of users.

In [None]:
import matplotlib.pyplot as plt

# Raw, un-averaged rows: one point per row of `df`.
plt.scatter(df["kills"], df["deaths"])
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel("Kills")
plt.ylabel("Deaths")
plt.title("Kills Vs Deaths (raw rows)")
plt.show()

No validation or test set is being used to find the optimal epsilon (this should be considered; however, 0.000001 seems to work fairly well). After checking the accounts flagged below, some of them appear to be shared by two different users: one plays extremely well and the other plays poorly.

In [None]:
# Keep only kills/deaths plus the grouping key, then collapse to one row per
# user holding that user's mean kills and mean deaths.
sub_data = df.loc[:, ["kills", "deaths", "user_id"]]
avg_data = sub_data.groupby(by="user_id").mean()


In [None]:
from sklearn.ensemble import IsolationForest

# Isolation forest over the per-user kill/death averages.
# - contamination=0.006 sets the share of users the model will label as outliers.
# - random_state pins the otherwise random tree construction, so the set of
#   flagged users is the same on every Restart & Run All.
isof = IsolationForest(
    n_estimators=500,
    max_samples='auto',
    contamination=0.006,
    random_state=111,
)
# fit() returns the estimator itself, so `clf` and `isof` are the same object.
clf = isof.fit(avg_data)

In [None]:
# Label every user with the fitted forest; the cells below treat -1 as "anomaly".
anomaly = clf.predict(avg_data)

In [None]:
# Work on a copy. A plain `test_anomaly = avg_data` only creates a second name
# for the same frame, so adding the "anomaly" column would also add it to
# avg_data and change the feature set if the fit/predict cells are re-run.
test_anomaly = avg_data.copy()
test_anomaly["anomaly"] = anomaly.tolist()


In [None]:
# Keep only the users the forest labelled -1.
anomalies = test_anomaly.loc[test_anomaly["anomaly"].eq(-1)]

In [None]:
# Number of users flagged as anomalous.
len(anomalies)

In [None]:
# `anomalies` was already filtered to -1, so this selects every row of it again.
anom_true = anomalies.loc[anomalies["anomaly"].eq(-1)]

In [None]:
# Axis labels first, then the data: every user as a small green dot, with the
# flagged users circled in red on top.
plt.xlabel("Kills")
plt.ylabel("Deaths")

plt.plot(avg_data["kills"], avg_data["deaths"], 'g.')
plt.plot(
    anom_true["kills"].to_numpy(),
    anom_true["deaths"].to_numpy(),
    'or',
    markeredgewidth=2,
    markerfacecolor='none',
    markersize=10,
)

plt.title("Kills Vs Deaths Outliers")

In [None]:
# Show the flagged users with their average kills/deaths for manual inspection.
anom_true

In [None]:
# Higher-dimensional feature set: keep user_id (the grouping key used next)
# and every stat column, dropping identifiers, labels, match context and the
# per-ability cast counts.
high_dim_data = df.drop(
    columns=[
        # identifiers / match context / labels
        "match_player_id", "date", "rounds", "map", "result",
        "agent", "smurf", "current_rank",
        # ability cast counts
        "grenade_casts", "ability_2_casts", "ability_1_casts", "ultimate_casts",
    ]
)


In [None]:
# One row per user: the mean of every remaining stat column.
avg_high_dim_data = high_dim_data.groupby(by="user_id").mean()
avg_high_dim_data.head(n=5)

In [None]:
import seaborn as sns

# One density histogram (with a KDE overlay) per averaged feature, each in its
# own figure titled with the column name.
# The old `label="100% Equities"` argument was a leftover from unrelated code
# and is dropped; no legend is drawn, so the rendered figures do not change.
for col in avg_high_dim_data.columns:
    sns.histplot(avg_high_dim_data[col], color="grey", kde=True, stat="density", linewidth=0)
    plt.title(col)
    plt.show()

In [None]:
from sklearn.ensemble import IsolationForest

# Same isolation-forest setup as the kills/deaths model, now fitted on every
# averaged stat. This rebinds `isof` and `clf`, so from here on they refer to
# the high-dimensional model.
# random_state pins the random tree construction so the flagged users (and the
# smurf labels derived from them further down) are reproducible.
isof = IsolationForest(
    n_estimators=500,
    max_samples='auto',
    contamination=0.006,
    random_state=111,
)
clf = isof.fit(avg_high_dim_data)

In [None]:
# Label every user with the high-dimensional forest; -1 is treated as "anomaly" below.
anomaly_high_dim = clf.predict(avg_high_dim_data)

In [None]:
# Copy before labelling. A plain assignment would alias avg_high_dim_data, so
# the "anomaly" column would be written into the feature frame itself and be
# picked up as a feature if the fit/predict/histogram cells are re-run.
test_anomaly_high_dim = avg_high_dim_data.copy()
test_anomaly_high_dim["anomaly"] = anomaly_high_dim.tolist()

# Keep only the users labelled -1.
anomalies_high_dim = test_anomaly_high_dim[test_anomaly_high_dim["anomaly"] == -1]
print("Anomalies flagged: " + str(anomalies_high_dim.shape[0]))

# anomalies_high_dim already holds only -1 rows, so filtering it a second time
# selected everything; an explicit copy gives the same frame.
anom_true_high_dim = anomalies_high_dim.copy()
anom_true_high_dim

In [None]:
# Turn the user_id index back into a column so it can be matched against df.
anom_with_id = anom_true_high_dim.reset_index()

# Mark every raw row that belongs to a flagged user. One vectorized membership
# test replaces the per-user loop, which rescanned the whole of df once per
# flagged user.
df.loc[df["user_id"].isin(anom_with_id["user_id"]), "smurf"] = 1

In [None]:
# These are counts of ROWS in df (a user appears once per recorded match), not
# of distinct users. They stay row counts because the ratio and the upsampling
# further down work on rows.
smurf_count = df.loc[df["smurf"] == 1].shape[0]
legit_count = df.loc[df["smurf"] == 0].shape[0]
print("Number of rows flagged as a smurf: " + str(smurf_count))
print("Number of rows not flagged: " + str(legit_count))

# Distinct users behind those rows, which is what the old messages claimed to show.
print("Number of users flagged as a smurf: " + str(df.loc[df["smurf"] == 1, "user_id"].nunique()))
print("Number of users not flagged: " + str(df.loc[df["smurf"] == 0, "user_id"].nunique()))

In [None]:
# Share of rows carrying the smurf flag, out of all rows labelled 0 or 1.
smurf_ratio = smurf_count / (legit_count + smurf_count)
print("Smurf percentage: %.2f%%" % (100 * smurf_ratio))

In [None]:
from sklearn.utils import resample

df_majority = df[df["smurf"] == 0]
df_minority = df[df["smurf"] == 1]

# Oversample the flagged rows (with replacement) until both classes have the
# same number of rows. The target size is taken from df_majority itself rather
# than from `legit_count`, so this cell no longer depends on a variable set in
# another cell; the value is the same when the notebook is run in order.
#
# NOTE(review): the oversampling happens before the train/test split further
# down, so copies of the same flagged row can land in both halves and inflate
# the reported accuracy. Consider splitting first and resampling only the
# training half.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=111,
)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled["smurf"].value_counts()

Running the upsampled data through a simple random forest model.

In [None]:
from sklearn.model_selection import train_test_split

# Drop incomplete rows, then shuffle. The shuffle is seeded so the row order,
# and with it everything trained on it, is reproducible from a fresh kernel.
df_upsampled = df_upsampled.dropna()
df_upsampled = df_upsampled.sample(frac=1, random_state=111)

# Target: the smurf flag. Features: everything except the flag and the
# date / identifier columns.
y = np.array(df_upsampled['smurf'])
X = df_upsampled.drop(['smurf', 'date', 'match_player_id', 'user_id'], axis=1)


In [None]:
# One-hot encode every column that is NOT float64 / int64 / datetime.
# (These were previously collected under the misleading name `numeric`.)
categorical_cols = X.select_dtypes(exclude=['float64', 'int64', 'datetime64[ns]']).columns

# Build all indicator frames first and concatenate once, instead of growing X
# inside a loop. Resulting columns and their order are unchanged: the untouched
# columns first, then the indicators for each encoded column in turn.
dummy_frames = [pd.get_dummies(X[feature]) for feature in categorical_cols]
X = pd.concat([X.drop(columns=categorical_cols)] + dummy_frames, axis=1)

# `print(X.head)` printed the bound method (and with it the whole frame)
# because the call parentheses were missing; show the first rows instead.
X.head()

In [None]:
# 50/50 train/test split, seeded so the reported confusion matrix and accuracy
# can be reproduced on a fresh run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=111)


Using Random Forest because we have a very large data set. The main drawback is interpretability: it would be useful to see where and why certain splits happen. However, we do have access to feature importance since we used Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# random_state makes the fitted forest (and the metrics below) reproducible.
# n_jobs=-1 builds the trees on all available cores, which matters for a fit
# that otherwise runs for many minutes.
rf = RandomForestClassifier(
    n_estimators=100,
    max_features=9,
    random_state=111,
    n_jobs=-1,
)

rf.fit(X_train, y_train)
rf_predict = rf.predict(X_test)

# Print the confusion matrix and the accuracy score (as a percentage).
rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_acc_score = accuracy_score(y_test, rf_predict)
print(rf_conf_matrix)
accuracy = rf_acc_score * 100
print(accuracy)

The random forest model above takes about 23 minutes to execute, so let's save it in a joblib dump to reuse.


In [None]:
import os
import joblib

# joblib.dump does not create missing directories, so make sure the target
# folder exists; otherwise the long fit above is lost to a FileNotFoundError.
os.makedirs("./Models", exist_ok=True)
joblib.dump(rf, "./Models/smurf_random_forest.joblib")

In [None]:
# Reload the saved forest from disk so the slow fit can be skipped.
# joblib files are pickle-based: only load files you produced yourself.
rf_loaded = joblib.load("./Models/smurf_random_forest.joblib")

In [None]:
# Pair each training column with the forest's importance score for it.
imp_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": rf.feature_importances_,
})

# Ascending sort puts the most important feature at the top of the bar chart.
imp_df_sorted = imp_df.sort_values(by=["importance"])

plt.figure(figsize=(10, 20))
plt.barh(imp_df_sorted["feature"], imp_df_sorted["importance"])
# Label the figure so it can be read on its own.
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Random Forest Feature Importance")
plt.show()

This plot is very interesting and provides more insight into which attributes can be focused on to detect smurfs. Previously, it was assumed that kills, deaths, and headshot percentage would be the most important traits for smurf detection, but as we can see here, 'econ_rating', 'damage_recieved', and 'ability_1_casts' are also considered very important features. It is also interesting to see how different agents and ranks differ in importance.