# Peparacion del data frame

## Configuración inicial y carga de datos

In [2]:
import math
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

warnings.filterwarnings('ignore')

plt.style.use('ggplot')
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)

Cargar archivo

In [3]:
import csv

with open('Anexo ET_demo_round_traces_2022.csv') as csvfile:
    df = pd.read_csv(csvfile, sep=";")

Limpiar nulos

In [4]:
df.dropna(how='any', inplace=True)
null_values_new = df.isnull().sum()
print(null_values_new[null_values_new > 0])

Series([], dtype: int64)


Quitar variables con las que no se van a usar en el modelo

In [5]:
#droped_vars = ("AbnormalMatch","TravelledDistance","Map","InternalTeamId","MatchId",
#               "RoundId","RoundWinner","Survived","RLethalGrenadesThrown","RNonLethalGrenadesThrown",
#               "PrimaryAssaultRifle","PrimarySniperRifle","PrimaryHeavy","PrimarySMG","PrimaryPistol",
#               "RoundAssists","RoundHeadshots","RoundFlankKills","MatchFlankKills","MatchAssists","MatchHeadshots",
#               "MatchKills","Unnamed: 0")

droped_vars = ("AbnormalMatch", "Unnamed: 0","TravelledDistance","MatchId","InternalTeamId","RoundId")

for var in droped_vars:
  df = df.drop(var, axis=1)


transformar objeto a int

In [6]:
# Transformar variables Objecto a int
try:
  df['TimeAlive'] = pd.to_numeric(df['TimeAlive'].str.replace('.', ''))
  df['FirstKillTime'] = pd.to_numeric(df['FirstKillTime'].str.replace('.', ''))
except Exception as e:
  print(f'{e}\n')


Transformar las variables categoricas Team y MatchWinner a binario.

Team:
* Terrorist: 0
* CounterTerrorist: 1

MatchWinner:
* True: 1
* False: 0

RoundWinner:
* True: 1
* False: 0

In [7]:
df.Team.replace(to_replace=['Terrorist', 'CounterTerrorist'], value=[0, 1], inplace=True )
df.MatchWinner.replace(to_replace=['False', 'True'], value=[0, 1], inplace=True )
df.RoundWinner.replace(to_replace=['False', 'True'], value=[0, 1], inplace=True )

for var in ("Team", "MatchWinner","RoundWinner"):
    df[var] = df[var].astype(int)

Map:
* de_inferno: 0
* de_nuke: 1
* de_mirage: 2 
* de_dust2: 3

In [8]:
df.Map.replace(to_replace=['de_inferno', 'de_nuke','de_mirage','de_dust2'], 
               value=list(range(0,4)), 
               inplace=True )

df['Map'] = df['Map'].astype(int)


In [9]:
df.head()

Unnamed: 0,Map,Team,RoundWinner,MatchWinner,Survived,TimeAlive,RLethalGrenadesThrown,RNonLethalGrenadesThrown,PrimaryAssaultRifle,PrimarySniperRifle,PrimaryHeavy,PrimarySMG,PrimaryPistol,FirstKillTime,RoundKills,RoundAssists,RoundHeadshots,RoundFlankKills,RoundStartingEquipmentValue,TeamStartingEquipmentValue,MatchKills,MatchFlankKills,MatchAssists,MatchHeadshots
0,0,0,0,1,False,51120248995704500,0,4,0.0,0.0,0.0,0.0,1,0,0,0,0,0,750,4400,0,0,0,0
1,0,0,0,1,False,4348662552266170,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,800,4400,0,0,0,0
2,0,0,0,1,False,3735469847054540,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1000,4400,0,0,0,0
3,0,0,0,1,False,4767886136441470,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,850,4400,0,0,0,0
4,0,0,0,1,True,5312251089027310,1,1,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1000,4400,0,0,0,0


In [10]:
df.head(15)

Unnamed: 0,Map,Team,RoundWinner,MatchWinner,Survived,TimeAlive,RLethalGrenadesThrown,RNonLethalGrenadesThrown,PrimaryAssaultRifle,PrimarySniperRifle,PrimaryHeavy,PrimarySMG,PrimaryPistol,FirstKillTime,RoundKills,RoundAssists,RoundHeadshots,RoundFlankKills,RoundStartingEquipmentValue,TeamStartingEquipmentValue,MatchKills,MatchFlankKills,MatchAssists,MatchHeadshots
0,0,0,0,1,False,51120248995704500,0,4,0.0,0.0,0.0,0.0,1,0,0,0,0,0,750,4400,0,0,0,0
1,0,0,0,1,False,4348662552266170,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,800,4400,0,0,0,0
2,0,0,0,1,False,3735469847054540,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1000,4400,0,0,0,0
3,0,0,0,1,False,4767886136441470,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,850,4400,0,0,0,0
4,0,0,0,1,True,5312251089027310,1,1,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1000,4400,0,0,0,0
5,0,1,1,0,True,5312251089027310,0,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,850,3750,0,0,0,0
6,0,1,1,0,True,5312251089027310,0,2,0.0,0.0,0.0,0.0,1,0,0,0,0,0,400,3750,0,0,0,0
7,0,1,1,0,True,5312251089027310,0,0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,800,3750,0,0,1,0
8,0,1,1,0,True,5312251089027310,0,0,0.0,0.0,0.0,0.0,1,373547,3,0,1,1,850,3750,3,1,0,1
9,0,1,1,0,True,5312251089027310,0,0,0.0,0.0,0.0,0.0,1,5112025,2,0,2,0,850,3750,2,0,0,2


## Objetivo

Utilizar las variables para determinar el tiempo de vida (TimeAlive) del jugador

In [25]:
import sklearn.metrics
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [26]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostRegressor

In [27]:
#top_vars = ["TeamStartingEquipmentValue","RoundWinner",
#                "MatchKills","TimeAlive","FirstKillTime",
#                "RoundStartingEquipmentValue","MatchAssists",
#                "Map","Team","MatchHeadshots"]

top_vars = ["RoundStartingEquipmentValue"]

goal = "TeamStartingEquipmentValue"

x = df[top_vars]
y = df[goal]

#Xtrain, Xtest, Ytrain, Ytest = train_test_split(x,y,test_size=0.2)

PCA

In [28]:
scaler=StandardScaler()#instantiate
scaler.fit(x) # calcula la media y estandar para cada dimension
X_scaled=scaler.transform(x)# transforma los datos a su nueva escala

In [29]:
pca=PCA(n_components=len(top_vars))
#pca=PCA(.85)
pca.fit(X_scaled) # buscar los componentes principales
X_pca=pca.transform(X_scaled)
#revisemos la forma del array
print("shape of X", x.shape)
print("shape of X_pca", X_pca.shape)

shape of X (79154, 1)
shape of X_pca (79154, 1)


In [30]:
expl = pca.explained_variance_ratio_
print(expl)
print('suma:',sum(expl[0:len(top_vars)]))

[1.]
suma: 1.0


In [48]:
dfScaledPca = pd.DataFrame(X_pca, columns = x.columns)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(x,y,test_size=0.2)

Adaboosting

In [49]:
model = AdaBoostRegressor(DecisionTreeRegressor())

params_features = np.arange(
                math.trunc(len(df.columns)/3),
                len(df.columns)
                )

params = {
            "n_estimators": np.arange(150,200,10),
            "learning_rate": np.arange(3,4,0.1)
          }

grid = GridSearchCV(estimator=model, param_grid=params,cv=10)
grid.fit(Xtrain, Ytrain)

In [50]:
print(grid.best_score_)
print(grid.best_params_)

0.8962422061526845
{'learning_rate': np.float64(3.3000000000000003), 'n_estimators': np.int64(180)}


In [51]:
best_estimator = grid.best_estimator_

In [52]:
filename = f'reg_model_{len(top_vars)}vars.pkl'
pickle.dump(best_estimator, open(filename, 'wb'))

In [53]:
Yhat = best_estimator.predict(Xtest)

mae = mean_absolute_error(Ytest, Yhat)
r2 = r2_score(Ytest, Yhat)

print("mean absolute error: ",mae)
print("r2: ",r2)

mean absolute error:  2150.5751933065767
r2:  0.8914885586863145
