# Peparacion del data frame

## Configuración inicial y carga de datos

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)

Cargar archivo

In [3]:
import csv

with open('Anexo ET_demo_round_traces_2022.csv') as csvfile:
    df = pd.read_csv(csvfile, sep=";")

Limpiar nulos

In [4]:
df.dropna(how='any', inplace=True)
null_values_new = df.isnull().sum()
print(null_values_new[null_values_new > 0])

Series([], dtype: int64)


Quitar variables con las que no se van a usar en el modelo

In [5]:
droped_vars = ("AbnormalMatch","TravelledDistance","Map","InternalTeamId","MatchId",
               "RoundId","RoundWinner","Survived","RLethalGrenadesThrown","RNonLethalGrenadesThrown",
               "PrimaryAssaultRifle","PrimarySniperRifle","PrimaryHeavy","PrimarySMG","PrimaryPistol",
               "RoundAssists","RoundHeadshots","RoundFlankKills","MatchFlankKills","MatchAssists","MatchHeadshots",
               "MatchKills","Unnamed: 0")

for var in droped_vars:
  df = df.drop(var, axis=1)


transformar objeto a int

In [6]:
# Transformar variables Objecto a int
try:
  df['TimeAlive'] = pd.to_numeric(df['TimeAlive'].str.replace('.', ''))
  df['FirstKillTime'] = pd.to_numeric(df['FirstKillTime'].str.replace('.', ''))
except Exception as e:
  print(f'{e}\n')


Transformar las variables categoricas Team y MatchWinner a binario.

Team:
* Terrorist: 0
* CounterTerrorist: 1

MatchWinner:
* True: 1
* False: 0

In [7]:
df.Team.replace(to_replace=['Terrorist', 'CounterTerrorist'], value=[0, 1], inplace=True )
df.MatchWinner.replace(to_replace=['False', 'True'], value=[0, 1], inplace=True )
df.MatchWinner = df.MatchWinner.astype(int)

In [8]:
df.head()

Unnamed: 0,Team,MatchWinner,TimeAlive,FirstKillTime,RoundKills,RoundStartingEquipmentValue,TeamStartingEquipmentValue
0,0,1,51120248995704500,0,0,750,4400
1,0,1,4348662552266170,0,0,800,4400
2,0,1,3735469847054540,0,0,1000,4400
3,0,1,4767886136441470,0,0,850,4400
4,0,1,5312251089027310,0,0,1000,4400


## Objetivo

Utilizar las variables para determinar el tiempo de vida (TimeAlive) del jugador

# Random Forest

In [9]:
import sklearn.metrics
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


In [10]:
x = np.array(df.drop("TimeAlive",axis=1))
y = np.array(df["TimeAlive"])
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x,y,test_size=0.2)

In [12]:
model = RandomForestRegressor()

params = {"n_estimators": [100,200,300],
          "max_features": ['sqrt'],
          "max_depth": np.arange(4,7), 
          "min_samples_split": np.arange(4,7), 
          "max_leaf_nodes": np.arange(10,25,5)}

grid = GridSearchCV(estimator=model, param_grid=params,cv=5)
_=grid.fit(Xtrain, Ytrain)

In [13]:
print(grid.best_score_)
print(grid.best_params_)

0.011300718941044053
{'max_depth': np.int64(6), 'max_features': 'sqrt', 'max_leaf_nodes': np.int64(20), 'min_samples_split': np.int64(6), 'n_estimators': 200}


In [14]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

Yhat = grid.predict(Xtest)

mse = mean_squared_error(Ytest, Yhat)
mae = mean_absolute_error(Ytest, Yhat)
R2 = r2_score(Ytest, Yhat)

print("MSE: ",mse)
print("MAE: ",mae)
print("R^2: ",R2)

MSE:  1.9931296825487322e+32
MAE:  9493997887812936.0
R^2:  0.009990560678114768
