In [257]:
!pip install scikit-learn feature_engine scikit-plot



In [259]:
import sqlite3
import pandas as pd
import scikitplot as skplt

from feature_engine import imputation
from feature_engine import encoding
from sklearn.pipeline import Pipeline
from sklearn import tree, ensemble, metrics

In [234]:
def import_query(path):
    with open(path, 'r') as file:
        query = file.read()
    return query

In [235]:
db_origin = sqlite3.connect(r'C:\Adam\Programação\Projetos\IA\MLGC\data\silver_gc.db')
query_abt = import_query(r'C:\Adam\Programação\Projetos\IA\MLGC\src\targetedFeatures.sql')
df = pd.read_sql(query_abt, db_origin)

# SEMMA
## Sample

In [236]:
columns = df.columns
target = ['flNaoChurn']
ids = ['dtRef', 'idPlayer']
to_remove = ['flAssinatura']

features = list(set(columns) - set(target) - set(ids) - set(to_remove))

In [237]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# Explore

In [238]:
X_train.dtypes[X_train.dtypes == 'object']

descMedal    object
dtype: object

In [239]:
# missing values

missing_columns = X_train.count()[X_train.count() < X_train.shape[0]].index.sort_values(ascending=True).to_list()
missing_flag = [
    'WinRate',
    'avg1Kill',
    'avg2Kill',
    'avg3Kill',
    'avg4Kill',
    'avg5Kill',
    'avgAssist',
    'avgBombeDefuse',
    'avgBombePlant',
    'avgClutchWon',
    'avgDamage',
    'avgDeath',
    'avgFirstKill',
    'avgFlashAssist',
    'avgHitChest',
    'avgHitHeadshot',
    'avgHitLeftAtm',
    'avgHitLeftLeg',
    'avgHitRightArm',
    'avgHitRightLeg',
    'avgHitStomach',
    'avgHits',
    'avgHs',
    'avgHsRate',
    'avgKill',
    'avgLastAlive',
    'avgPlusKill',
    'avgRoundsPlayed',
    'avgShots',
    'avgSurvived',
    'avgTk',
    'avgTkAssist',
    'avgTrade',
    'qtRecencia',
    'vlHsRate',
    'vlLevel',
]

missing_zero = [
    'avgTrade',
    'propAncient',
    'propDias00',
    'propDias01',
    'propDias02',
    'propDias03',
    'propDias04',
    'propDias05',
    'propDias06',
    'propDust2',
    'propInferno',
    'propMirage',
    'propNuke',
    'propOverpass',
    'propTrain',
    'propVertigo',
    'qtDias',
    'qtPartidas',
]

cat_features = X_train.dtypes[X_train.dtypes == 'object'].index.tolist()

In [240]:
print(cat_features)

['descMedal']


In [241]:
print(X_train.isna().sum())

avgHs             2375
avg3Kill          2375
propMirage        2375
qtMedal              0
avgFlashAssist    2375
                  ... 
descMedal            0
avg2Kill          2375
avgHits           2375
avgHitRightLeg    2375
avgHitStomach     2375
Length: 62, dtype: int64


## Modify

In [254]:
fe_missing_flag = imputation.ArbitraryNumberImputer(variables=missing_flag,
                                                    arbitrary_number=-100)

fe_missing_zero = imputation.ArbitraryNumberImputer(variables=missing_zero,
                                                    arbitrary_number=0)

fe_onehot = encoding.OneHotEncoder(variables=cat_features)

## Modeling

In [255]:
model = ensemble.RandomForestClassifier(min_samples_leaf=25, n_estimators=250)

model_pipeline = Pipeline([("Missing flag", fe_missing_flag),
                           ("Missing 0", fe_missing_zero),
                           ("OneHotEnconder", fe_onehot),
                           ("Classificador", model),
                           ])

model_pipeline.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [250]:
y_predict_train = model_pipeline.predict(X_train)

acc_train = metrics.accuracy_score(y_train, y_predict_train)
print(f"Acurácia de treino: {acc_train}")

Acurácia de treino: 0.9290760101723651


In [251]:
y_predict_test = model_pipeline.predict(X_test)
y_probas = model_pipeline.predict_probas(X_test)

acc_test = metrics.accuracy_score(y_test, y_predict_test)
print(f"Acurácia de teste: {acc_test}")

Acurácia de teste: 0.8928571428571429


In [253]:
features_fit = model_pipeline[:-1].transform(X_train).columns.tolist()

features_importance = pd.Series(model.feature_importances_, index=features_fit)
features_importance.sort_values(ascending=False).head(15)

qtDiasExpericao     0.262012
qtDiasAssinatura    0.035062
avgShots            0.021049
avgHsRate           0.020366
propMirage          0.020085
avgBombePlant       0.017913
avgHitRightArm      0.017544
avgHitRightLeg      0.016938
avgTkAssist         0.016413
propVertigo         0.016389
avgHitHeadshot      0.016135
avgHitChest         0.016051
avgHitStomach       0.015839
propDust2           0.015648
avgSurvived         0.015538
dtype: float64

In [None]:
skplt.metrics.plot_roc(y_test, y_probas)

In [None]:
skplt.metrics.plot_ks_statistic(y_test, y_probas)

In [None]:
skplt.metrics.plot_cumulative_gain(y_test, y_probas)

In [None]:
skplt.metrics.plot_lift_curve(y_test, y_probas)