In [1]:
import logging
import datetime
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import getpass
import seaborn as sns
import copy
from schema_buddy import SchemaBuddy
from custom_pipelines import PdFeatureUnion, Pandify, SelectCols, DropCols, TruncOrdinalFreqEncoder, OHE, CategoryFrequency

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, FunctionTransformer, QuantileTransformer, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, precision_recall_curve, auc, roc_auc_score, confusion_matrix
from pandas_profiling import ProfileReport
from janitor import transform_column, rename_columns, filter_column_isin, select_columns, drop_constant_columns

# Turn off pandas' chained-assignment check for the whole notebook (pandas' default is 'warn').
pd.set_option("mode.chained_assignment", None)

In [None]:
# NOTE(review): this cell carries no execution count and looks broken as written:
#   * the import cell only binds `plt` (via `import matplotlib.pyplot as plt`); the bare
#     name `matplotlib` is never bound there;
#   * matplotlib's backend selector is `matplotlib.use(<backend>)`, and no backend is named here.
# TODO confirm the intended backend, then either fix this call or delete the cell.
matplotlib.use_backend()

In [2]:
# Read the game statistics CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="./data/GameStats.csv")
df.head(n=5)

In [4]:
# Settings dict passed to SchemaBuddy alongside the DataFrame.
# NOTE(review): the first key is spelled "taget_col" — presumably "target_col" was intended.
# Confirm which spelling SchemaBuddy actually reads before renaming it.
conf = dict(
    taget_col="IsWin",
    vtype_override={},
)

In [5]:
# Build the SchemaBuddy (project-local helper from `schema_buddy`) from the loaded
# DataFrame and the settings dict.
schema = SchemaBuddy(df, conf)

In [6]:
# Render SchemaBuddy's per-variable summary; as the cell's last expression it is displayed as output.
schema.get_styled_variables_summary()

Unnamed: 0,vtype,is_unique,n_missing,n_distinct,p_distinct,p_zeros,p_negative,vtype_override,dtype
IsWin,boolean,False,0.0,2.0,0.0,.,.,False,boolean
Date,categorical,False,0.0,6770.0,0.994,.,.,False,string
Level,categorical,False,0.0,19.0,0.003,.,.,False,string
NumBlocks,numeric,False,0.0,18.0,0.003,0.000,0.000,False,Int64
ElapsedTime,numeric,False,0.0,6789.0,0.996,0.000,0.000,False,Float64
Score,numeric,False,0.0,176.0,0.026,0.041,0.000,False,Int64
Accuracy,numeric,False,0.0,6778.0,0.995,0.002,0.000,False,Float64


# EDA by variable type

In [9]:
# Pairwise linear correlation of the numeric columns, drawn as a lower-triangle heatmap.
bookkeeper = schema.get_bookkeeper()
use_cols = bookkeeper.pop_vtype("numeric")

corr = df[use_cols].corr()
# Boolean mask over the upper triangle (diagonal included): masked cells are not drawn,
# since they only mirror the lower triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(30, 15))
sns.heatmap(corr.round(2), annot=True, mask=mask, ax=ax)
# Tilt the x tick labels so long column names do not overlap.
plt.xticks(rotation=45, ha="right")
plt.show()

  plt.show()


In [19]:
def get_feature_preprocessor(schema):
    """Assemble the feature-preprocessing pipeline for the columns tracked by ``schema``.

    A bookkeeper is taken from ``schema.get_bookkeeper()`` and column groups are popped
    from it in a fixed order: the hard-coded drop list (extended with ``rank_features``),
    the numeric pass-through columns, then the "numeric", "boolean" and "categorical"
    vtypes. Each of the last four groups feeds one branch of a ``PdFeatureUnion``; every
    branch ends by imputing with the constant 0.0. ``bookkeeper.check()`` is called after
    all groups have been popped.

    NOTE(review): ``rank_features`` is read from the notebook's global namespace and is
    not defined in the part of the notebook visible here — confirm it exists before this
    function is called.

    Parameters
    ----------
    schema
        Object exposing ``get_bookkeeper()`` (a SchemaBuddy in this notebook).

    Returns
    -------
    sklearn.pipeline.Pipeline
        One-step pipeline (step name ``'union'``) wrapping the four-branch feature union.
    """
    # TODO features indicating if value was imputed
    bookkeeper = schema.get_bookkeeper()

    def zero_imputer_step():
        # A fresh imputer per call, so each branch owns its own estimator instance.
        return ('impute', Pandify(SimpleImputer(strategy="constant", fill_value=0.0)))

    # Columns that go into none of the branches.
    drop_list = [
        "beta",
        "severity",
        "category",
        "confidence",
        "sub_category",
        "started_time",
        "fk_participant_id",
        "flows_dst_port",
        "flows_app_domains",
        "flows_app_domains_domain_with_suffix",
        "flows_domain_registered_period",
        "flows_domain_threat_profile",  # highly correlated
        "flows_domain_threat_phishing",  # highly correlated
        "flows_domain_threat_proximity",  # highly correlated
        "flows_threat_intel_domain_match",
        "flows_threat_intel_ip_match",
    ] + rank_features

    # Numeric columns that skip the quantile scaling and are only imputed.
    num_passthrough_list = [
        "flows_tld_domain_norm_indegree",
        "flows_domain_norm_indegree",
    ]

    bookkeeper.pop_cols(drop_list)

    # The pop_* calls below run in the same order as before (pass-through, numeric,
    # boolean, categorical). Presumably each pop removes the returned columns from the
    # bookkeeper, in which case the order matters — TODO confirm.
    num_pass_branch = Pipeline([
        ('select', SelectCols(bookkeeper.pop_cols(num_passthrough_list), strict=False)),
        zero_imputer_step(),
    ])

    num_branch = Pipeline([
        ('select', SelectCols(bookkeeper.pop_vtype("numeric"), strict=False)),
        ('scale', Pandify(QuantileTransformer())),
        zero_imputer_step(),
    ])

    bool_branch = Pipeline([
        ('select', SelectCols(bookkeeper.pop_vtype("boolean"), strict=False)),
        ('bool2float', Pandify(FunctionTransformer(lambda x: pd.DataFrame(x).astype(float)))),
        zero_imputer_step(),
    ])

    cat_branch = Pipeline([
        ('select', SelectCols(bookkeeper.pop_vtype("categorical"), strict=False)),
        ('cat2freq', CategoryFrequency(use_proportion=True)),
        zero_imputer_step(),
    ])

    feature_union = PdFeatureUnion([
        ('num_pass', num_pass_branch),
        ('num', num_branch),
        ('bool', bool_branch),
        ('cat', cat_branch),
    ])
    preprocessor = Pipeline([('union', feature_union)])

    # Called only after every column group has been popped.
    bookkeeper.check()

    return preprocessor

In [None]:
# NOTE(review): this cell has no execution count and repeats, statement for statement, the
# first three lines of the cell that follows it — one of the two can probably be deleted; confirm.
# `get_xy` and `events_df` are not defined in the visible part of the notebook.
X, y = get_xy(events_df, "y_is_malicious")
# Build the preprocessing pipeline from the notebook-level `schema`.
preprocessor = get_feature_preprocessor(schema)
# Stratified shuffle-split cross-validation: 5 splits, seeded for reproducibility.
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)

In [22]:
# Modelling setup: features/target, preprocessing pipeline, CV splitter and one candidate model.
# NOTE(review): `get_xy`, `events_df` and `models` are not defined in the visible part of the
# notebook — confirm they are created by earlier cells so Restart & Run All succeeds.
X, y = get_xy(events_df, "y_is_malicious")
# Build the preprocessing pipeline from the notebook-level `schema`.
preprocessor = get_feature_preprocessor(schema)
# Stratified shuffle-split cross-validation: 5 splits, seeded for reproducibility.
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)
# L1-penalised logistic regression, registered under the key "LR_l1".
models["LR_l1"] = LogisticRegression(penalty="l1", max_iter=1000, solver="saga")

All cols accounted
