In [107]:
%matplotlib inline
import numpy as np
import pandas as pd
import string
from numpy.random import normal,random,seed
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import log_loss
from sklearn import metrics
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from scipy import stats
import pickle

import lightgbm as lgb
import xlearn as xl

In [3]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
Data = pd.read_csv("data/train.csv", dtype=dtypes)

In [14]:
Data["Wdft_RegionIdentifier"].isna().sum()

303451

In [28]:
Data[["AVProductStatesIdentifier", "CountryIdentifier"]].head()

Unnamed: 0,AVProductStatesIdentifier,CountryIdentifier
0,53447.0,29
1,53447.0,93
2,53447.0,86
3,53447.0,88
4,53447.0,18


In [154]:
def ChiSqTestForIndependence(col1, col2):

    contingency_table = pd.crosstab(
        Data[col1],
        Data[col2],
        margins = True
    )

    observed = contingency_table.iloc[0:contingency_table.shape[0]-1, 0: contingency_table.shape[1]-1]
    chi_stats = stats.chi2_contingency(observed= observed)[0:3]

    print(col1+", "+col2)
    print("Chi-Sq Stat: {}".format(chi_stats[0])+", p-value: {}".format(chi_stats[1])+", Deg of Freedom: {}".format(chi_stats[2]))


In [158]:
column_names = Data.columns
column_names = column_names.drop('MachineIdentifier')
column_names = column_names.drop('HasDetections')

In [159]:
column_names

Index(['ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'IsBeta',
       'RtpStateBitfield', 'IsSxsPassiveMode', 'DefaultBrowsersIdentifier',
       'AVProductStatesIdentifier', 'AVProductsInstalled', 'AVProductsEnabled',
       'HasTpm', 'CountryIdentifier', 'CityIdentifier',
       'OrganizationIdentifier', 'GeoNameIdentifier',
       'LocaleEnglishNameIdentifier', 'Platform', 'Processor', 'OsVer',
       'OsBuild', 'OsSuite', 'OsPlatformSubRelease', 'OsBuildLab',
       'SkuEdition', 'IsProtected', 'AutoSampleOptIn', 'PuaMode', 'SMode',
       'IeVerIdentifier', 'SmartScreen', 'Firewall', 'UacLuaenable',
       'Census_MDC2FormFactor', 'Census_DeviceFamily',
       'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier',
       'Census_ProcessorCoreCount', 'Census_ProcessorManufacturerIdentifier',
       'Census_ProcessorModelIdentifier', 'Census_ProcessorClass',
       'Census_PrimaryDiskTotalCapacity', 'Census_PrimaryDiskTypeName',
       'Census_SystemVolumeTotalCapa

In [161]:
for i in range(1,len(column_names)):
    for j in range(1,len(column_names)):
        if(j>i):
            ChiSqTestForIndependence(column_names[i], column_names[j])

EngineVersion, AppVersion
Chi-Sq Stat: 17848109.928673282, p-value: 0.0, Deg of Freedom: 7521
EngineVersion, AvSigVersion
Chi-Sq Stat: 433822720.8792209, p-value: 0.0, Deg of Freedom: 588570
EngineVersion, IsBeta
Chi-Sq Stat: 94.48103603948164, p-value: 0.022574910104172063, Deg of Freedom: 69
EngineVersion, RtpStateBitfield
Chi-Sq Stat: 28538.41250364159, p-value: 0.0, Deg of Freedom: 396
EngineVersion, IsSxsPassiveMode
Chi-Sq Stat: 2061.0284881334937, p-value: 0.0, Deg of Freedom: 69
EngineVersion, DefaultBrowsersIdentifier
Chi-Sq Stat: 1864825.754715597, p-value: 0.0, Deg of Freedom: 108927
EngineVersion, AVProductStatesIdentifier
Chi-Sq Stat: 8980044.118852444, p-value: 0.0, Deg of Freedom: 1998861
EngineVersion, AVProductsInstalled
Chi-Sq Stat: 146212.37296987115, p-value: 0.0, Deg of Freedom: 483
EngineVersion, AVProductsEnabled
Chi-Sq Stat: 24313.838997061554, p-value: 0.0, Deg of Freedom: 345
EngineVersion, HasTpm
Chi-Sq Stat: 5070.55643707623, p-value: 0.0, Deg of Freedom: 69


KeyboardInterrupt: 

In [None]:

expected =  np.outer(contingency_table["All"][:],
                     contingency_table.loc["All"][:]) / (contingency_table["All"].iloc[contingency_table.shape[0]-1])

expected = pd.DataFrame(expected)
expected.columns = np.asarray(contingency_table.columns)
expected.index = np.asarray(contingency_table.index)
expected = expected.iloc[0:expected.shape[0]-1, 0: expected.shape[1]-1]



In [113]:
crit = stats.chi2.ppf(q = 0.95, # Find the critical value for 95% confidence*
                      df = observed.shape[0]+observed.shape[1] - 1)   # *

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=observed.shape[0]+observed.shape[1] - 1)
print("P value")
print(p_value)

Critical value
29589.5693535828
P value
0.0


In [156]:

class ChiSquare:
    def __init__(self, dataframe):
        self.df = dataframe
        self.p = None #P-Value
        self.chi2 = None #Chi Test Statistic
        self.dof = None
        
        self.dfObserved = None
        self.dfExpected = None
        
    def _print_chisquare_result(self, colX, alpha):
        result = ""
        if self.p<alpha:
            result="{0} is IMPORTANT for Prediction".format(colX)
        else:
            result="{0} is NOT an important predictor. (Discard {0} from model)".format(colX)

        print(result)
        
    def TestIndependence(self,colX,colY, alpha=0.05):
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)
        
        self.dfObserved = pd.crosstab(Y,X) 
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof 
        
        self.dfExpected = pd.DataFrame(expected, columns=self.dfObserved.columns, index = self.dfObserved.index)
        
        self._print_chisquare_result(colX,alpha)



In [160]:
#Initialize ChiSquare Class
cT = ChiSquare(Data)

#Feature Selection
for var in column_names:
    cT.TestIndependence(colX=var,colY="HasDetections" ) 

ProductName is IMPORTANT for Prediction
EngineVersion is IMPORTANT for Prediction
AppVersion is IMPORTANT for Prediction
AvSigVersion is IMPORTANT for Prediction
IsBeta is NOT an important predictor. (Discard IsBeta from model)
RtpStateBitfield is IMPORTANT for Prediction
IsSxsPassiveMode is IMPORTANT for Prediction
DefaultBrowsersIdentifier is IMPORTANT for Prediction
AVProductStatesIdentifier is IMPORTANT for Prediction
AVProductsInstalled is IMPORTANT for Prediction
AVProductsEnabled is IMPORTANT for Prediction
HasTpm is IMPORTANT for Prediction
CountryIdentifier is IMPORTANT for Prediction
CityIdentifier is IMPORTANT for Prediction
OrganizationIdentifier is IMPORTANT for Prediction
GeoNameIdentifier is IMPORTANT for Prediction
LocaleEnglishNameIdentifier is IMPORTANT for Prediction
Platform is IMPORTANT for Prediction
Processor is IMPORTANT for Prediction
OsVer is IMPORTANT for Prediction
OsBuild is IMPORTANT for Prediction
OsSuite is IMPORTANT for Prediction
OsPlatformSubRelease i