In [1]:
import pandas as pd
import numpy as np

import warnings
# Install a blanket "ignore" filter so no warnings are shown by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Read only the leading rows of each CSV: 10000 for training, 100 for testing.
train_data = pd.read_csv("train_all.csv", nrows=10000)
test_data = pd.read_csv("test_all.csv", nrows=100)

In [3]:
# The `label` column is the prediction target.
target = train_data['label'].values

# Every other column except `user_id` is used as a model input, in file order.
features_columns = [
    name for name in train_data.columns
    if name not in ('user_id', 'label')
]
train = train_data[features_columns].values
test = test_data[features_columns].values

In [4]:
# using mean value to replace missing values
from sklearn.impute import SimpleImputer

# Fit the mean imputer on the training matrix only, then apply that same
# fitted imputer to both the training and the test matrix.
# NOTE(review): the cells visible below keep passing `train`/`test`, not
# `train_imputer`/`test_imputer` — confirm whether that is intended.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan).fit(train)
train_imputer = imputer.transform(train)
test_imputer = imputer.transform(test)

In [5]:
# feature selection
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def feature_selection(train, train_sel, target):
    """Print 5-fold cross-validation scores for the full and the reduced feature sets.

    A random forest (100 trees, depth 2, fixed seed) is scored with
    ``cross_val_score`` once on ``train`` and once on ``train_sel``; each
    result is printed as ``mean (+/- 2 * std)``.

    Parameters
    ----------
    train : array-like
        Feature matrix containing every feature.
    train_sel : array-like
        Feature matrix restricted to the selected features.
    target : array-like
        Labels used for both evaluations.

    Returns
    -------
    None
        The scores are only printed.
    """
    clf = RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state = 0, n_jobs = -1)

    scores = cross_val_score(clf, train, target, cv = 5)
    scores_sel = cross_val_score(clf, train_sel, target, cv = 5)

    print("No Select Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    # Report the selected-feature scores here; printing `scores` again would
    # make both lines identical no matter which features were selected.
    print("Features Select Accuracy: %0.2f (+/- %0.2f)" % (scores_sel.mean(), scores_sel.std() * 2))

In [6]:
# delete low variance features
from sklearn.feature_selection import VarianceThreshold

# Variance filter with threshold 0.8 * (1 - 0.8), fitted on the training
# matrix and then applied to both matrices.
sel = VarianceThreshold(threshold=(.8 * (1 - .8))).fit(train)
train_sel, test_sel = sel.transform(train), sel.transform(test)
print('Before feature selection', train.shape)
print('After feature selection', train_sel.shape)

Before feature selection (2000, 229)
After feature selection (2000, 25)


In [7]:
# Compare cross-validated scores on all features vs. the current `train_sel`
# (the variance-filtered matrix when cells are run in order).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)


In [8]:
# SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# SelectKBest scored by mutual information with the label, keeping k = 2
# columns; fitted on the training matrix and applied to both matrices.
sel = SelectKBest(mutual_info_classif, k=2).fit(train, target)
train_sel, test_sel = sel.transform(train), sel.transform(test)
print('Before feature selection', train.shape)
print('After feature selection', train_sel.shape)

Before feature selection (2000, 229)
After feature selection (2000, 2)


In [9]:
# Same mutual-information selector, this time keeping k = 10 columns.
sel = SelectKBest(mutual_info_classif, k=10).fit(train, target)
train_sel, test_sel = sel.transform(train), sel.transform(test)
print('Before feature selection', train.shape)
print('After feature selection', train_sel.shape)

Before feature selection (2000, 229)
After feature selection (2000, 10)


In [10]:
# Compare cross-validated scores on all features vs. the current `train_sel`
# (the SelectKBest k = 10 matrix when cells are run in order).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)


In [11]:
# recursive, RFECV
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination with 2-fold cross-validation (step = 1)
# around a small, shallow random forest.
clf = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0, n_jobs=-1)
selector = RFECV(clf, step=1, cv=2).fit(train, target)

# Print the boolean support mask, then the per-feature ranking.
print(selector.support_, selector.ranking_, sep='\n')

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False  True False False False False False  True
 False False False False  True False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [12]:
# SelectFromModel, l2
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

# Pass both matrices through a Normalizer fitted on the training matrix.
normalizer = Normalizer().fit(train)
train_norm, test_norm = normalizer.transform(train), normalizer.transform(test)

# The L2-penalised logistic regression is trained on the normalised inputs;
# SelectFromModel then reuses the already-fitted model (prefit=True).
LR = LogisticRegression(penalty='l2', C=5).fit(train_norm, target)
model = SelectFromModel(LR, prefit=True)

# The selector is applied to the original `train`/`test` arrays, not to the
# normalised copies.
train_sel, test_sel = model.transform(train), model.transform(test)
print('Before feature selection', train.shape)
print('After feature selection', train_sel.shape)

Before feature selection (2000, 229)
After feature selection (2000, 19)


In [13]:
# Display the first ten entries of the first coefficient row of `LR`.
LR.coef_[0][:10]

array([ 0.27519559, -0.02736298, -0.0052266 ,  0.90644115, -0.43100179,
       -0.25111392, -0.40588933,  0.29059361,  0.10568269, -0.02731254])

In [14]:
# Compare cross-validated scores on all features vs. the current `train_sel`
# (the logistic-regression SelectFromModel matrix when cells are run in order).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)


In [15]:
# based on tree models' feature importance
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees ensemble and let SelectFromModel pick columns from the
# already-fitted model (prefit=True).
# The estimator is seeded like the other estimators in this notebook
# (random_state = 0) so a rerun selects the same columns.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
clf = clf.fit(train, target)

model = SelectFromModel(clf, prefit=True)
train_sel = model.transform(train)
test_sel = model.transform(test)
print('Before feature selection', train.shape)
# This line reports the post-selection shape, so it is labelled "After".
print('After feature selection', train_sel.shape)

Before feature selection (2000, 229)
Before feature selection (2000, 67)


In [16]:
# Display the first ten entries of the fitted classifier's feature importances.
clf.feature_importances_[:10]

array([0.08935271, 0.01634579, 0.01091265, 0.01627903, 0.01381999,
       0.01336046, 0.01446831, 0.01515362, 0.01668204, 0.0073939 ])

In [17]:
# Pair each value of `clf.feature_importances_` with its feature name.
df_features_import = pd.DataFrame({
    'features_import': clf.feature_importances_,
    'features_name': features_columns,
})

In [18]:
# Show the 30 rows with the largest importance, highest first.
df_features_import.sort_values(by='features_import', ascending=False).head(30)

Unnamed: 0,features_import,features_name
0,0.089353,merchant_id
228,0.082163,xgb_clf
227,0.074995,lgb_clf
20,0.020701,brand_most_1_cnt
18,0.017545,seller_most_1_cnt
14,0.016988,seller_most_1
12,0.016932,time_stamp_std
8,0.016682,time_stamp_nunique
21,0.016516,action_type_1_cnt
22,0.016431,user_cnt_0


In [19]:
# Compare cross-validated scores on all features vs. the current `train_sel`
# (the tree-importance SelectFromModel matrix when cells are run in order).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)


In [20]:
# lightgbm feature importance
import lightgbm
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows; the held-out part is the validation set that
# drives early stopping below.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.4, random_state = 0)

# `clf` is rebound to the lightgbm module itself and used as an alias below.
clf = lightgbm

train_matrix = clf.Dataset(X_train, label = y_train)
test_matrix = clf.Dataset(X_test, label = y_test)
# NOTE(review): `tree_method`, `colsample_bylevel` and `silent` look like
# XGBoost-style keys — confirm LightGBM actually honours them.
params = {
          'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'metric': 'multi_logloss',
          'min_child_weight': 1.5,
          'num_leaves': 2 ** 5,
          'lambda_l2': 10,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'learning_rate': 0.03,
          'tree_method': 'exact',
          'seed': 2017,
          "num_class": 2,
          'silent': True,
          }
num_round = 10000
early_stopping_rounds = 100
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `train()` was removed in LightGBM 4.0,
# while the callback is accepted by both older and newer releases.
model = clf.train(params,
                  train_matrix,
                  num_round,
                  valid_sets = [test_matrix],
                  callbacks = [clf.early_stopping(stopping_rounds = early_stopping_rounds)])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6627
[LightGBM] [Info] Number of data points in the train set: 1200, number of used features: 124
[LightGBM] [Info] Start training from score -0.068100
[LightGBM] [Info] Start training from score -2.720629
[1]	valid_0's multi_logloss: 0.257311
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.257207
[3]	valid_0's multi_logloss: 0.257206
[4]	valid_0's multi_logloss: 0.257058
[5]	valid_0's multi_logloss: 0.257099
[6]	valid_0's multi_logloss: 0.257186
[7]	valid_0's multi_logloss: 0.25711
[8]	valid_0's multi_logloss: 0.25729
[9]	valid_0's multi_logloss: 0.257274
[10]	valid_0's multi_logloss: 0.257255
[11]	valid_0's multi_logloss: 0.257242
[12]	valid_0's multi_logloss: 0.257399
[13]	valid_0's multi_logloss: 0.257462
[14]	valid_0's multi_logloss: 0.257411
[15]	valid_0's multi_logloss: 0.257708
[16]	valid_0's multi_logloss: 0.257676
[17]	valid_0's multi_logloss: 0.

[62]	valid_0's multi_logloss: 0.260214
[63]	valid_0's multi_logloss: 0.260323
[64]	valid_0's multi_logloss: 0.260542
[65]	valid_0's multi_logloss: 0.260581
[66]	valid_0's multi_logloss: 0.260687
[67]	valid_0's multi_logloss: 0.260703
[68]	valid_0's multi_logloss: 0.260915
[69]	valid_0's multi_logloss: 0.261177
[70]	valid_0's multi_logloss: 0.26126
[71]	valid_0's multi_logloss: 0.261315
[72]	valid_0's multi_logloss: 0.261418
[73]	valid_0's multi_logloss: 0.261392
[74]	valid_0's multi_logloss: 0.26148
[75]	valid_0's multi_logloss: 0.261745
[76]	valid_0's multi_logloss: 0.261671
[77]	valid_0's multi_logloss: 0.261726
[78]	valid_0's multi_logloss: 0.26182
[79]	valid_0's multi_logloss: 0.261688
[80]	valid_0's multi_logloss: 0.261742
[81]	valid_0's multi_logloss: 0.261835
[82]	valid_0's multi_logloss: 0.261955
[83]	valid_0's multi_logloss: 0.262051
[84]	valid_0's multi_logloss: 0.261904
[85]	valid_0's multi_logloss: 0.261905
[86]	valid_0's multi_logloss: 0.261851
[87]	valid_0's multi_logloss

In [21]:
def lgb_transform(train, test, model, topK):
    """Keep the ``topK`` columns that ``model`` ranks as most important.

    Parameters
    ----------
    train, test : 2-D array-like
        Feature matrices; columns are addressed by position.
    model : object
        Must provide ``feature_importance()`` returning one value per
        column of ``train``.
    topK : int
        Number of highest-importance columns to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` and ``test`` restricted to the selected columns, ordered
        from most to least important.
    """
    n_features = train.shape[1]

    # Wrap both inputs in frames whose columns are plain positional indices.
    train_df = pd.DataFrame(train)
    train_df.columns = range(n_features)
    test_df = pd.DataFrame(test)
    test_df.columns = range(test.shape[1])

    # Rank the column positions by importance, largest first.
    ranking = pd.DataFrame({
        'importance': model.feature_importance(),
        'col': range(n_features),
    })
    best = ranking.sort_values(['importance'], ascending=False).head(topK)
    sel_col = list(best['col'])

    return train_df[sel_col], test_df[sel_col]

In [22]:
# Keep the 20 columns with the largest importance according to `model`.
train_sel, test_sel = lgb_transform(train, test, model, 20)
print('Before feature selection', train.shape)
print('After feature selection', train_sel.shape)

Before feature selection (2000, 229)
After feature selection (2000, 20)


In [23]:
# Display the first ten entries of the model's feature-importance array.
model.feature_importance()[:10]

array([ 8,  4,  0, 12,  2, 12,  7,  8,  6,  0])

In [24]:
# Compare cross-validated scores on all features vs. the current `train_sel`
# (the 20 columns chosen by `lgb_transform` when cells are run in order).
feature_selection(train, train_sel, target)

No Select Accuracy: 0.93 (+/- 0.00)
Features Select Accuracy: 0.93 (+/- 0.00)
