In [1]:
%matplotlib inline

In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
import sklearn.feature_selection as sfs
from collections import defaultdict
from SafeTransformer import SafeTransformer

from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer

from tqdm import tqdm_notebook as tqdm

In [3]:
# Shared random seed; it is passed as `random_state` in the grid-search parameter grids below.
seed = 20

In [4]:
# Load the space-separated training matrix (the file has no header row) and discard column 500.
# NOTE(review): column 500 is presumably an empty artefact of a trailing separator — confirm.
df = (
    pd.read_csv('../data/artificial_train.data', sep=' ', header=None)
    .drop(columns=[500])
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461
4,484,502,528,489,466,481,402,478,487,468,...,488,479,452,435,486,508,481,504,495,511


In [5]:
# The labels file has no header; keep its first (position 0) column as a Series.
labels = pd.read_csv('../data/artificial_train.labels', header=None).iloc[:, 0]
labels.head()

0   -1
1   -1
2   -1
3    1
4    1
Name: 0, dtype: int64

In [30]:
from collections import Counter

# Feature voting: a column gets one vote per round in which it passes all three
# filters (chi2 p-value < 0.05, ANOVA F-test p-value < 0.05, and a top
# mutual-information rank).
n_rounds = 50

# chi2 and f_classif are deterministic, so their p-value filters are computed once
# instead of being recomputed identically on every round.
p1 = set(df.columns[sfs.chi2(df, labels)[1] < 0.05].values)
p2 = set(df.columns[sfs.f_classif(df, labels)[1] < 0.05].values)
univariate_cols = p1.intersection(p2)

c = Counter()
for i in tqdm(range(n_rounds), total=n_rounds):
    # A distinct but fixed random_state per round keeps the rounds different from
    # each other while making the whole cell reproducible on re-run.
    x = sfs.mutual_info_classif(df, labels, random_state=seed + i)
    # argsort yields positions; translate them to column labels so the intersection
    # with p1/p2 (which hold labels) stays correct even if labels != positions.
    # NOTE(review): [:-50:-1] keeps the 49 highest-scoring features, not 50 — confirm intent.
    p3 = set(df.columns[np.argsort(x)[:-50:-1]].values)
    important_cols = univariate_cols.intersection(p3)
    c.update(important_cols)

c.most_common()

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




[(241, 50),
 (336, 49),
 (338, 49),
 (475, 46),
 (105, 45),
 (128, 44),
 (472, 25),
 (442, 20),
 (329, 15),
 (298, 13),
 (481, 7),
 (199, 6),
 (64, 5),
 (119, 5),
 (296, 5),
 (10, 4),
 (55, 4),
 (204, 4),
 (48, 4),
 (431, 4),
 (286, 3),
 (430, 3),
 (494, 2),
 (453, 1),
 (378, 1),
 (411, 1)]

In [25]:
# Grid-search a random forest on a hand-picked list of eight feature columns,
# scored with balanced accuracy under 5-fold cross-validation.
cols = [241, 336, 338, 475, 105, 128, 472, 442]

# Instantiate the estimator in this cell: reading top to bottom, no earlier cell
# defines `model`, so on a fresh kernel the search would raise NameError.
model = RandomForestClassifier()

params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 50, 100],
    'n_estimators': [20, 100, 300, 500, 1000],
    'n_jobs': [-1],
    'random_state': [seed],
}
search = GridSearchCV(model, params, cv=5, n_jobs=-1, scoring=make_scorer(balanced_accuracy_score))
search.fit(df[cols], labels)
search.best_params_, search.best_score_

({'criterion': 'entropy',
  'min_samples_split': 2,
  'n_estimators': 100,
  'n_jobs': -1,
  'random_state': 20},
 0.808)

## Find the best predictive feature pairs

In [42]:
# Hand-collected groups of candidate feature columns.
a = [
    [48, 128, 153, 241, 451, 472, 493],
    [48, 64, 128, 153, 241, 338, 451, 472, 493],
    [48, 64, 128, 153, 241, 338, 451, 455, 472, 493],
    [48, 64, 128, 153, 241, 451, 472, 493],
    [64, 128, 451, 472, 153, 475, 493, 48],
    [475, 48, 323, 424, 296, 204, 496, 431, 211, 481, 282, 10, 430, 53, 205, 248, 384, 377, 425, 226],
    [28, 318, 451],
    [48, 378],
    [64, 336],
    [105, 128],
    [153, 281, 433],
    [241, 475],
    [442, 472],
    [453, 493],
]

# Subset noted for reference: [64, 128, 451, 472, 153, 475, 493, 48]

# Every feature that received at least one vote in the Counter `c` forms one more group.
a += [list(c.keys())]

# Fold all groups into a single pool of distinct candidate features.
f_set = set()
for group in a:
    f_set = f_set.union(group)

print(len(f_set))

46


[64, 128, 451, 472, 153, 475, 493, 48]
chosen = set()

In [44]:
# Exhaustive search over every pair of features in `f_set`: each pair gets its own
# random-forest grid search (2-fold CV) and the best CV score per pair is recorded.
all_scores = []

model = RandomForestClassifier()
best_score, best_params, best_comb = 0, None, None

# The parameter grid and the scorer are the same for every pair, so build them once
# rather than on each iteration.
params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 50, 100],
    'n_estimators': [20, 100, 300, 1000],
    'n_jobs': [-1],
    'random_state': [seed],
}
# Select with balanced accuracy, the same metric the evaluation cells report.
scorer = make_scorer(balanced_accuracy_score)

for comb in tqdm(list(itertools.combinations(f_set, 2))):
    search = GridSearchCV(model, params, cv=2, n_jobs=-1, scoring=scorer)
    t_df = df[list(comb)]
    search.fit(t_df, labels)
    # Strict '>' keeps the first pair encountered when scores tie.
    if search.best_score_ > best_score:
        best_score = search.best_score_
        best_params = search.best_params_
        best_comb = comb
    all_scores.append((comb, search.best_score_))

print(best_score)
print(best_params)
print(best_comb)

HBox(children=(IntProgress(value=0, max=1035), HTML(value='')))

0.679
{'criterion': 'entropy', 'min_samples_split': 50, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 20}
(338, 241)


In [59]:
# Collect every feature that appears in at least one pair whose CV score exceeds 0.63.
# A set comprehension replaces the array -> squeeze -> flatten round trip and yields
# plain Python ints, so the printed set does not depend on NumPy's scalar repr.
promising_features = {
    int(feature)
    for pair, score in all_scores
    if score > 0.63
    for feature in pair
}
print(promising_features)

{128, 64, 453, 281, 105, 493, 378, 48, 241, 338, 433, 336, 472, 153, 442, 475, 28}


Add feature 338 to the best selection from notebook 1: (451, 378, 64, 128, 281, 241, 442, 493).

## Final set selection
#### Let's find the best subset

In [6]:
# Exhaustive subset search: for every combination of `final_columns` of each size
# in the range below, grid-search a random forest (5-fold CV) and record the best
# CV score, grouped by subset size.
final_columns = [451, 378, 64, 128, 281, 241, 442, 493, 338]
all_scores = defaultdict(list)

model = RandomForestClassifier()
best_score, best_params, best_comb = 0, None, None

# The parameter grid and the scorer do not depend on the subset, so build them once
# rather than inside the double loop.
params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 50, 100],
    'n_estimators': [20, 100, 300, 1000],
    'n_jobs': [-1],
    'random_state': [seed],
}
# Select with balanced accuracy, the same metric the evaluation cells report.
scorer = make_scorer(balanced_accuracy_score)

# NOTE(review): the range stops at len(final_columns) - 1, so the full column set is
# not tried here; it is searched separately in the cell that fits on df[final_columns].
for i in tqdm(range(5, len(final_columns))):
    for comb in itertools.combinations(final_columns, i):
        search = GridSearchCV(model, params, cv=5, n_jobs=-1, scoring=scorer)
        t_df = df[list(comb)]
        search.fit(t_df, labels)
        # Strict '>' keeps the first combination encountered when scores tie.
        if search.best_score_ > best_score:
            best_score = search.best_score_
            best_params = search.best_params_
            best_comb = comb
        all_scores[i].append((comb, search.best_score_))

print(best_score)
print(best_params)
print(best_comb)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


0.893
{'criterion': 'gini', 'min_samples_split': 2, 'n_estimators': 300, 'n_jobs': -1, 'random_state': 20}
(451, 378, 64, 128, 281, 241, 493, 338)


In [7]:
# For each subset size, report the combination with the highest recorded CV score.
for size, scored_combs in all_scores.items():
    top_comb, top_score = max(scored_combs, key=lambda entry: entry[1])
    print(size, top_comb, top_score)

5 (451, 378, 128, 281, 241) 0.8775
6 (451, 378, 64, 281, 241, 493) 0.887
7 (451, 378, 64, 128, 281, 241, 493) 0.8895
8 (451, 378, 64, 128, 281, 241, 493, 338) 0.893


In [21]:
# Cross-validate (5 folds, balanced accuracy) a random forest with fixed
# hyper-parameters on the chosen eight-feature subset.
chosen_set = [451, 378, 64, 128, 281, 241, 493, 338]
model = RandomForestClassifier(
    criterion='gini',
    min_samples_split=2,
    n_estimators=300,
    n_jobs=-1,
    random_state=20,
)
t_df = df[chosen_set]
sc = cross_validate(
    model,
    t_df,
    labels,
    cv=5,
    scoring=make_scorer(balanced_accuracy_score),
    return_train_score=False,
)
# Mean of the per-fold test scores.
np.mean(sc['test_score'])

0.893

In [8]:
# Grid-search the random forest on the full `final_columns` set (5-fold CV).
params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 50, 100],
    'n_estimators': [20, 100, 300, 500, 1000],
    'n_jobs': [-1],
    'random_state': [seed],
}
# Score with balanced accuracy so this search is comparable with the other
# balanced-accuracy grid searches and cross-validation cells in this notebook.
search = GridSearchCV(model, params, cv=5, n_jobs=-1, scoring=make_scorer(balanced_accuracy_score))
t_df = df[final_columns]
search.fit(t_df, labels)
search.best_score_, search.best_params_

{'criterion': 'entropy',
 'min_samples_split': 2,
 'n_estimators': 300,
 'n_jobs': -1,
 'random_state': 20}

In [22]:
# Augment the final feature set with squared versions of selected columns.
# NOTE(review): 442 is in `final_columns` but not in `cols`, so it gets no squared
# counterpart — confirm this is intended.
cols = [451, 378, 64, 128, 281, 241, 493, 338]
square_df = df[final_columns].copy()
for col in cols:
    # Vectorised power on the whole column instead of an element-wise apply(lambda).
    square_df[f'{col}^2'] = square_df[col] ** 2

square_df.columns

Index([    451,     378,      64,     128,     281,     241,     442,     493,
           338, '451^2', '378^2',  '64^2', '128^2', '281^2', '241^2', '493^2',
       '338^2'],
      dtype='object')

In [16]:
# Grid-search the random forest on the squared-feature frame
# (5-fold CV, balanced accuracy).
params = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 10, 50, 100],
    n_estimators=[20, 100, 300, 500, 1000],
    n_jobs=[-1],
    random_state=[seed],
)
balanced_scorer = make_scorer(balanced_accuracy_score)
search = GridSearchCV(model, params, cv=5, n_jobs=-1, scoring=balanced_scorer)
search.fit(square_df, labels)
search.best_params_, search.best_score_

({'criterion': 'gini',
  'min_samples_split': 2,
  'n_estimators': 500,
  'n_jobs': -1,
  'random_state': 20},
 0.8935)

In [23]:
# Cross-validate (5 folds, balanced accuracy) a random forest with fixed
# hyper-parameters on the squared-feature frame.
model = RandomForestClassifier(
    criterion='gini',
    min_samples_split=2,
    n_estimators=500,
    n_jobs=-1,
    random_state=20,
)
sc = cross_validate(
    model,
    square_df,
    labels,
    cv=5,
    scoring=make_scorer(balanced_accuracy_score),
    return_train_score=False,
)
# Mean of the per-fold test scores.
np.mean(sc['test_score'])

0.8935000000000001