In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tnrange, tqdm_notebook
import gc

In [2]:
# Switch seaborn to its 'talk' plotting context.
sns.set_context('talk')

# Read the data

In [41]:
# Load the preprocessed train/test feature tables, using the 'id' column as the index.
dfXtrain = pd.read_csv('preprocessed_csv/train.csv', index_col='id')
dfXtest = pd.read_csv('preprocessed_csv/test.csv', index_col='id')
# The target file is read without a header row; its two columns are named 'id' and 'proba' here.
dfYtrain = pd.read_csv('preprocessed_csv/y_train.csv', header=None, names=['id', 'proba'], index_col='id')

# Saving routines

In [168]:
# Output template: one row per test id (column INDEX) with P_TARGET_FLAG initialised to 0.
# Note it has a default positional index, not the 'id' index of dfXtest.
dfYtest = pd.DataFrame({'INDEX': dfXtest.index, 'P_TARGET_FLAG': 0})

In [169]:
def save_to_file(y, file_name):
    """Write predictions ``y`` to ``results/<file_name>`` using the ``dfYtest`` template.

    Parameters
    ----------
    y : array-like
        One prediction per row of ``dfYtest``, in the same order as the test set.
        pandas objects are accepted; their index is ignored.
    file_name : str
        Name of the CSV file created inside the ``results`` directory.

    Notes
    -----
    The ``P_TARGET_FLAG`` column of the module-level ``dfYtest`` frame is overwritten.
    """
    # Assigning a pandas Series aligns on index labels. Predictions are indexed by
    # test 'id' while the template has a positional index, so a direct assignment
    # would silently scramble the values / insert NaN. Converting to a plain array
    # makes the assignment positional (and raises on a length mismatch).
    dfYtest['P_TARGET_FLAG'] = np.asarray(y)
    dfYtest.to_csv('results/{}'.format(file_name), index=False)

# Ideas

In [62]:
# Per-column reference value used to split rows (the "anomaly" value, going by the name).
anomaly = {'car_age': 1, 'income': 0, 'home_val': 0, 'yoj': 0}

def return_index(df, col, mask):
    """Build a boolean row selector for ``col`` relative to ``anomaly[col]``.

    ``mask == 0`` selects the rows whose value equals the reference value;
    any other ``mask`` selects the rows whose value differs from it.
    """
    column = df[col]
    reference = anomaly[col]
    if mask != 0:
        return column != reference
    return column == reference

In [63]:
def compute_indices(car_age, income, home_val, yoj, df):
    """Return the boolean row selector for one combination of the four 0/1 flags.

    Each flag is passed to ``return_index`` for the column of the same name and
    the four resulting selectors are AND-ed together.
    """
    by_income = return_index(df, 'income', income)
    by_home_val = return_index(df, 'home_val', home_val)
    by_yoj = return_index(df, 'yoj', yoj)
    by_car_age = return_index(df, 'car_age', car_age)
    return (by_income & by_home_val) & (by_yoj & by_car_age)

In [82]:
# Number of disjoint partitions: every 0/1 combination of the four flags.
COMP_NUM = 16

# One slot per partition; each slot is filled in the loop below.
x_train = [None] * COMP_NUM
x_test = [None] * COMP_NUM
y_train = [None] * COMP_NUM
# Per-partition result frames. Deliberately NOT called `dfYtest`: that name is
# the output template that save_to_file writes to, and overwriting it with a
# list breaks saving when the notebook is run top to bottom.
y_test_frames = [None] * COMP_NUM

for car_age in range(2):
    for income in range(2):
        for home_val in range(2):
            for yoj in range(2):
                # Pack the four flags into a partition number in [0, COMP_NUM).
                mask = car_age * 8 + income * 4 + home_val * 2 + yoj
                train_indices = compute_indices(car_age, income, home_val, yoj, dfXtrain)
                test_indices = compute_indices(car_age, income, home_val, yoj, dfXtest)
                x_train[mask] = np.array(dfXtrain[train_indices])
                y_train[mask] = np.array(dfYtrain[train_indices]['proba'])
                test_part = dfXtest[test_indices]
                x_test[mask] = np.array(test_part)
                # Keep only the ids that belong to this partition so the frame
                # lines up row-for-row with x_test[mask].
                y_test_frames[mask] = pd.DataFrame({'INDEX': test_part.index})

In [83]:
# For every partition: number of training rows, then the mean target value
# formatted to two significant digits.
for features, targets in zip(x_train, y_train):
    n_rows = features.shape[0]
    print(n_rows, '{:.2}'.format(targets.sum() / n_rows))

162 0.44
38 0.42
100 0.38
73 0.27
33 0.33
633 0.38
72 0.28
1333 0.26
244 0.45
78 0.26
180 0.29
185 0.17
86 0.34
1484 0.31
202 0.18
3258 0.2


Некоторые выборки слишком малого размера. Обучать лес на 33 объектах с 40 признаками кажется не самой лучшей идеей

**Идеи**:
* попробовать деление на 16 непересекающихся частей
* выбрать из четырёх признаков меньшее количество, чтобы объём получающихся микровыборок был больше
* делить на восемь пересекающихся частей (по две для каждого признака). Восемь классификаторов, ответ -- среднее

Доля положительного класса в обучающей выборке

In [80]:
# Sum of the 'proba' column divided by the number of training rows
# (the positive-class share, assuming 'proba' holds 0/1 labels).
dfYtrain.proba.sum()/dfXtrain.shape[0]

0.2638157088592084

# Train RF

In [26]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split

Классификатор, который позволяет создавать разные леса на разных подвыборках

In [114]:
import itertools

class AnomalyClassifier():
    """Ensemble that trains one classifier per anomaly-defined subset of the rows.

    Rows are split by comparing selected columns with a reference value:

    * ``intersected=False``: the rows are partitioned into ``2 ** len(anomaly)``
      disjoint subsets (every equal / not-equal combination over all columns);
      each row is scored by the single classifier of its subset.
    * ``intersected=True``: ``2 * len(anomaly)`` overlapping subsets are used
      (an "equal" and a "not equal" subset per column); each row is scored by
      one classifier per column and the scores are averaged.

    Parameters
    ----------
    Classifier : callable
        Factory returning an estimator with ``fit(X, y)`` and ``predict_proba(X)``.
    anomaly : dict
        Maps column name to the reference value for that column.
    intersected : bool, default False
        Selects the splitting scheme described above.
    clf_kwargs : dict, list of dict or None, default None
        Keyword arguments for ``Classifier``. A single dict is shared by all
        subsets, a list supplies one dict per subset (in generation order),
        ``None`` means "no keyword arguments".
    """

    def __init__(self, Classifier, anomaly, intersected=False, clf_kwargs=None):
        self.Classifier = Classifier
        self.anomaly = anomaly
        self.intersected = intersected

        if intersected:
            self.subset_count = 2 * len(anomaly)
        else:
            self.subset_count = 2 ** len(anomaly)

        if isinstance(clf_kwargs, list):
            self.clf_kwargs = clf_kwargs
        else:
            # The default None used to reach ``Classifier(**None)`` and raise a
            # TypeError in fit(); treat it as "no extra arguments".
            if clf_kwargs is None:
                clf_kwargs = {}
            self.clf_kwargs = [clf_kwargs for _subset in range(self.subset_count)]

    def return_index(self, df, col, mask):
        """Boolean selector: rows where ``df[col]`` equals (``mask == 0``) or
        differs from (any other ``mask``) ``self.anomaly[col]``."""
        if mask == 0:
            return (df[col] == self.anomaly[col])
        else:
            return (df[col] != self.anomaly[col])

    def compute_indices(self, df, col_mask_iterator):
        """AND together the selectors of every ``(col, mask)`` pair.

        Starts from an all-True Series so that an empty iterator selects every
        row instead of returning a bare ``True`` that cannot index a frame.
        """
        result = pd.Series(True, index=df.index)
        for col, mask in col_mask_iterator:
            result = result & self.return_index(df, col, mask)
        return result

    def indices_generator(self, df):
        """Yield one boolean row selector per subset, in a fixed order.

        Columns are visited in sorted order so that fit() and predict_proba()
        enumerate the subsets identically.
        """
        if self.intersected:
            for col in sorted(self.anomaly):
                for mask in range(2):
                    yield self.return_index(df, col, mask)
        else:
            for mask in itertools.product(range(2), repeat=len(self.anomaly)):
                yield self.compute_indices(df, zip(sorted(self.anomaly), mask))

    def fit(self, dfXtrain, dfYtrain):
        """Train one classifier per subset of ``dfXtrain``.

        ``dfYtrain`` must be a frame with a ``proba`` column that can be
        row-selected with the same boolean selectors as ``dfXtrain``.
        Returns ``self``.
        """
        self.clfs = []
        for mask, indices in enumerate(self.indices_generator(dfXtrain)):
            model = self.Classifier(**self.clf_kwargs[mask])
            model.fit(np.array(dfXtrain[indices]), np.array(dfYtrain[indices]['proba']))
            self.clfs.append(model)
        return self

    def predict_proba(self, dfXtest):
        """Return a frame indexed like ``dfXtest`` with a float ``proba`` column.

        The value is column 1 of the sub-model's ``predict_proba`` output; in
        intersected mode it is the mean over the sub-models that cover the row.
        """
        # Float accumulator: starting from the integer 0 forces pandas to
        # upcast on the first assignment, which newer versions warn about.
        result = pd.DataFrame({'proba': 0.0}, index=dfXtest.index)
        for mask, indices in enumerate(self.indices_generator(dfXtest)):
            # Small subsets may be absent from the data being scored; estimators
            # generally reject empty input, so skip them.
            if not indices.any():
                continue
            subset_proba = self.clfs[mask].predict_proba(np.array(dfXtest[indices]))[:, 1]
            result.loc[indices, 'proba'] += subset_proba
        if self.intersected:
            # Every row falls into exactly one subset per column, i.e. it has
            # accumulated subset_count / 2 scores.
            result['proba'] /= self.subset_count / 2
        return result
    

In [126]:
# Hold out 15% of the training data for validation, stratified on the target
# column and with a fixed random seed so the split is repeatable.
x_subtrain, x_subtest, y_subtrain, y_subtest = train_test_split(dfXtrain, dfYtrain, 
                                                                test_size=0.15, random_state=42,
                                                                stratify=dfYtrain.proba)

16 частей

In [127]:
%%time

kwargs = {'n_estimators': 1000, 'criterion': 'entropy', 'n_jobs': -1, 'random_state': 100}

clf = AnomalyClassifier(RFC, anomaly, intersected=False, clf_kwargs=kwargs)
clf.fit(x_subtrain, y_subtrain)
score = roc_auc_score(y_subtest.proba, clf.predict_proba(x_subtest))
print(score)

0.802415341209
CPU times: user 1min 5s, sys: 3.54 s, total: 1min 9s
Wall time: 48.3 s


8 с пересечениями

In [128]:
%%time

kwargs = {'n_estimators': 1000, 'criterion': 'entropy', 'n_jobs': -1, 'random_state': 100}

clf = AnomalyClassifier(RFC, anomaly, intersected=True, clf_kwargs=kwargs)
clf.fit(x_subtrain, y_subtrain)
score = roc_auc_score(y_subtest.proba, clf.predict_proba(x_subtest))
print(score)

0.818209276942
CPU times: user 1min 46s, sys: 1.41 s, total: 1min 47s
Wall time: 39.4 s


Поперебираем подмножества признаков

In [130]:
# Same column -> reference value mapping as defined earlier in the notebook,
# restated here before the subset search.
anomaly = {'car_age': 1, 'income': 0, 'home_val': 0, 'yoj': 0}

In [151]:
def subset_generator(set_dict):
    """Yield every non-empty sub-dictionary of ``set_dict``.

    Keys are considered in sorted order and the inclusion pattern is enumerated
    with ``itertools.product``, so the last key toggles fastest. A dict with
    ``n`` keys produces ``2 ** n - 1`` dictionaries; an empty dict produces none.

    Parameters
    ----------
    set_dict : dict
        Mapping whose keys are mutually sortable.

    Yields
    ------
    dict
        A new dict holding a non-empty selection of the items of ``set_dict``.
    """
    keys = sorted(set_dict)
    # Size the inclusion pattern from the argument itself. The previous version
    # used the length of the global `anomaly` dict, which gave wrong or
    # duplicated subsets (or an IndexError) for any other input.
    for mask in itertools.product(range(2), repeat=len(keys)):
        result = {key: set_dict[key] for key, keep in zip(keys, mask) if keep}
        if result:
            yield result

In [155]:
%%time

# Grid search over every non-empty subset of the anomaly columns, evaluating
# both splitting schemes on the held-out data.
kwargs = {'n_estimators': 1000, 'criterion': 'entropy', 'n_jobs': -1, 'random_state': 100}

# One row per subset (in subset_generator order); column 0 holds the
# intersected=True result, column 1 the intersected=False result.
score = np.zeros((2 ** len(anomaly) - 1, 2))

for first, subset in enumerate(subset_generator(anomaly)):
    for second, intersected in enumerate((True, False)):
        clf = AnomalyClassifier(RFC, subset, intersected=intersected, clf_kwargs=kwargs)
        clf.fit(x_subtrain, y_subtrain)
        score[first, second] = roc_auc_score(y_subtest.proba, clf.predict_proba(x_subtest))
        print(score[first, second])
        # Drop the fitted ensemble before the next iteration and force a collection.
        del clf
        gc.collect()

0.812693498452
0.812693498452
0.815355282036
0.815355282036
0.815068681224
0.810445312446
0.816784853748
0.816784853748
0.817061157524
0.809508282249
0.818171521147
0.815887295518
0.817385514131
0.81343316881
0.813769538624
0.813769538624
0.816223665333
0.806589072786
0.817622345939
0.811574553967
0.817093764802
0.806338511598
0.818322544329
0.812070527826
0.817630926802
0.804493626135
0.818899178297
0.807409403252
0.818209276942
0.802415341209
CPU times: user 23min 19s, sys: 27.9 s, total: 23min 47s
Wall time: 9min 49s


In [166]:
# Force a full garbage collection; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

2485

In [161]:
# (row, column) position of the highest ROC AUC in the score grid:
# row = subset number, column = 0 for intersected, 1 for disjoint.
np.unravel_index(np.argmax(score), score.shape)

(13, 0)

In [163]:
# Feature subset for row 13, the row reported by the argmax lookup in the previous cell.
list(subset_generator(anomaly))[13]

{'car_age': 1, 'home_val': 0, 'income': 0}

In [164]:
# Select the best-scoring feature subset explicitly. IPython's `_` only holds
# the intended value when the lookup cell happened to be the last one executed,
# so it is not reproducible on a fresh run.
best_subset_row = np.unravel_index(np.argmax(score), score.shape)[0]
subset = list(subset_generator(anomaly))[best_subset_row]

In [167]:
# Display the selected subset.
subset

{'car_age': 1, 'home_val': 0, 'income': 0}

In [170]:
%%time

kwargs = {'n_estimators': 3000, 'criterion': 'entropy', 'n_jobs': -1, 'random_state': 100}

clf = AnomalyClassifier(RFC, subset, intersected=True, clf_kwargs=kwargs)
clf.fit(dfXtrain, dfYtrain)
y_test = clf.predict_proba(dfXtest)
save_to_file(y_test.proba, 'subset_3_3000.csv')

CPU times: user 4min 31s, sys: 4.1 s, total: 4min 35s
Wall time: 4min 38s


In [171]:
# Peek at the first predictions; the frame is indexed by the test ids.
y_test.head()

Unnamed: 0_level_0,proba
id,Unnamed: 1_level_1
3,0.178111
9,0.397778
10,0.087889
18,0.181222
21,0.229778


In [173]:
# Save again with a plain array so the values are written positionally instead
# of being aligned on the id index.
save_to_file(np.array(y_test.proba), 'subset_3_3000.csv')

In [175]:
%%time

kwargs = {'n_estimators': 3000, 'criterion': 'entropy', 'n_jobs': -1, 'random_state': 100}

clf = AnomalyClassifier(RFC, anomaly, intersected=True, clf_kwargs=kwargs)
clf.fit(dfXtrain, dfYtrain)
y_test = clf.predict_proba(dfXtest)
save_to_file(np.array(y_test.proba), 'subset_4_3000.csv')

CPU times: user 5min 59s, sys: 10.2 s, total: 6min 9s
Wall time: 15min 59s


In [176]:
# Drop the fitted ensemble and force a garbage collection.
del clf
gc.collect()

31