In [21]:
import numpy as np 
import pandas as pd
import lightgbm as lgb

## Import data

In [2]:
df_test = pd.read_csv('test.csv')

In [3]:
df_train = pd.read_csv('train.csv')

In [4]:
df_test.shape

(200000, 201)

In [5]:
test_id = df_test['ID_code']

## Find fake test data

In [6]:
for var in df_test.columns[1:]:
    dict_count = df_test[var].value_counts().to_dict()
    new_col = var + '_unique'
    df_test[new_col] = df_test[var].apply(lambda s: 1 if dict_count[s]==1 else 0)
df_test['has_unique'] = np.sum(df_test.iloc[:,201:],axis=1).apply(lambda s: 1 if s>0 else 0)

  after removing the cwd from sys.path.
  """


In [7]:
df_test['has_unique'].value_counts()

0    100000
1    100000
Name: has_unique, dtype: int64

* In test data, half of them are fake data, half are real.

In [8]:
real_test = df_test[df_test['has_unique']==1].iloc[:,:201]
fake_test = df_test[df_test['has_unique']==0].iloc[:,:201]

## Combine train and read test data, then add new frequency features

In [9]:
train_realtest = pd.concat([df_train, real_test], axis=0)

In [10]:
for var in train_realtest.columns[2:]:
    dict_count = train_realtest[var].value_counts().to_dict()
    new_col = var + '_unique'
    train_realtest[new_col] = train_realtest[var].apply(lambda s: 0 if dict_count[s]>1 else 1)
    #train_realtest[new_col] = train_realtest[var].apply(lambda s: dict_count[s])
    #train_realtest[var] = train_realtest[var].apply(lambda s: None if dict_count[s]>1 else s)

  after removing the cwd from sys.path.


In [11]:
train_realtest

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190_unique,var_191_unique,var_192_unique,var_193_unique,var_194_unique,var_195_unique,var_196_unique,var_197_unique,var_198_unique,var_199_unique
0,train_0,0.0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,...,0,0,0,0,0,0,0,0,0,0
1,train_1,0.0,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,...,0,0,0,0,0,0,0,0,0,0
2,train_2,0.0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,0,0,0,0,0,0,0,0,0,0
3,train_3,0.0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,...,0,0,0,0,0,0,0,0,0,0
4,train_4,0.0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,0,0,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199986,test_199986,,19.2884,-2.8384,11.9149,6.6611,12.3112,12.9244,5.6492,16.0449,...,0,0,0,0,0,0,0,0,0,0
199993,test_199993,,14.6764,-8.1066,7.1167,2.4138,10.3845,-11.9327,4.7563,16.0455,...,0,0,0,0,0,0,0,0,0,1
199995,test_199995,,13.1678,1.0136,10.4333,6.7997,8.5974,-4.1641,4.8579,14.7625,...,1,0,0,0,1,0,0,0,0,1
199996,test_199996,,9.7171,-9.1462,7.3443,9.1421,12.8936,3.0191,5.6888,18.8862,...,1,0,0,0,0,0,0,0,0,1


## Split train and real test data

In [12]:
train_new = train_realtest[train_realtest['ID_code'].str.contains('train')]

In [13]:
test_new = train_realtest[train_realtest['ID_code'].str.contains('test')]

## Build 200 models

* each time use one original feature and corresponding frequency feature to build a model
* then use logit transfer the prediction of probability got from one model and sum up these result from 200 models
* then get mean of these result from 200 models and use exp to transfer into probability

* Actually I'm not sure why they use logit here, but I just follow that. And to get probability instead of large negative number, I used exp to transfer and the score looks good.

In [14]:
from scipy.special import logit, expit

In [15]:
X_train_all = train_new.drop(columns=['ID_code', 'target'])
y_train_all = train_new['target']

In [17]:
X_df_test = df_test.drop(columns=['ID_code','has_unique'])

In [24]:
#features = [x for x in X_train.columns if x.startswith("var")]
features = X_train_all.columns[0:200].to_list()

pred = 0
for var in features:
    model = lgb.LGBMClassifier(**{'learning_rate': 0.05, 
                                  'max_bin': 165, 
                                  'max_depth': 5, 
                                  'min_child_samples': 150,
                                  'min_child_weight': 0.1, 
                                  'min_split_gain': 0.0018, 
                                  'n_estimators': 41,
                                  'num_leaves': 6, 
                                  'reg_alpha': 2.0, 
                                  'reg_lambda': 2.54, 
                                  'objective': 'binary', 
                                  'n_jobs': -1})
    var_count_name = var + '_unique'
    model = model.fit(np.hstack([X_train_all[var].values.reshape(-1, 1),
                      X_train_all[var_count_name].values.reshape(-1, 1)]), y_train_all.values)
    pred += logit(model.predict_proba(np.hstack([X_df_test[var].values.reshape(-1, 1),
                  X_df_test[var_count_name].values.reshape(-1, 1)]))[:, 1])
    
#pd.DataFrame({"ID_code": test_id, "target": pred}).to_csv("submission3.csv", index = False)
b = pd.DataFrame({"ID_code": df_test['ID_code'], "target": pred})

In [25]:
b['target'] = np.exp(b['target']/200)

In [None]:
b.to_csv("submission8.csv", index = False) 

In [None]:
#0.884: submission4 -- adding frequency 
#0.887: submission8 -- adding frequency and change frequency>1 into 0
#0.85: submission5 -- adding frequency and change unique frequency into None
#0.87 submission6 -- adding frequency and change non-unique frequency into None
#0.77 submission7 -- change original variable nonunique into Nonea