In [4]:
import pandas as pd
import matplotlib.pyplot as plt


%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning notebook-wide
pd.set_option('mode.chained_assignment', None)

# Single seed reused by every randomised step below
SEED = 45

In [2]:
# Read only the columns ensembling needs: the item key for both sets,
# plus the target for the training set
train = pd.read_csv('data/train.csv', usecols=['item_id', 'deal_probability'])
test = pd.read_csv('data/test.csv', usecols=['item_id'])

In [3]:
# Predictions of the "periods" model. The train-side prediction column is
# renamed so it does not clash with the target once merged into `train`.
periods_train = pd.read_csv('predictions/periods_train.csv')
periods_train = periods_train.rename(columns={'deal_probability': 'periods_predicted'})
periods = pd.read_csv('predictions/periods.csv')

In [4]:
# Predictions of the "activation_date" model. The train-side prediction column
# is renamed so it does not clash with the target once merged into `train`.
activation_date_train = pd.read_csv('predictions/activation_date_train.csv')
activation_date_train = activation_date_train.rename(
    columns={'deal_probability': 'activation_predicted'}
)
activation_date = pd.read_csv('predictions/activation_date.csv')

In [5]:
# Predictions of the "tabular" model. The train-side prediction column is
# renamed so it does not clash with the target once merged into `train`.
tabular_train = pd.read_csv('predictions/tabular_train.csv')
tabular_train = tabular_train.rename(columns={'deal_probability': 'tabular_predicted'})
tabular = pd.read_csv('predictions/tabular.csv')

In [6]:
# Predictions of the "nlp" model. The train-side prediction column is
# renamed so it does not clash with the target once merged into `train`.
nlp_train = pd.read_csv('predictions/nlp_train.csv')
nlp_train = nlp_train.rename(columns={'deal_probability': 'nlp_predicted'})
nlp = pd.read_csv('predictions/nlp.csv')

## Training the ensembling model

In [8]:
# Left-join every base model's train-set predictions onto the training frame,
# one new column per model, keyed by `item_id`. A plain loop is used because
# `reduce` is never imported in this notebook, so the previous
# `reduce(lambda ...)` form raised NameError on a fresh kernel.
# NOTE: this cell rebinds `train`; re-running it without reloading `train`
# merges the same columns a second time.
for model_preds in (periods_train, activation_date_train, tabular_train, nlp_train):
    train = train.merge(model_preds, how='left', on='item_id')

508438

In [None]:
# Preview the first rows of the merged training frame
train.head()

In [9]:
# Left-join every base model's test-set predictions onto the test frame.
#
# Each test prediction frame stores its output in a column called
# `deal_probability`. Merging them unrenamed makes the columns collide
# (`deal_probability_x`, `_y`, ...) and never yields the feature names the
# ensembling model sees at training time. Each column is therefore renamed to
# its training-side name first, and the frames are merged in the same order
# as for `train`.
# A plain loop replaces `reduce`, which is never imported in this notebook.
test_predictions = (
    ('periods_predicted', periods),
    ('activation_predicted', activation_date),
    ('tabular_predicted', tabular),
    ('nlp_predicted', nlp),
)
for feature_name, model_preds in test_predictions:
    test = test.merge(
        model_preds.rename(columns={'deal_probability': feature_name}),
        how='left',
        on='item_id',
    )

item_id                   0.0
periods_weight            0.0
tabular_weight            0.0
nlp_weight                0.0
activation_date_weight    0.0
dtype: float64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data to evaluate the ensembling model.
# `item_id` is a string identifier, not a feature: leaving it in the feature
# matrix makes the regressor fail when it converts inputs to floats, so it is
# dropped together with the target.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['item_id', 'deal_probability']),
    train['deal_probability'],
    random_state=SEED,
)

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from utils import config_dict

# Randomised hyper-parameter search over a random forest: 25 sampled
# configurations, each scored by R^2 with 5-fold cross-validation, using
# 4 parallel jobs.
# The base estimator is seeded as well as the search itself. Otherwise every
# forest is grown with a fresh random state and results change between runs
# even though SEED is fixed.
model = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=SEED),
    param_distributions=config_dict['sklearn.ensemble.RandomForestRegressor'],
    n_iter=25,
    scoring='r2',
    cv=5,
    verbose=1,
    n_jobs=4,
    random_state=SEED,
)

In [None]:
# Run the hyper-parameter search on the training split
model.fit(X_train, y_train)

In [8]:
# Score the fitted search object on the held-out split
model.score(X_test, y_test)

In [None]:
# Predict from exactly the columns (and column order) the model was fitted on.
# `test.values` would also pass the `item_id` strings and any other column
# present in `test`, which does not match the training feature matrix.
test['deal_probability'] = model.predict(test[X_train.columns])

In [None]:
# Preview the first predicted values
test['deal_probability'].head()

In [None]:
# Build the submission frame from the predictions before writing it.
# No earlier cell defines `submission`, so on a fresh top-to-bottom run the
# bare `to_csv` call raised NameError.
submission = test[['item_id', 'deal_probability']].set_index('item_id')
submission.to_csv('submission.csv')

In [10]:
# Attach the "periods" model's test predictions. The left join keeps every
# test item and leaves NaN where this model has no prediction.
test = test.merge(periods, how='left', on='item_id')
null_idx = test['deal_probability'].isna()
test = test.rename(columns={'deal_probability': 'periods'})

# Items without a prediction get weight 0 and value 0, so they drop out of the
# weighted average. Whole-column assignment replaces the chained
# `test[col][mask] = ...` form, which does not update `test` when pandas
# Copy-on-Write is active.
# NOTE(review): `periods_weight` is not created in any visible cell — confirm
# where the per-model weights are set.
test['periods_weight'] = test['periods_weight'].mask(null_idx, 0.0)
test['periods'] = test['periods'].mask(null_idx, 0.0)

In [11]:
# Attach the "tabular" model's test predictions. The left join keeps every
# test item and leaves NaN where this model has no prediction.
test = test.merge(tabular, how='left', on='item_id')
null_idx = test['deal_probability'].isna()
test = test.rename(columns={'deal_probability': 'tabular'})

# Items without a prediction get weight 0 and value 0, so they drop out of the
# weighted average. Whole-column assignment replaces the chained
# `test[col][mask] = ...` form, which does not update `test` when pandas
# Copy-on-Write is active.
# NOTE(review): `tabular_weight` is not created in any visible cell — confirm
# where the per-model weights are set.
test['tabular_weight'] = test['tabular_weight'].mask(null_idx, 0.0)
test['tabular'] = test['tabular'].mask(null_idx, 0.0)

In [12]:
# Attach the "nlp" model's test predictions. The left join keeps every test
# item and leaves NaN where this model has no prediction.
test = test.merge(nlp, how='left', on='item_id')
null_idx = test['deal_probability'].isna()
test = test.rename(columns={'deal_probability': 'nlp'})

# Items without a prediction get weight 0 and value 0, so they drop out of the
# weighted average. Whole-column assignment replaces the chained
# `test[col][mask] = ...` form, which does not update `test` when pandas
# Copy-on-Write is active.
# NOTE(review): `nlp_weight` is not created in any visible cell — confirm
# where the per-model weights are set.
test['nlp_weight'] = test['nlp_weight'].mask(null_idx, 0.0)
test['nlp'] = test['nlp'].mask(null_idx, 0.0)

In [13]:
# Attach the "activation_date" model's test predictions. The left join keeps
# every test item and leaves NaN where this model has no prediction.
test = test.merge(activation_date, how='left', on='item_id')
null_idx = test['deal_probability'].isna()
test = test.rename(columns={'deal_probability': 'activation_date'})

# Items without a prediction get weight 0 and value 0, so they drop out of the
# weighted average. Whole-column assignment replaces the chained
# `test[col][mask] = ...` form, which does not update `test` when pandas
# Copy-on-Write is active.
# NOTE(review): `activation_date_weight` is not created in any visible cell —
# confirm where the per-model weights are set.
test['activation_date_weight'] = test['activation_date_weight'].mask(null_idx, 0.0)
test['activation_date'] = test['activation_date'].mask(null_idx, 0.0)

In [14]:
# Blend the four models per item: sum of (prediction * weight) ...
weighted_sum = (
    test['periods'] * test['periods_weight']
    + test['tabular'] * test['tabular_weight']
    + test['nlp'] * test['nlp_weight']
    + test['activation_date'] * test['activation_date_weight']
)
# ... normalised by the sum of the weights that item actually has
total_weight = (
    test['periods_weight']
    + test['tabular_weight']
    + test['nlp_weight']
    + test['activation_date_weight']
)
test['deal_probability'] = weighted_sum / total_weight

In [15]:
# Submission frame: one `deal_probability` column, indexed by `item_id`
submission = test.set_index('item_id')[['deal_probability']]

In [16]:
### Methods for bringing the blended predictions into the [0, 1] range:
### 1) clip: cut off everything outside the range.
###    Only negatives were cut before. The upper bound is enforced too, so a
###    blended value above 1 cannot reach the submission.
submission[submission < 0] = 0.0
submission[submission > 1] = 1.0

### 2) alternative (not used): rescale to [0, 1] with
###    sklearn.preprocessing.MinMaxScaler instead of clipping.

submission.min()

deal_probability    0.001718
dtype: float64

In [17]:
# Write the submission; `item_id` is the index, so it is written as the first column
submission.to_csv('submission.csv')

In [18]:
# Number of rows in the submission
len(submission)

508438

In [19]:
# Preview the first rows of the submission
submission.head()

Unnamed: 0_level_0,deal_probability
item_id,Unnamed: 1_level_1
6544e41a8817,0.053102
65b9484d670f,0.360651
8bab230b2ecd,0.188644
8e348601fefc,0.362879
8bd2fe400b89,0.21802
