In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from functools import reduce

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook. NOTE(review): this also hides genuine chained-assignment
# mistakes, so prefer `.loc` / non-chained writes in the cells below.
pd.options.mode.chained_assignment = None

# Seed passed as `random_state` to the train/validation split and to the model.
SEED = 45

In [46]:
# Labelled rows: the `item_id` key plus the `deal_probability` target.
train = pd.read_csv(
    'data/train.csv',
    usecols=['item_id', 'deal_probability'],
)

# Rows to score: only the `item_id` key is read here.
test = pd.read_csv(
    'data/test.csv',
    usecols=['item_id'],
)

In [47]:
# Predictions of the "periods" base model for the test and train rows.
# The prediction column is renamed so it stays distinguishable after the
# frames are merged on `item_id`.
periods = pd.read_csv('predictions/periods.csv')
periods = periods.rename(columns={'deal_probability': 'periods_predicted'})

periods_train = pd.read_csv('predictions/periods_train.csv')
periods_train = periods_train.rename(columns={'deal_probability': 'periods_predicted'})

In [48]:
# Predictions of the "activation date" base model for the test and train rows.
# The prediction column is renamed so it stays distinguishable after the
# frames are merged on `item_id`.
activation_date = pd.read_csv('predictions/activation_date.csv')
activation_date = activation_date.rename(columns={'deal_probability': 'activation_predicted'})

activation_date_train = pd.read_csv('predictions/activation_date_train.csv')
activation_date_train = activation_date_train.rename(columns={'deal_probability': 'activation_predicted'})

In [49]:
# Predictions of the "tabular" base model for the test and train rows.
# The prediction column is renamed so it stays distinguishable after the
# frames are merged on `item_id`.
tabular = pd.read_csv('predictions/tabular.csv')
tabular = tabular.rename(columns={'deal_probability': 'tabular_predicted'})

tabular_train = pd.read_csv('predictions/tabular_train.csv')
tabular_train = tabular_train.rename(columns={'deal_probability': 'tabular_predicted'})

In [50]:
# Predictions of the "nlp" base model for the test and train rows.
# The prediction column is renamed so it stays distinguishable after the
# frames are merged on `item_id`.
nlp = pd.read_csv('predictions/nlp.csv')
nlp = nlp.rename(columns={'deal_probability': 'nlp_predicted'})

nlp_train = pd.read_csv('predictions/nlp_train.csv')
nlp_train = nlp_train.rename(columns={'deal_probability': 'nlp_predicted'})

## Training the ensembling model

In [51]:
# Attach every base model's train-set prediction to the labelled rows.
# Left joins keep exactly the rows of `train`; a missing prediction becomes NaN.
# This cell rebinds `train`, so re-run the cell that loads `train` before
# running this one again.
train = (
    train
    .merge(periods_train, on='item_id', how='left')
    .merge(activation_date_train, on='item_id', how='left')
    .merge(tabular_train, on='item_id', how='left')
    .merge(nlp_train, on='item_id', how='left')
    .set_index('item_id')
)

In [52]:
# Preview the first rows of the merged training frame.
train.head(n=5)

Unnamed: 0_level_0,deal_probability,periods_predicted,activation_predicted,tabular_predicted,nlp_predicted
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b912c3c6a6ad,0.12789,0.139208,0.139307,0.168598,0.109393
2dac0150717d,0.0,0.139121,0.138812,0.295984,0.173961
ba83aefab5dc,0.43177,0.139138,0.13802,0.236903,0.256733
02996f1dd2ea,0.80323,0.139114,0.141566,0.234682,0.196804
7c90be56d2ab,0.20797,0.139121,0.141675,0.362416,0.344846


In [53]:
# Attach every base model's test-set prediction to the rows to be scored.
# Left joins keep exactly the rows of `test`; a missing prediction becomes NaN.
# This cell rebinds `test`, so re-run the cell that loads `test` before
# running this one again.
test = (
    test
    .merge(periods, on='item_id', how='left')
    .merge(activation_date, on='item_id', how='left')
    .merge(tabular, on='item_id', how='left')
    .merge(nlp, on='item_id', how='left')
    .set_index('item_id')
)

In [54]:
# Preview the first rows of the merged test frame.
test.head(n=5)

Unnamed: 0_level_0,periods_predicted,activation_predicted,tabular_predicted,nlp_predicted
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6544e41a8817,0.139211,0.572439,0.053102,0.05533
65b9484d670f,0.139004,0.476374,0.360651,0.113738
8bab230b2ecd,0.139397,0.346118,0.188644,0.17069
8e348601fefc,0.139146,0.346118,0.362879,0.314884
8bd2fe400b89,0.140688,0.540526,0.21802,0.22598


In [55]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled rows to evaluate the ensembling model.
# Features are every column except the target; the split size is left at
# train_test_split's default and made reproducible through SEED.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['deal_probability']),
    train['deal_probability'],
    random_state=SEED,
)

In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from utils import config_dict

# Ensembling model fitted on the base models' predictions: a 20-tree random
# forest seeded with SEED.
model = RandomForestRegressor(
    random_state=SEED,
    n_estimators=20,
)
# Alternative ensembling model, kept for reference:
# model = LinearRegression(n_jobs=-1)

In [36]:
# Fit the ensembling model on the training part of the split; the fitted
# estimator is the cell's displayed value.
model.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=45, verbose=0, warm_start=False)

In [56]:
# Default regressor score of the fitted model on the held-out part of the split.
model.score(X=X_test, y=y_test)

0.2518178182823019

In [57]:
# Root-mean-squared error of the predictions on the held-out part of the split.
rmse = np.sqrt(
    ((model.predict(X_test) - y_test) ** 2).mean()
)
rmse

0.22488091274351957

In [58]:
# Score the test rows with the fitted ensembling model.
# The feature columns are selected by name, in the order used for fitting,
# instead of passing `test.values`: once this cell has added
# `deal_probability` to `test`, `test.values` would contain that extra column
# and re-running the cell would fail on the feature-count mismatch.
test['deal_probability'] = model.predict(test[X_train.columns])

In [59]:
# Bound the predictions to the valid probability range [0, 1].
# `Series.clip` replaces the chained-indexing writes (`test[col][mask] = value`),
# which can land on a temporary copy and are never applied under pandas
# Copy-on-Write. NaN values are left unchanged, as before.
test['deal_probability'] = test['deal_probability'].clip(lower=0.0, upper=1.0)

In [60]:
# Write the submission file: the `item_id` index plus the final prediction column.
test[['deal_probability']].to_csv(path_or_buf='predictions/submission.csv')

In [42]:
import pickle
# Base file name (without extension) used by the commented-out save/load lines below.
model_name = 'rf_overfit_ensembled'
# Persistence is currently disabled; uncomment the first line to save the
# fitted model, or the second to restore it from saved_models/.
# NOTE(review): both lines leave the handle returned by open() unclosed —
# wrap them in `with open(...) as f:` if re-enabled. Only unpickle files from
# a trusted source, since pickle.load can execute arbitrary code.
# pickle.dump(model, open('saved_models/{}.pickle'.format(model_name), 'wb'))
# model = pickle.load(open('saved_models/{}.pickle'.format(model_name), 'rb'))