In [None]:
!ls lentahack/

In [28]:
import pandas as pd
df_checks = pd.read_csv('./lentahack/20210518_checks.csv')
df_uplift = pd.read_csv('./lentahack/20210518_uplift.csv')
df_sample_submission = pd.read_csv('./lentahack/20210521_sample_submission.csv')
df_hierarchy = pd.read_csv('./lentahack/20210518_hierarchy.csv')
df_offers = pd.read_csv('./lentahack/20210521_offers.csv')

In [29]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(df_hierarchy[['hierarchy_level1', 'hierarchy_level2', 'hierarchy_level3', 'hierarchy_level4']])
df_hierarchy[['hierarchy_level1', 'hierarchy_level2', 'hierarchy_level3', 'hierarchy_level4']] \
= enc.transform(df_hierarchy[['hierarchy_level1', 'hierarchy_level2', 'hierarchy_level3', 'hierarchy_level4']])

In [None]:
df_hierarchy

In [64]:
# drop client_id, shop_id, check_id, time, check_pos, region_name
df_data = pd.merge(pd.merge(df_offers, df_checks, how='left', on=['sku']),
                   df_uplift, how='left', on=['Offer_ID'])

In [65]:
drop_columns = ['client_id', 'shop_id', 'check_id', 'time', 'check_pos', 'region_name']
df_data_dropped = df_data.drop(columns=drop_columns)

In [66]:
dates = ['day', 'start_date', 'end_date']
for date in dates:
    df_data_dropped[date] = pd.to_datetime(df_data_dropped[date],
                                           format='%Y%m%d')

In [67]:
df_data_dropped.iloc[0]

Promo_type                                   Facades
Offer_ID                                         F26
sku                 71c770923102af052f64c6036c9dd590
start_date                       2020-02-13 00:00:00
end_date                         2020-02-19 00:00:00
train_test_group                               train
day                              2019-12-30 00:00:00
promo_id            19695c56fe623b1e575fc340ee02f1d8
num_sales                                        3.0
supplier_price                                 67.92
selling_price                                 101.67
UpLift                                      0.430811
Name: 0, dtype: object

In [68]:
check = df_data_dropped.drop(df_data_dropped[df_data_dropped['day'] > df_data_dropped['start_date']].index)

In [69]:
import numpy as np
np.sum((check['day'] > check['start_date']).values)

0

In [70]:
df_data = check

In [71]:
groups = df_data.groupby(['Offer_ID'])
# for key, items in groups:
#     print(key)
a = groups.get_group('10')
a['train_test_group']

9123932     train
9123933     train
9123934     train
9123935     train
9123936     train
            ...  
10738779    train
10738780    train
10738781    train
10738782    train
10738783    train
Name: train_test_group, Length: 6514, dtype: object

In [72]:
import math
from collections import Counter
def calc_entropy(count):
    total = sum(count.values())
    entropy = 0
    for key, value in count.items():
        entropy += -(value / total) * math.log2(value / total)
    return entropy

def get_product_list(offer_id):
    df = groups.get_group(offer_id)
    product_list = df['sku'].unique()
    return product_list

def offer_diversity(offer_id):
    product_list = get_product_list(offer_id)
    hierarchy_list = []
    for product in product_list:
        hierarchy_list.append(df_hierarchy[df_hierarchy['sku'] == product].drop(['sku'], axis=1).values[0])
    hierarchy_list = list(zip(*hierarchy_list))
    hierarchy_diversity = {}
    for idx, dat in enumerate(hierarchy_list):
        count = Counter(dat)
        entropy = calc_entropy(count)
        hierarchy_diversity[f"level{idx}"] = entropy
    return hierarchy_diversity

def most_freq_hierarchy_levels(offer_id):
    sku = groups.get_group(offer_id)['sku'].mode().iloc[0]
    hierarchy_levels = df_hierarchy[df_hierarchy['sku'] == sku].drop('sku', axis=1).values[0]
    return hierarchy_levels

def process_offer(offer_id):
    df = groups.get_group(offer_id)
    most_freq_item = df['sku'].mode()
    product_list = get_product_list(offer_id)
    diversity = offer_diversity(offer_id)
    number_of_products = len(product_list)
    total_supply_cost = sum(df['supplier_price'] * df['num_sales'])
    total_selling_price = sum(df['selling_price'] * df['num_sales'])
    duration = df['end_date'].iloc[0] - df['start_date'].iloc[0]
    promotype = df['Promo_type'].iloc[0]
    uplift = df['UpLift'].iloc[0]
    hierarchy_levels = most_freq_hierarchy_levels(offer_id)
    train_test = df['train_test_group'].iloc[0]
    features = {'number_of_products': number_of_products, 'diversity_1': diversity['level0'],
                'diversity_2': diversity['level1'], 'diversity_3': diversity['level2'],
                'diversity_4': diversity['level3'], 'total_cost': total_supply_cost,
                'total_sells': total_selling_price, 'duration': duration, 'promotype': promotype,
                'uplift': uplift, 'most_freq_1': hierarchy_levels[0], 'most_freq_2': hierarchy_levels[1],
                'most_freq_3': hierarchy_levels[2], 'most_freq_4': hierarchy_levels[3],
                'train_test_group': train_test}
    return features

In [90]:
series_list = []
for group, _ in groups:
    features = process_offer(group)
    features['Offer_ID'] = group
    series_list.append(features)
    print(f"Finished group {group}")
final_df = pd.DataFrame(series_list)

Finished group 10
Finished group 100
Finished group 101
Finished group 102
Finished group 103
Finished group 104
Finished group 105
Finished group 106
Finished group 107
Finished group 108
Finished group 109
Finished group 11
Finished group 112
Finished group 114
Finished group 115
Finished group 116
Finished group 117
Finished group 118
Finished group 119
Finished group 12
Finished group 120
Finished group 121
Finished group 122
Finished group 124
Finished group 125
Finished group 126
Finished group 127
Finished group 128
Finished group 129
Finished group 13
Finished group 130
Finished group 131
Finished group 133
Finished group 134
Finished group 136
Finished group 137
Finished group 138
Finished group 139
Finished group 14
Finished group 140
Finished group 141
Finished group 142
Finished group 143
Finished group 144
Finished group 145
Finished group 146
Finished group 147
Finished group 148
Finished group 149
Finished group 15
Finished group 150
Finished group 151
Finished group 152

In [92]:
final_df.to_csv('./final_df.csv')

In [225]:
import pandas as pd
final_df = pd.read_csv('./final_df.csv', index_col=None)
final_df.head()

Unnamed: 0.1,Unnamed: 0,number_of_products,diversity_1,diversity_2,diversity_3,diversity_4,total_cost,total_sells,duration,promotype,uplift,most_freq_1,most_freq_2,most_freq_3,most_freq_4,train_test_group,Offer_ID
0,0,8,0.0,0.0,0.0,0.0,1498483.0,2294275.0,13 days,Biweekly,15.941591,1.0,21.0,191.0,515.0,train,10
1,1,12,0.0,0.0,0.0,1.418296,1449687.0,1391430.0,13 days,Seasonal,,0.0,56.0,45.0,1103.0,test,100
2,2,1,0.0,0.0,0.0,0.0,212712.9,310565.0,13 days,Seasonal,1.761594,1.0,27.0,107.0,951.0,train,101
3,3,11,0.0,0.0,0.0,0.0,735580.7,1555836.0,13 days,Seasonal,0.63047,1.0,26.0,31.0,1790.0,train,102
4,4,1,0.0,0.0,0.0,0.0,368027.9,457704.0,13 days,Seasonal,2.505152,1.0,33.0,136.0,1766.0,train,103


In [226]:
from sklearn.preprocessing import OrdinalEncoder
final_df.set_index('Offer_ID') #total_sells - total_cost / total_cost = profit_pct
                               #duration to integers
final_df['pct_change'] = (final_df['total_sells'] - final_df['total_cost']) / final_df['total_cost']
final_df['duration'] = final_df['duration'].apply(lambda x: int(x.split()[0]))
enc_final = OrdinalEncoder()
enc_final.fit(final_df[['promotype']])
final_df[['promotype']] = enc_final.transform(final_df[['promotype']])

In [227]:
final_df.drop(['Unnamed: 0', 'total_cost', 'total_sells'], inplace=True, axis=1)

In [228]:
final_df.set_index('Offer_ID', inplace=True)

In [212]:
final_df.head()

Unnamed: 0_level_0,number_of_products,diversity_1,diversity_2,diversity_3,diversity_4,duration,promotype,uplift,most_freq_1,most_freq_2,most_freq_3,most_freq_4,train_test_group,pct_change
Offer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10,8,0.0,0.0,0.0,0.0,13,1.0,15.941591,1.0,21.0,191.0,515.0,train,0.531065
100,12,0.0,0.0,0.0,1.418296,13,3.0,,0.0,56.0,45.0,1103.0,test,-0.040186
101,1,0.0,0.0,0.0,0.0,13,3.0,1.761594,1.0,27.0,107.0,951.0,train,0.46002
102,11,0.0,0.0,0.0,0.0,13,3.0,0.63047,1.0,26.0,31.0,1790.0,train,1.115112
103,1,0.0,0.0,0.0,0.0,13,3.0,2.505152,1.0,33.0,136.0,1766.0,train,0.243666


In [211]:
enc_final.categories_

[array(['Billboards', 'Biweekly', 'Facades', 'Seasonal'], dtype=object)]

In [20]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-manylinux2010_x86_64.whl (166.7 MB)
[K     |████████████████████████████████| 166.7 MB 15 kB/s /s eta 0:00:01██████▎ | 157.6 MB 128.5 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


In [151]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, BayesianRidge, SGDRegressor
from sklearn.kernel_ridge import KernelRidge

In [103]:
train_features = ['number_of_products', 'diversity_1', 'diversity_2', 'diversity_3', 'diversity_4',
                  'duration', 'promotype', 'most_freq_1', 'most_freq_2', 'most_freq_3', 'most_freq_4',
                  'pct_change']
from sklearn.model_selection import train_test_split
train_df = final_df[final_df['train_test_group'] == 'train']
X = train_df[train_features].values
y = train_df['uplift'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
from sklearn.model_selection import KFold, GridSearchCV

cvKFold = KFold(n_splits=5, random_state=42, shuffle=True)

decisionTree_params = {'criterion': ['mse', 'friedman_mse',
                                     'mae', 'poisson']}

decitionTree_search = GridSearchCV(DecisionTreeRegressor(),
                                   decisionTree_params, cv=cvKFold)

decitionTree_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 168, in fit
    raise ValueError("Some value(s) of y are negative which is"
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

Traceback (most recent call last):
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/anaconda/envs/py38_pytorch/lib/python3.8/

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse', 'friedman_mse', 'mae',
                                       'poisson']})

In [118]:
clf_dt_best = decitionTree_search.best_estimator_
y_pred = clf_dt_best.predict(X_test)

In [120]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

5.210684531378378

In [122]:
randomforest_params = {'n_estimators': [10, 30, 50, 70, 100, 200, 300],
                       'criterion': ['mse', 'mae']}

randomforest_search = GridSearchCV(RandomForestRegressor(), randomforest_params, cv=cvKFold)

randomforest_search.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestRegressor(),
             param_grid={'criterion': ['mse', 'mae'],
                         'n_estimators': [10, 30, 50, 70, 100, 200, 300]})

In [123]:
clf_rf_best = randomforest_search.best_estimator_
y_pred = clf_rf_best.predict(X_test)
mean_absolute_error(y_test, y_pred)

3.88958612135648

In [147]:
for idx, offer_id in enumerate(df_sample_submission['Offer_ID'].values):
    temp = final_df.loc[offer_id]
    x_submission = temp[train_features].values.reshape((1,12))
    y_submission = clf_rf_best.predict(x_submission)
    df_sample_submission['UpLift'].iloc[idx] = y_submission

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

In [150]:
df_sample_submission.to_csv('submission_1_randomforest.csv', index=False)

In [146]:
df_sample_submission['Offer_ID'].values

array(['386', '182', '173', '375', '379', '63', '377', '381', '387',
       '178', '172', '376', '174', '383', '181', '388', '382', '176',
       '177', '374', '378', '370', '175', '372', '380', 'B140', 'B103',
       'F186', 'F175', 'F162', 'B99', 'F161', 'F196', 'B104', 'F204',
       'F184', 'F193', 'F185', 'B88', 'B94', 'B102', 'F195', 'F176',
       'F172', 'F191', 'F164', 'B89', 'B93', 'F188', 'F197', 'B87', 'B81',
       'F192', 'F177', 'F201', 'F190', 'F200', 'B90', 'F194', 'F178',
       'F180', 'B91', 'B96', 'B100', 'F187', 'B82', 'B95', 'B101', 'F182',
       'B97', 'B79', 'B86', 'F179', 'F174', 'F198', 'F171', 'F203',
       'F181', 'B105', 'F173', 'F183', 'B83', 'F199', 'F163', 'B98', '65',
       '443', '64', '439', '440', '431', '434', '61', '430', '62', '66',
       '441', '442', 'F216', '319', '436', 'F218', '321', '96', 'F220',
       '432', 'F217', '98', '316', '322', 'F215', '444', 'F219', '435',
       '433', '445', '318', '317', '236', '230', 'F205', '232', 'F208'

In [153]:
ridge_params = {'alpha': [0.2, 0.5, 0.8, 1.0, 1.5, 2.0],
                'solver': ['auto', 'svd', 'lsqr', 'sparse_sg', 'sag']}

ridge_search = GridSearchCV(Ridge(), ridge_params, cv=cvKFold)

ridge_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py", line 762, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py", line 593, in fit
    self.coef_, self.n_iter_ = _ridge_regression(
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py", line 397, in _ridge_regression
    raise ValueError("Known solvers are 'sparse_cg', 'cholesky', 'svd'"
ValueError: Known solvers are 'sparse_cg', 'cholesky', 'svd' 'lsqr', 'sag' or 'saga'. Got sparse_sg.

Traceback (most recent call last):
  File "/anaconda/envs/py38_pytorch/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Ridge(),
             param_grid={'alpha': [0.2, 0.5, 0.8, 1.0, 1.5, 2.0],
                         'solver': ['auto', 'svd', 'lsqr', 'sparse_sg', 'sag']})

In [155]:
clf_ridge_best = ridge_search.best_estimator_
y_pred = clf_ridge_best.predict(X_test)
mean_absolute_error(y_test, y_pred)

4.093214600459393

In [156]:
for idx, offer_id in enumerate(df_sample_submission['Offer_ID'].values):
    temp = final_df.loc[offer_id]
    x_submission = temp[train_features].values.reshape((1,12))
    y_submission = clf_ridge_best.predict(x_submission)
    df_sample_submission['UpLift'].iloc[idx] = y_submission

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

In [169]:
kernel_ridge_params = {'alpha': [0.01, 0.1, 1.0, 1.5, 2.0],
                       'kernel': ['linear', 'polynomial', 'sigmoid'],
                       'degree': [1, 2, 3, 4],
                       'coef0': [0.1, 1, 5, 10]}
kernel_ridge_search = GridSearchCV(KernelRidge(), kernel_ridge_params, cv=cvKFold)
kernel_ridge_search.fit(X_train, y_train)

  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=KernelRidge(),
             param_grid={'alpha': [0.01, 0.1, 1.0, 1.5, 2.0],
                         'coef0': [0.1, 1, 5, 10], 'degree': [1, 2, 3, 4],
                         'kernel': ['linear', 'polynomial', 'sigmoid']})

In [170]:
clf_kernel_ridge_best = kernel_ridge_search.best_estimator_
y_pred = clf_kernel_ridge_best.predict(X_test)
mean_absolute_error(y_test, y_pred)

4.209288882007753

In [171]:
for idx, offer_id in enumerate(df_sample_submission['Offer_ID'].values):
    temp = final_df.loc[offer_id]
    x_submission = temp[train_features].values.reshape((1,12))
    y_submission = clf_kernel_ridge_best.predict(x_submission)
    df_sample_submission['UpLift'].iloc[idx] = y_submission

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

In [173]:
df_sample_submission.to_csv('submission_3_kernelridge.csv', index=False)
df_sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Offer_ID  149 non-null    object 
 1   UpLift    149 non-null    float64
dtypes: float64(1), object(1)
memory usage: 2.5+ KB


In [174]:
from xgboost import XGBRegressor
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

params['eval_metric'] = "mae"
num_boost_round = 999


In [175]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:3.86221
[1]	Test-mae:3.58876
[2]	Test-mae:3.67201
[3]	Test-mae:3.69005
[4]	Test-mae:3.72784
[5]	Test-mae:3.76984
[6]	Test-mae:3.78050
[7]	Test-mae:3.79362
[8]	Test-mae:3.86548
[9]	Test-mae:3.92233
[10]	Test-mae:3.97039


In [176]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)



In [177]:
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,4.054882,0.094335,4.277621,0.401114
1,3.342952,0.086977,3.86735,0.390898
2,2.892698,0.071131,3.743494,0.409155
3,2.602583,0.053137,3.663708,0.413031


In [178]:
cv_results['test-mae-mean'].min()

3.6637084

In [179]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [180]:
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 3.7351985999999995 for 3 rounds
CV with max_depth=9, min_child_weight=6
	MAE 3.6751554 for 3 rounds
CV with max_depth=9, min_child_weight=7
	MAE 3.6436997999999994 for 3 rounds
CV with max_depth=10, min_child_weight=5
	MAE 3.746454 for 3 rounds
CV with max_depth=10, min_child_weight=6
	MAE 3.6740998000000005 for 5 rounds
CV with max_depth=10, min_child_weight=7
	MAE 3.646859 for 3 rounds
CV with max_depth=11, min_child_weight=5
	MAE 3.7250444000000003 for 4 rounds
CV with max_depth=11, min_child_weight=6
	MAE 3.6564343999999998 for 4 rounds
CV with max_depth=11, min_child_weight=7
	MAE 3.641883 for 3 rounds
Best params: 11, 7, MAE: 3.641883


In [181]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [182]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 3.6418828 for 3 rounds
CV with subsample=1.0, colsample=0.9
	MAE 3.6654416 for 4 rounds
CV with subsample=1.0, colsample=0.8
	MAE 3.6396524 for 4 rounds
CV with subsample=1.0, colsample=0.7
	MAE 3.6335290000000002 for 3 rounds
CV with subsample=0.9, colsample=1.0
	MAE 3.641648 for 4 rounds
CV with subsample=0.9, colsample=0.9
	MAE 3.6295425999999997 for 3 rounds
CV with subsample=0.9, colsample=0.8
	MAE 3.6729262 for 3 rounds
CV with subsample=0.9, colsample=0.7
	MAE 3.6552964 for 3 rounds
CV with subsample=0.8, colsample=1.0
	MAE 3.6784802 for 3 rounds
CV with subsample=0.8, colsample=0.9
	MAE 3.6499610000000002 for 4 rounds
CV with subsample=0.8, colsample=0.8
	MAE 3.7001756 for 3 rounds
CV with subsample=0.8, colsample=0.7
	MAE 3.631246 for 4 rounds
CV with subsample=0.7, colsample=1.0
	MAE 3.6327708000000003 for 3 rounds
CV with subsample=0.7, colsample=0.9
	MAE 3.6508104 for 3 rounds
CV with subsample=0.7, colsample=0.8
	MAE 3.654387599999

In [187]:
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
          )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
	MAE 3.6142399999999997 for 3 rounds

CV with eta=0.2
	MAE 3.6625370000000004 for 5 rounds

CV with eta=0.1
	MAE 3.5988554 for 15 rounds

CV with eta=0.05
	MAE 3.5599162 for 33 rounds

CV with eta=0.01
	MAE 3.5154432 for 173 rounds

CV with eta=0.005
	MAE 3.5217802 for 347 rounds

Best params: 0.01, MAE: 3.5154432


In [188]:
params

{'max_depth': 11,
 'min_child_weight': 7,
 'eta': 0.005,
 'subsample': 0.7,
 'colsample_bytree': 0.7,
 'objective': 'reg:linear',
 'eval_metric': 'mae'}

In [189]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:4.79306
[1]	Test-mae:4.77401
[2]	Test-mae:4.75655
[3]	Test-mae:4.73762
[4]	Test-mae:4.71951
[5]	Test-mae:4.70000
[6]	Test-mae:4.68101
[7]	Test-mae:4.66381
[8]	Test-mae:4.64899
[9]	Test-mae:4.63012
[10]	Test-mae:4.61335
[11]	Test-mae:4.59306
[12]	Test-mae:4.57557
[13]	Test-mae:4.55731
[14]	Test-mae:4.54071
[15]	Test-mae:4.52443
[16]	Test-mae:4.50683
[17]	Test-mae:4.49076
[18]	Test-mae:4.47531
[19]	Test-mae:4.45789
[20]	Test-mae:4.44138
[21]	Test-mae:4.42653
[22]	Test-mae:4.41026
[23]	Test-mae:4.39746
[24]	Test-mae:4.38146
[25]	Test-mae:4.36517
[26]	Test-mae:4.35160
[27]	Test-mae:4.33786
[28]	Test-mae:4.32493
[29]	Test-mae:4.30740
[30]	Test-mae:4.29205
[31]	Test-mae:4.27755
[32]	Test-mae:4.26285
[33]	Test-mae:4.25099
[34]	Test-mae:4.23767
[35]	Test-mae:4.22741
[36]	Test-mae:4.21541
[37]	Test-mae:4.20377
[38]	Test-mae:4.18974
[39]	Test-mae:4.17777
[40]	Test-mae:4.16339
[41]	Test-mae:4.15129
[42]	Test-mae:4.14183
[43]	Test-mae:4.12944
[44]	Test-mae:4.11816
[45]	Test-mae:4.1075

In [190]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:4.79306
[1]	Test-mae:4.77401
[2]	Test-mae:4.75654
[3]	Test-mae:4.73762
[4]	Test-mae:4.71951
[5]	Test-mae:4.70000
[6]	Test-mae:4.68101
[7]	Test-mae:4.66381
[8]	Test-mae:4.64899
[9]	Test-mae:4.63012
[10]	Test-mae:4.61335
[11]	Test-mae:4.59306
[12]	Test-mae:4.57557
[13]	Test-mae:4.55731
[14]	Test-mae:4.54071
[15]	Test-mae:4.52443
[16]	Test-mae:4.50683
[17]	Test-mae:4.49076
[18]	Test-mae:4.47531
[19]	Test-mae:4.45789
[20]	Test-mae:4.44138
[21]	Test-mae:4.42653
[22]	Test-mae:4.41026
[23]	Test-mae:4.39745
[24]	Test-mae:4.38146
[25]	Test-mae:4.36517
[26]	Test-mae:4.35160
[27]	Test-mae:4.33787
[28]	Test-mae:4.32493
[29]	Test-mae:4.30740
[30]	Test-mae:4.29205
[31]	Test-mae:4.27755
[32]	Test-mae:4.26285
[33]	Test-mae:4.25099
[34]	Test-mae:4.23767
[35]	Test-mae:4.22741
[36]	Test-mae:4.21540
[37]	Test-mae:4.20377
[38]	Test-mae:4.18974
[39]	Test-mae:4.17777
[40]	Test-mae:4.16339
[41]	Test-mae:4.15129
[42]	Test-mae:4.14183
[43]	Test-mae:4.12944
[44]	Test-mae:4.11816
[45]	Test-mae:4.1075

In [210]:
params

{'max_depth': 11,
 'min_child_weight': 7,
 'eta': 0.005,
 'subsample': 0.7,
 'colsample_bytree': 0.7,
 'objective': 'reg:linear',
 'eval_metric': 'mae'}

In [194]:
mean_absolute_error(best_model.predict(dtest), y_test)

3.4498291742183116

In [196]:
best_model.predict(dtest)
dtest_check = xgb.DMatrix(X_test)
best_model.predict(dtest_check)

array([3.5930972 , 2.0601358 , 2.8864458 , 3.4330957 , 2.6861017 ,
       1.8609277 , 4.058944  , 3.2080877 , 2.5117145 , 6.2394137 ,
       5.907185  , 2.467363  , 1.4451088 , 2.416274  , 0.6558658 ,
       1.400278  , 2.1039124 , 1.4262695 , 3.5516796 , 2.89685   ,
       2.5249686 , 7.153087  , 2.0148735 , 0.9616272 , 2.0504806 ,
       2.581698  , 3.5129247 , 3.1062267 , 2.4173028 , 2.6482759 ,
       4.608874  , 0.80097026, 6.4056416 , 2.9945705 , 5.3266535 ,
       4.159297  , 1.6044095 , 6.3490424 , 1.8799944 , 1.6634748 ,
       0.35603288, 2.7783139 , 3.4297566 , 0.8521674 , 2.268777  ,
       2.1700177 , 3.1089041 , 5.4029703 , 1.5668063 , 3.878774  ,
       5.773406  , 1.8184434 , 1.7785423 , 3.3643641 , 5.087581  ,
       2.2896345 , 5.5364065 , 2.8933733 , 2.198037  , 2.8320394 ,
       6.9044514 , 2.0346088 , 0.7074167 , 3.2142396 , 0.58918566,
       2.1215932 , 2.14869   , 2.8657246 , 2.1258957 , 3.4455957 ,
       2.6994817 , 3.0300505 , 4.199069  , 2.3463628 , 5.76643

In [201]:
## XGBOOST

for idx, offer_id in enumerate(df_sample_submission['Offer_ID'].values):
    temp = final_df.loc[offer_id]
    x_submission = temp[train_features].values.reshape((1,12))
    x_xgb_submission = xgb.DMatrix(x_submission)
    y_submission = best_model.predict(x_xgb_submission)
    df_sample_submission['UpLift'].iloc[idx] = y_submission

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [203]:
df_sample_submission.to_csv('./submission_5_xgb.csv', index=False)

In [205]:
from sklearn.ensemble import AdaBoostRegressor
ada_params = {'n_estimators': [50, 80, 100, 150],
              'learning_rate': [0.1, 0.01, 1],
              'loss': ['linear', 'square', 'exponential']}

ada_search = GridSearchCV(AdaBoostRegressor(), ada_params, verbose=3, cv=cvKFold)

ada_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END learning_rate=0.1, loss=linear, n_estimators=50; total time=   0.2s
[CV 2/5] END learning_rate=0.1, loss=linear, n_estimators=50; total time=   0.2s
[CV 3/5] END learning_rate=0.1, loss=linear, n_estimators=50; total time=   0.2s
[CV 4/5] END learning_rate=0.1, loss=linear, n_estimators=50; total time=   0.2s
[CV 5/5] END learning_rate=0.1, loss=linear, n_estimators=50; total time=   0.2s
[CV 1/5] END learning_rate=0.1, loss=linear, n_estimators=80; total time=   0.3s
[CV 2/5] END learning_rate=0.1, loss=linear, n_estimators=80; total time=   0.3s
[CV 3/5] END learning_rate=0.1, loss=linear, n_estimators=80; total time=   0.3s
[CV 4/5] END learning_rate=0.1, loss=linear, n_estimators=80; total time=   0.3s
[CV 5/5] END learning_rate=0.1, loss=linear, n_estimators=80; total time=   0.3s
[CV 1/5] END learning_rate=0.1, loss=linear, n_estimators=100; total time=   0.4s
[CV 2/5] END learning_rate=0.1, loss=linear, n

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=AdaBoostRegressor(),
             param_grid={'learning_rate': [0.1, 0.01, 1],
                         'loss': ['linear', 'square', 'exponential'],
                         'n_estimators': [50, 80, 100, 150]},
             verbose=3)

In [206]:
clf_ada_best = ada_search.best_estimator_
y_pred = clf_ada_best.predict(X_test)
mean_absolute_error(y_test, y_pred)

3.914889382239916

In [207]:
# Ada Boost

for idx, offer_id in enumerate(df_sample_submission['Offer_ID'].values):
    temp = final_df.loc[offer_id]
    x_submission = temp[train_features].values.reshape((1,12))
    y_submission = clf_ada_best.predict(x_submission)
    df_sample_submission['UpLift'].iloc[idx] = y_submission

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

In [209]:
df_sample_submission.to_csv('submission_6_adaboost.csv', index=False)

# Dropped most_freq3, most_freq4

In [220]:
train_df_dropped = train_df.drop(['most_freq_3', 'most_freq_4'], axis=1)
train_features_dropped = ['number_of_products', 'diversity_1', 'diversity_2', 'diversity_3', 'diversity_4',
                          'duration', 'promotype', 'most_freq_1', 'most_freq_2',
                          'pct_change']
X = train_df_dropped[train_features_dropped].values
y = train_df_dropped['uplift'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [222]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:4.79389
[1]	Test-mae:4.77338
[2]	Test-mae:4.75703
[3]	Test-mae:4.73772
[4]	Test-mae:4.71878
[5]	Test-mae:4.69961
[6]	Test-mae:4.68049
[7]	Test-mae:4.66092
[8]	Test-mae:4.64351
[9]	Test-mae:4.62601
[10]	Test-mae:4.60844
[11]	Test-mae:4.58899
[12]	Test-mae:4.57079
[13]	Test-mae:4.55457
[14]	Test-mae:4.53873
[15]	Test-mae:4.52385
[16]	Test-mae:4.50800
[17]	Test-mae:4.49174
[18]	Test-mae:4.47675
[19]	Test-mae:4.46018
[20]	Test-mae:4.44478
[21]	Test-mae:4.42941
[22]	Test-mae:4.41534
[23]	Test-mae:4.39938
[24]	Test-mae:4.38571
[25]	Test-mae:4.36829
[26]	Test-mae:4.35282
[27]	Test-mae:4.33825
[28]	Test-mae:4.32392
[29]	Test-mae:4.31120
[30]	Test-mae:4.29623
[31]	Test-mae:4.28234
[32]	Test-mae:4.27134
[33]	Test-mae:4.25622
[34]	Test-mae:4.24212
[35]	Test-mae:4.23002
[36]	Test-mae:4.21835
[37]	Test-mae:4.20534
[38]	Test-mae:4.19220
[39]	Test-mae:4.18225
[40]	Test-mae:4.16994
[41]	Test-mae:4.15937
[42]	Test-mae:4.14697
[43]	Test-mae:4.13290
[44]	Test-mae:4.12126
[45]	Test-mae:4.1114

In [223]:
mean_absolute_error(best_model.predict(dtest), y_test)

3.4444593498230454

In [230]:
## XGBOOST dropped

for idx, offer_id in enumerate(df_sample_submission['Offer_ID'].values):
    temp = final_df.loc[offer_id]
    x_submission = temp[train_features_dropped].values.reshape((1,10))
    x_xgb_submission = xgb.DMatrix(x_submission)
    y_submission = best_model.predict(x_xgb_submission)
    df_sample_submission['UpLift'].iloc[idx] = y_submission

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [232]:
df_sample_submission.to_csv('./submission_10_xgboostdropped.csv', index=False)

# Correlation

In [251]:
test_df = final_df[final_df['train_test_group'] == 'test']
for idx in test_df.index.values:
    x_dat = test_df.loc[idx][train_features_dropped].values.reshape((1,10))
    x_dat = xgb.DMatrix(x_dat)
    uplift = best_model.predict(x_dat)
    test_df.loc[idx, 'uplift'] = uplift

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [252]:
test_df

Unnamed: 0_level_0,number_of_products,diversity_1,diversity_2,diversity_3,diversity_4,duration,promotype,uplift,most_freq_1,most_freq_2,most_freq_3,most_freq_4,train_test_group,pct_change
Offer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100,12,0.0,0.0,0.000000,1.418296,13,3.0,2.637078,0.0,56.0,45.0,1103.0,test,-0.040186
172,20,0.0,0.0,0.000000,0.000000,13,1.0,2.954882,0.0,55.0,290.0,1362.0,test,0.148496
173,5,0.0,0.0,0.000000,0.000000,13,1.0,5.412592,1.0,10.0,366.0,1169.0,test,0.598428
174,2,0.0,0.0,0.000000,0.000000,13,1.0,5.619349,1.0,26.0,24.0,1624.0,test,0.707571
175,40,0.0,0.0,0.970951,1.811037,13,1.0,1.650677,0.0,56.0,324.0,99.0,test,0.023548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F216,2,0.0,0.0,0.000000,0.000000,6,2.0,1.713539,1.0,28.0,167.0,1770.0,test,0.176145
F217,3,0.0,0.0,0.918296,1.584963,6,2.0,2.137022,1.0,14.0,86.0,4.0,test,0.439634
F218,2,0.0,0.0,1.000000,1.000000,6,2.0,0.981655,1.0,50.0,192.0,1182.0,test,0.146987
F219,6,0.0,0.0,0.000000,0.000000,6,2.0,4.446361,1.0,6.0,217.0,573.0,test,0.463911


In [269]:
test_df_dropped = test_df.drop(['train_test_group'], axis=1)
cols = test_df_dropped.columns.to_list()

In [270]:
cols = cols[:7] + cols[8:]
cols.append('uplift')
test_df_dropped = test_df_dropped[cols]

In [272]:
test_df_dropped.head()

Unnamed: 0_level_0,number_of_products,diversity_1,diversity_2,diversity_3,diversity_4,duration,promotype,most_freq_1,most_freq_2,most_freq_3,most_freq_4,pct_change,uplift
Offer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100,12,0.0,0.0,0.0,1.418296,13,3.0,0.0,56.0,45.0,1103.0,-0.040186,2.637078
172,20,0.0,0.0,0.0,0.0,13,1.0,0.0,55.0,290.0,1362.0,0.148496,2.954882
173,5,0.0,0.0,0.0,0.0,13,1.0,1.0,10.0,366.0,1169.0,0.598428,5.412592
174,2,0.0,0.0,0.0,0.0,13,1.0,1.0,26.0,24.0,1624.0,0.707571,5.619349
175,40,0.0,0.0,0.970951,1.811037,13,1.0,0.0,56.0,324.0,99.0,0.023548,1.650677


In [276]:
test_df_dropped.corr()['uplift'][:]

number_of_products   -0.206771
diversity_1                NaN
diversity_2          -0.068341
diversity_3          -0.361696
diversity_4          -0.194266
duration              0.343850
promotype            -0.197747
most_freq_1           0.161738
most_freq_2          -0.397977
most_freq_3           0.117316
most_freq_4           0.003139
pct_change            0.288458
uplift                1.000000
Name: uplift, dtype: float64

In [278]:
billboard_mean = test_df_dropped[test_df_dropped['promotype'] == 1.0]['uplift'].mean()
biweekly_mean = test_df_dropped[test_df_dropped['promotype'] == 2.0]['uplift'].mean()
facades_mean = test_df_dropped[test_df_dropped['promotype'] == 3.0]['uplift'].mean()
seasonal_mean = test_df_dropped[test_df_dropped['promotype'] == 4.0]['uplift'].mean()

In [279]:
print(billboard_mean, biweekly_mean, facades_mean, seasonal_mean)

4.383321343398675 2.3894107423576654 2.5893180334206782 nan
