In [19]:
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [8]:
%%time
# Configure notebook display settings to only use 2 decimal places, tables look nicer.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 15) 
pd.set_option('display.max_rows', 50)


CPU times: user 68 µs, sys: 6 µs, total: 74 µs
Wall time: 78 µs


In [9]:
# Setting random value for the dataframe
SEED = 40

In [11]:
#Reading the train file 
ROOT_PATH = '/kaggle/input/playground-series-s3e11'
train = pd.read_csv(ROOT_PATH+'/train.csv')
train.head()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,...,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,0,8.61,3.0,2.0,2.0,2.0,10.3,...,36509.0,0.0,0.0,0.0,0.0,0.0,62.09
1,1,5.0,2.0,4.0,0.0,3.0,6.66,...,28206.0,1.0,0.0,0.0,0.0,0.0,121.8
2,2,14.08,4.0,0.0,0.0,3.0,21.3,...,21215.0,1.0,0.0,0.0,0.0,0.0,83.51
3,3,4.02,3.0,5.0,0.0,0.0,14.8,...,21215.0,1.0,0.0,0.0,0.0,0.0,66.78
4,4,2.13,3.0,5.0,0.0,3.0,17.0,...,27694.0,1.0,1.0,1.0,1.0,1.0,111.51


In [5]:
#Reading the test file
test = pd.read_csv(ROOT_PATH+'/test.csv')
test.head()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist
0,360336,7.24,4.0,1.0,0.0,2.0,10.8,0.0,1.0,7.0,20319.0,0.0,0.0,0.0,0.0,0.0
1,360337,6.9,2.0,2.0,2.0,3.0,8.51,1.0,0.0,4.0,33858.0,1.0,0.0,1.0,1.0,1.0
2,360338,8.34,3.0,0.0,0.0,3.0,8.77,0.0,1.0,14.0,39696.0,0.0,0.0,1.0,1.0,0.0
3,360339,5.48,2.0,3.0,3.0,2.0,21.9,1.0,0.0,9.0,23688.0,1.0,1.0,1.0,1.0,1.0
4,360340,4.8,3.0,2.0,0.0,2.0,10.9,1.0,0.0,11.0,27694.0,1.0,1.0,1.0,1.0,1.0


In [6]:
#Reading the sample_submission file
sample = pd.read_csv(ROOT_PATH+'/sample_submission.csv')
sample.head()

Unnamed: 0,id,cost
0,360336,99.615
1,360337,99.615
2,360338,99.615
3,360339,99.615
4,360340,99.615


In [13]:
#Describing the datasets 
train.describe()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,...,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
count,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,...,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0,360336.0
mean,180167.5,6.34,3.04,2.46,0.69,2.2,13.82,...,28180.33,0.56,0.28,0.5,0.5,0.5,99.61
std,104020.19,3.31,0.78,1.49,1.21,1.08,4.61,...,5968.87,0.5,0.45,0.5,0.5,0.5,29.94
min,0.0,0.51,1.0,0.0,0.0,0.0,6.0,...,20319.0,0.0,0.0,0.0,0.0,0.0,50.79
25%,90083.75,3.72,3.0,1.0,0.0,1.0,9.71,...,23593.0,0.0,0.0,0.0,0.0,0.0,70.32
50%,180167.5,5.78,3.0,2.0,0.0,2.0,13.6,...,27694.0,1.0,0.0,1.0,1.0,1.0,98.81
75%,270251.25,8.4,4.0,4.0,1.0,3.0,17.7,...,33858.0,1.0,1.0,1.0,1.0,1.0,126.62
max,360335.0,22.92,6.0,5.0,5.0,4.0,21.9,...,39696.0,1.0,1.0,1.0,1.0,1.0,149.75


In [14]:
# Checking if there are any null values in train datasets
train.isnull().sum()

id                            0
store_sales(in millions)      0
unit_sales(in millions)       0
total_children                0
num_children_at_home          0
avg_cars_at home(approx).1    0
gross_weight                  0
recyclable_package            0
low_fat                       0
units_per_case                0
store_sqft                    0
coffee_bar                    0
video_store                   0
salad_bar                     0
prepared_food                 0
florist                       0
cost                          0
dtype: int64

In [16]:
# Checking the duplicate values in the train dataset
train.duplicated().sum()

0

In [18]:
# Checking the shape of train datasets
train.shape

# rows = 360336
# cols = 17

(360336, 17)

In [21]:
media_campaign = pd.read_csv('/kaggle/input/media-campaign-cost-prediction/train_dataset.csv')
media_campaign.head()

Unnamed: 0,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,...,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,2.68,2.0,1.0,0.0,2.0,6.3,1.0,...,30584.0,1.0,1.0,1.0,1.0,1.0,79.59
1,5.73,3.0,5.0,5.0,3.0,18.7,1.0,...,20319.0,0.0,0.0,0.0,0.0,0.0,118.36
2,2.62,2.0,1.0,1.0,1.0,9.21,0.0,...,20319.0,0.0,0.0,0.0,0.0,0.0,67.2
3,11.73,3.0,1.0,1.0,3.0,10.9,1.0,...,23112.0,1.0,1.0,1.0,1.0,1.0,60.87
4,4.82,2.0,1.0,1.0,3.0,8.15,0.0,...,38382.0,0.0,0.0,0.0,0.0,0.0,86.79


In [23]:
cat_cols = ["unit_sales(in millions)", "total_children", "num_children_at_home", "avg_cars_at home(approx).1", "recyclable_package",
"low_fat", "store_sqft", "coffee_bar", "video_store", "salad_bar", "prepared_food"]

In [24]:
df_full = pd.concat([train, media_campaign])
for col in cat_cols:
    df_full[col] = df_full[col].astype('int64')

In [26]:
X_train, X_val, y_train, y_val = train_test_split(
    df_full.drop(["id", "cost"], axis=1),df_full["cost"], test_size=0.2, random_state=SEED)

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
eval_pool = Pool(X_val, y_val, cat_features=cat_cols)
clf = CatBoostRegressor(iterations=1300, learning_rate=0.015, early_stopping_rounds=200)
clf.fit(train_pool, eval_set=eval_pool)

0:	learn: 29.9156391	test: 29.9731860	best: 29.9731860 (0)	total: 258ms	remaining: 5m 34s
1:	learn: 29.8939974	test: 29.9519569	best: 29.9519569 (1)	total: 379ms	remaining: 4m 6s
2:	learn: 29.8726971	test: 29.9310346	best: 29.9310346 (2)	total: 565ms	remaining: 4m 4s
3:	learn: 29.8516256	test: 29.9103815	best: 29.9103815 (3)	total: 739ms	remaining: 3m 59s
4:	learn: 29.8326619	test: 29.8921226	best: 29.8921226 (4)	total: 896ms	remaining: 3m 51s
5:	learn: 29.8128124	test: 29.8726919	best: 29.8726919 (5)	total: 1.02s	remaining: 3m 39s
6:	learn: 29.7949470	test: 29.8555710	best: 29.8555710 (6)	total: 1.14s	remaining: 3m 30s
7:	learn: 29.7763064	test: 29.8375822	best: 29.8375822 (7)	total: 1.25s	remaining: 3m 21s
8:	learn: 29.7580067	test: 29.8196325	best: 29.8196325 (8)	total: 1.4s	remaining: 3m 20s
9:	learn: 29.7408577	test: 29.8028343	best: 29.8028343 (9)	total: 1.53s	remaining: 3m 17s
10:	learn: 29.7237462	test: 29.7863604	best: 29.7863604 (10)	total: 1.64s	remaining: 3m 12s
11:	learn: 

<catboost.core.CatBoostRegressor at 0x7f5361553cd0>

In [27]:
for col in cat_cols:
    test[col] = test[col].astype('int64')

In [28]:
test["cost"] = clf.predict(test.drop("id", axis=1))


In [29]:
test[["id", "cost"]].to_csv("submission.csv", index=False)

In [30]:
actual_submission = pd.read_csv('/kaggle/working/submission.csv')
actual_submission.head()

Unnamed: 0,id,cost
0,360336,99.08
1,360337,97.5
2,360338,97.05
3,360339,102.94
4,360340,84.18
