# Main 

Zexuan: The main file for category ML model production

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import sqlite3
from sqlalchemy.engine import create_engine

from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from tpot import TPOTClassifier

import tpot_functions 
import periods 
import utils

from utils import featurize_date_col


Define some key variables and numbers

In [3]:
target = 'deal_probability'
LIMIT = None
TIMEOUT_MINS = 30
#SCORING = 'neg_mean_squared_error'
RS = 27

In [4]:
#train = pd.read_csv('data/train.csv')[:LIMIT]
train = pd.read_csv('data/train.csv', parse_dates=['activation_date'], infer_datetime_format=True)[:LIMIT]
train_pd = pd.read_csv('train_price_normalized.csv')[:LIMIT]
#train_periods = pd.read_csv('data/periods_aggregate_features_items.csv')

In [5]:
list(train)

['item_id',
 'user_id',
 'region',
 'city',
 'parent_category_name',
 'category_name',
 'param_1',
 'param_2',
 'param_3',
 'title',
 'description',
 'price',
 'item_seq_number',
 'activation_date',
 'user_type',
 'image',
 'image_top_1',
 'deal_probability']

In [6]:
train_yes = train[['description','param_1','param_2','param_3','image','image_top_1']].isnull().astype(int)
train_yes.head()

Unnamed: 0,description,param_1,param_2,param_3,image,image_top_1
0,0,0,1,1,0,0
1,0,0,1,1,0,0
2,0,0,1,1,0,0
3,0,0,1,1,0,0
4,0,0,0,0,0,0


In [7]:
#train_date = pd.read_csv('data/train.csv', parse_dates=['activation_date'], infer_datetime_format=True)
#train_date

In [8]:
#city = preprocessing.LabelEncoder()
#city.fit(train['city'])  
#list(city.classes_)
#train['city'] = city.transform(train['city'])

#userid = preprocessing.LabelEncoder()
#userid.fit(train['user_id'])  
#list(userid.classes_)
#train['user_id'] = userid.transform(train['user_id'])

In [9]:
#train = pd.concat([train_pd[['item_id', 'price', 'norm_price', target]],
#                  pd.get_dummies(train[['user_type','category_name', 'region', 'city', 'user_id', 'item_seq_number']])], 
#                  axis=1) 

train_pd = pd.concat([train_pd[['item_id', 'norm_price',target]], train_yes], axis=1)
train = pd.concat([train_pd,pd.get_dummies(train[['user_type','category_name', 'region','activation_date']])], axis=1)      


In [10]:
list(train)

['item_id',
 'norm_price',
 'deal_probability',
 'description',
 'param_1',
 'param_2',
 'param_3',
 'image',
 'image_top_1',
 'activation_date',
 'user_type_Company',
 'user_type_Private',
 'user_type_Shop',
 'category_name_Автомобили',
 'category_name_Аквариум',
 'category_name_Аудио и видео',
 'category_name_Билеты и путешествия',
 'category_name_Бытовая техника',
 'category_name_Велосипеды',
 'category_name_Водный транспорт',
 'category_name_Гаражи и машиноместа',
 'category_name_Готовый бизнес',
 'category_name_Грузовики и спецтехника',
 'category_name_Детская одежда и обувь',
 'category_name_Дома, дачи, коттеджи',
 'category_name_Другие животные',
 'category_name_Земельные участки',
 'category_name_Игры, приставки и программы',
 'category_name_Квартиры',
 'category_name_Книги и журналы',
 'category_name_Коллекционирование',
 'category_name_Коммерческая недвижимость',
 'category_name_Комнаты',
 'category_name_Кошки',
 'category_name_Красота и здоровье',
 'category_name_Мебель и ин

In [11]:
#train_new = pd.merge(train, train_periods, on='item_id')
#train_new

In [12]:
train_all = train.dropna()
train_all.head()

Unnamed: 0,item_id,norm_price,deal_probability,description,param_1,param_2,param_3,image,image_top_1,activation_date,...,region_Саратовская область,region_Свердловская область,region_Ставропольский край,region_Татарстан,region_Тульская область,region_Тюменская область,region_Удмуртия,region_Ханты-Мансийский АО,region_Челябинская область,region_Ярославская область
0,b912c3c6a6ad,-0.001144,0.12789,0,0,1,1,0,0,2017-03-28,...,0,1,0,0,0,0,0,0,0,0
1,2dac0150717d,-0.006929,0.0,0,0,1,1,0,0,2017-03-26,...,0,0,0,0,0,0,0,0,0,0
2,ba83aefab5dc,-0.002041,0.43177,0,0,1,1,0,0,2017-03-20,...,0,0,0,0,0,0,0,0,0,0
3,02996f1dd2ea,-0.001126,0.80323,0,0,1,1,0,0,2017-03-25,...,0,0,0,1,0,0,0,0,0,0
4,7c90be56d2ab,-0.002323,0.20797,0,0,0,0,0,0,2017-03-16,...,0,0,0,0,0,0,0,0,0,0


In [13]:
train_all = train_all.sort_values(by='activation_date')

In [14]:
train_all = train_all.rename(columns={'activation_date': 'item_activation_date'})
train_all = featurize_date_col(train_all, 'item_activation_date', remove_when_done=True)

In [15]:
train_all = train_all.set_index('item_id')
train_all.head()

Unnamed: 0_level_0,norm_price,deal_probability,description,param_1,param_2,param_3,image,image_top_1,user_type_Company,user_type_Private,...,region_Татарстан,region_Тульская область,region_Тюменская область,region_Удмуртия,region_Ханты-Мансийский АО,region_Челябинская область,region_Ярославская область,item_activation_date_isholiday,item_activation_date_wday,item_activation_date_yday
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
349806114e4f,-0.003418,0.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,2,74
c9a6482ed8bc,-0.012256,0.0,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,2,74
a4c713ae3e8f,-0.002166,0.0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,2,74
8d22cc5b3bc2,-0.000906,0.10173,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,2,74
961f7342f92a,-0.001646,0.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,2,74


In [16]:
X = (train_all.drop([target], axis=1)).values
y = train_all[target].values

tss = TimeSeriesSplit(n_splits=4)
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [17]:
# TimeSeriesSplit NOT RIGHT YET
#tss = TimeSeriesSplit(n_splits=4)
#train_index, test_index = list(tss.split(X))[-1]
#X_train, X_test = X[train_index], X[test_index]
#y_train, y_test = y[train_index], y[test_index]

TPOT functions

In [18]:
#tpot_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
#                      random_state=42, verbosity=2)

In [21]:
#tpot_optimizer = tpot_functions.TpotAutoml(mode='regression', max_time_mins=TIMEOUT_MINS, 
#                                           scoring=SCORING, random_state=RS, n_jobs=-1,verbosity=2,
#                                           generations = 2, population_size=15)


In [20]:
#tpot_optimizer.fit(X_train, y_train)

In [None]:
#top_scores = tpot_optimizer.get_top_models(return_scores=True)
#print('\ntop cv scores:')
#print(top_scores)
#print('\ntop models')
#print(tpot_optimizer.top_models)
#print('\nthe best test score:')
#test_score = tpot_optimizer.score(X_test, y_test)
#print(test_score)

Try RandomForestRegressor model 

In [79]:
#RFR = RandomForestRegressor(random_state=0, n_estimators=100)

#config_dict = {'sklearn.ensemble.RandomForestRegressor': {
#        'n_estimators': [100, 200, 400],
#        'loss': ["ls", "lad", "huber", "quantile"],
#        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
#        'max_depth': range(1, 11),
#        'min_samples_split': range(2, 21),
#        'min_samples_leaf': range(1, 21),
#        'subsample': np.arange(0.05, 1.01, 0.05),
#        'max_features': np.arange(0.05, 1.01, 0.05),
#        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
#    }}
#RSCV = RandomizedSearchCV(estimator=GBR, random_state=RS,
#                          param_distributions=config_dict['sklearn.ensemble.RandomForestRegressor'],
#                         n_iter=5,
#                         scoring='r2',
#                         verbose=2)

In [80]:
#RSCV.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] subsample=0.2, n_estimators=400, min_samples_split=2, min_samples_leaf=11, max_features=0.9500000000000001, max_depth=3, loss=ls, learning_rate=1.0, alpha=0.75 
[CV]  subsample=0.2, n_estimators=400, min_samples_split=2, min_samples_leaf=11, max_features=0.9500000000000001, max_depth=3, loss=ls, learning_rate=1.0, alpha=0.75, total= 7.8min
[CV] subsample=0.2, n_estimators=400, min_samples_split=2, min_samples_leaf=11, max_features=0.9500000000000001, max_depth=3, loss=ls, learning_rate=1.0, alpha=0.75 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.0min remaining:    0.0s


[CV]  subsample=0.2, n_estimators=400, min_samples_split=2, min_samples_leaf=11, max_features=0.9500000000000001, max_depth=3, loss=ls, learning_rate=1.0, alpha=0.75, total= 8.5min
[CV] subsample=0.2, n_estimators=400, min_samples_split=2, min_samples_leaf=11, max_features=0.9500000000000001, max_depth=3, loss=ls, learning_rate=1.0, alpha=0.75 
[CV]  subsample=0.2, n_estimators=400, min_samples_split=2, min_samples_leaf=11, max_features=0.9500000000000001, max_depth=3, loss=ls, learning_rate=1.0, alpha=0.75, total= 8.4min
[CV] subsample=0.9500000000000001, n_estimators=200, min_samples_split=18, min_samples_leaf=5, max_features=0.6500000000000001, max_depth=6, loss=huber, learning_rate=0.5, alpha=0.75 
[CV]  subsample=0.9500000000000001, n_estimators=200, min_samples_split=18, min_samples_leaf=5, max_features=0.6500000000000001, max_depth=6, loss=huber, learning_rate=0.5, alpha=0.75, total=17.6min
[CV] subsample=0.9500000000000001, n_estimators=200, min_samples_split=18, min_samples_le

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 123.0min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'n_estimators': [100, 200, 400], 'loss': ['ls', 'lad', 'huber', 'quantile'], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21), 'subsample': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.3...0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]},
  

Try GradientBoostingRegressor model 

In [10]:
#GBR = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="ls",
#                                              max_features=0.9, min_samples_leaf=5,
#                                              min_samples_split=6)

In [62]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
config_dict = {'sklearn.ensemble.GradientBoostingRegressor': {
        'n_estimators': [100, 200, 400],
        'loss': ["ls", "lad", "huber", "quantile"],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05),
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
    },
        'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [100, 200, 400],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    },
              }

GBR  = GradientBoostingRegressor()
RSCV = RandomizedSearchCV(estimator=GBR, random_state=RS,
                          param_distributions=config_dict['sklearn.ensemble.GradientBoostingRegressor'],
                         n_iter=20,
                         scoring='r2',
                         verbose=2)



In [63]:
RSCV.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 
[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total=47.7min
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 47.9min remaining:    0.0s


[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total=50.9min
[CV] subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9 
[CV]  subsample=0.6500000000000001, n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.35000000000000003, max_depth=9, loss=ls, learning_rate=0.001, alpha=0.9, total=45.4min
[CV] subsample=0.9000000000000001, n_estimators=400, min_samples_split=17, min_samples_leaf=19, max_features=0.45, max_depth=2, loss=quantile, learning_rate=0.1, alpha=0.99 
[CV]  subsample=0.9000000000000001, n_estimators=400, min_samples_split=17, min_samples_leaf=19, max_features=0.45, max_depth=2, loss=quantile, learning_rate=0.1, alpha=0.99, total= 5.9min
[CV] subsample=0.9000000000000001, n_estimators=400, min_samp

[CV]  subsample=1.0, n_estimators=400, min_samples_split=10, min_samples_leaf=2, max_features=0.25, max_depth=7, loss=ls, learning_rate=0.01, alpha=0.75, total=20.0min
[CV] subsample=1.0, n_estimators=400, min_samples_split=10, min_samples_leaf=2, max_features=0.25, max_depth=7, loss=ls, learning_rate=0.01, alpha=0.75 
[CV]  subsample=1.0, n_estimators=400, min_samples_split=10, min_samples_leaf=2, max_features=0.25, max_depth=7, loss=ls, learning_rate=0.01, alpha=0.75, total=19.9min
[CV] subsample=0.6000000000000001, n_estimators=100, min_samples_split=3, min_samples_leaf=8, max_features=0.6500000000000001, max_depth=2, loss=lad, learning_rate=0.5, alpha=0.99 
[CV]  subsample=0.6000000000000001, n_estimators=100, min_samples_split=3, min_samples_leaf=8, max_features=0.6500000000000001, max_depth=2, loss=lad, learning_rate=0.5, alpha=0.99, total= 1.8min
[CV] subsample=0.6000000000000001, n_estimators=100, min_samples_split=3, min_samples_leaf=8, max_features=0.6500000000000001, max_dep

[CV]  subsample=0.7500000000000001, n_estimators=400, min_samples_split=16, min_samples_leaf=6, max_features=0.55, max_depth=5, loss=ls, learning_rate=1.0, alpha=0.9, total=19.5min
[CV] subsample=0.7500000000000001, n_estimators=400, min_samples_split=16, min_samples_leaf=6, max_features=0.55, max_depth=5, loss=ls, learning_rate=1.0, alpha=0.9 
[CV]  subsample=0.7500000000000001, n_estimators=400, min_samples_split=16, min_samples_leaf=6, max_features=0.55, max_depth=5, loss=ls, learning_rate=1.0, alpha=0.9, total=19.5min
[CV] subsample=0.7500000000000001, n_estimators=400, min_samples_split=16, min_samples_leaf=6, max_features=0.55, max_depth=5, loss=ls, learning_rate=1.0, alpha=0.9 
[CV]  subsample=0.7500000000000001, n_estimators=400, min_samples_split=16, min_samples_leaf=6, max_features=0.55, max_depth=5, loss=ls, learning_rate=1.0, alpha=0.9, total=18.8min
[CV] subsample=0.8, n_estimators=100, min_samples_split=20, min_samples_leaf=14, max_features=0.7000000000000001, max_depth=1

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 1674.4min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'n_estimators': [100, 200, 400], 'loss': ['ls', 'lad', 'huber', 'quantile'], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21), 'subsample': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.3...0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]), 'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]},
  

In [64]:
mse = mean_squared_error(y_test, RSCV.predict(X_test))
print("MSE: %.4f" % mse)

MSE: 0.0556


In [65]:
# Does not make sense
r2 = r2_score(y_test, RSCV.predict(X_test))
print("R2: %.4f" % r2)

R2: 0.1554


In [66]:
results = RSCV.predict(X_test)
results

array([0.12774658, 0.12875546, 0.17323898, ..., 0.27985864, 0.21808471,
       0.13581655])

In [1]:
import pickle
#filename = 'main_category3.sav'
#pickle.dump(RSCV, open(filename, 'wb'))

NameError: name 'RSCV' is not defined

In [17]:
import pickle
filename = 'main_category3.sav'
RSCV = pickle.load(open(filename, 'rb'))

In [19]:
y_train = RSCV.predict(X)

In [22]:
train_all.item_id

AttributeError: 'DataFrame' object has no attribute 'item_id'

In [24]:
d = {'item_id': train_all.index, 'deal_probability': y_train}
train_output = pd.DataFrame(data=d)
train_output.to_csv('predictions/tabular_train.csv')


Load test.csv file and compute 

In [25]:
test = pd.read_csv('data/test.csv', parse_dates=['activation_date'], infer_datetime_format=True)[:LIMIT]
test_pd = pd.read_csv('train_price_normalized.csv')[:LIMIT]

In [26]:
list(test)

['item_id',
 'user_id',
 'region',
 'city',
 'parent_category_name',
 'category_name',
 'param_1',
 'param_2',
 'param_3',
 'title',
 'description',
 'price',
 'item_seq_number',
 'activation_date',
 'user_type',
 'image',
 'image_top_1']

In [27]:
test_yes = test[['description','param_1','param_2','param_3','image','image_top_1']].isnull().astype(int)
test_yes.head()

Unnamed: 0,description,param_1,param_2,param_3,image,image_top_1
0,0,0,0,0,0,0
1,0,0,1,1,1,1
2,0,0,1,1,0,0
3,0,0,0,1,1,1
4,0,0,1,1,0,0


In [28]:
test_pd = pd.concat([test_pd[['item_id', 'norm_price']], test_yes], axis=1)
test = pd.concat([test_pd,pd.get_dummies(test[['user_type','category_name', 'region','activation_date']])], axis=1)      

In [29]:
list(test)

['item_id',
 'norm_price',
 'description',
 'param_1',
 'param_2',
 'param_3',
 'image',
 'image_top_1',
 'activation_date',
 'user_type_Company',
 'user_type_Private',
 'user_type_Shop',
 'category_name_Автомобили',
 'category_name_Аквариум',
 'category_name_Аудио и видео',
 'category_name_Билеты и путешествия',
 'category_name_Бытовая техника',
 'category_name_Велосипеды',
 'category_name_Водный транспорт',
 'category_name_Гаражи и машиноместа',
 'category_name_Готовый бизнес',
 'category_name_Грузовики и спецтехника',
 'category_name_Детская одежда и обувь',
 'category_name_Дома, дачи, коттеджи',
 'category_name_Другие животные',
 'category_name_Земельные участки',
 'category_name_Игры, приставки и программы',
 'category_name_Квартиры',
 'category_name_Книги и журналы',
 'category_name_Коллекционирование',
 'category_name_Коммерческая недвижимость',
 'category_name_Комнаты',
 'category_name_Кошки',
 'category_name_Красота и здоровье',
 'category_name_Мебель и интерьер',
 'category_n

In [30]:
test_all = test.dropna()
test_all.head()

Unnamed: 0,item_id,norm_price,description,param_1,param_2,param_3,image,image_top_1,activation_date,user_type_Company,...,region_Саратовская область,region_Свердловская область,region_Ставропольский край,region_Татарстан,region_Тульская область,region_Тюменская область,region_Удмуртия,region_Ханты-Мансийский АО,region_Челябинская область,region_Ярославская область
0,b912c3c6a6ad,-0.001144,0.0,0.0,0.0,0.0,0.0,0.0,2017-04-18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2dac0150717d,-0.006929,0.0,0.0,1.0,1.0,1.0,1.0,2017-04-16,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ba83aefab5dc,-0.002041,0.0,0.0,1.0,1.0,0.0,0.0,2017-04-17,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,02996f1dd2ea,-0.001126,0.0,0.0,0.0,1.0,1.0,1.0,2017-04-17,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7c90be56d2ab,-0.002323,0.0,0.0,1.0,1.0,0.0,0.0,2017-04-15,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
test_all = test_all.sort_values(by='activation_date')

In [32]:
test_all = test_all.rename(columns={'activation_date': 'item_activation_date'})
test_all = featurize_date_col(test_all, 'item_activation_date', remove_when_done=True)

In [33]:
test_all = test_all.set_index('item_id')
test_all.head()

Unnamed: 0_level_0,norm_price,description,param_1,param_2,param_3,image,image_top_1,user_type_Company,user_type_Private,user_type_Shop,...,region_Татарстан,region_Тульская область,region_Тюменская область,region_Удмуртия,region_Ханты-Мансийский АО,region_Челябинская область,region_Ярославская область,item_activation_date_isholiday,item_activation_date_wday,item_activation_date_yday
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cb9ef110ab72,-0.002013,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,102
0798f62fa89a,-0.005856,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,102
e0913e2c8949,0.073953,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,2,102
9c6ccea00772,-0.001636,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,102
d9e7c31b484e,-0.0041,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,2,102


In [34]:
test_id = test_all.index
test2 = test_all

test2_probability = RSCV.predict(test2)
test2_probability


array([0.12037002, 0.16437036, 0.24334344, ..., 0.19429552, 0.0711233 ,
       0.16254484])

In [35]:
print(len(test2))
print(len(test2_probability))
print(len(test_id))

479516
479516
479516


In [36]:
d = {'item_id': test_id, 'deal_probability': test2_probability}
test_output = pd.DataFrame(data=d)
test_output

Unnamed: 0,item_id,deal_probability
0,cb9ef110ab72,0.120370
1,0798f62fa89a,0.164370
2,e0913e2c8949,0.243343
3,9c6ccea00772,0.217760
4,d9e7c31b484e,0.174191
5,9cba3aaab555,0.155752
6,752af4d8f70e,0.146125
7,b352a3a0267d,0.053525
8,fd664c9054c6,0.228341
9,29cc8ae1b2cf,0.353961


In [37]:
test_output.to_csv('tabular3.csv')