In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

PATH='data/avito/'

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
from io import StringIO
from matplotlib_venn import venn2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
color = sns.color_palette()

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [4]:
train_df = pd.read_csv(f'{PATH}train.csv', parse_dates=["activation_date"])
test_df = pd.read_csv(f'{PATH}test.csv', parse_dates=["activation_date"])
traindex = train_df.index
testdex = test_df.index


In [5]:
train_image_feat = pd.read_csv(f'{PATH}Image_KP_SCORES.csv')
test_image_feat = pd.read_csv(f'{PATH}Image_KP_SCORES_test.csv')
train_image_feat = train_image_feat.rename(columns={'Image_kp_score': 'image_score'})
test_image_feat = test_image_feat.rename(columns={'Image_kp_score': 'image_score'})

In [6]:
len(train_df), len(test_df)

(1503424, 508438)

In [7]:
train_df = train_df.merge(train_image_feat, on='image', how='left')
test_df = test_df.merge(test_image_feat, on='image', how='left')

In [8]:
len(train_df), len(test_df)

(1503424, 508438)

In [9]:
agg_feat = pd.read_csv(f'{PATH}aggregated_features.csv')

In [10]:
train_df = train_df.merge(agg_feat, on='user_id', how='left')
test_df = test_df.merge(agg_feat, on='user_id', how='left')

In [11]:
len(train_df), len(test_df)

(1503424, 508438)

In [12]:
train_df.columns

Index(['item_id', 'user_id', 'region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'activation_date',
       'user_type', 'image', 'image_top_1', 'deal_probability', 'image_score',
       'avg_days_up_user', 'avg_times_up_user', 'n_user_items'],
      dtype='object')

In [13]:
add_datepart(train_df, 'activation_date')
add_datepart(test_df, 'activation_date')

In [14]:
textfeats = ["description", "title"]

for cols in textfeats:
    train_df[cols] = train_df[cols].astype(str) 
    train_df[cols] = train_df[cols].astype(str).fillna('missing') 
    train_df[cols] = train_df[cols].str.lower()
    test_df[cols] = test_df[cols].astype(str) 
    test_df[cols] = test_df[cols].astype(str).fillna('missing') 
    test_df[cols] = test_df[cols].str.lower()
    train_df['num_words_' + cols] = train_df[cols].apply(lambda comment: len(comment.split()))
    train_df['num_unique_words_' + cols] = train_df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    test_df['num_words_' + cols] = test_df[cols].apply(lambda comment: len(comment.split()))
    test_df['num_unique_words_' + cols] = test_df[cols].apply(lambda comment: len(set(w for w in comment.split())))

In [15]:
count = lambda l1,l2: sum([1 for x in l1 if x in l2])

test_df['words_vs_unique_title'] = test_df['num_unique_words_title'] / test_df['num_words_title'] * 100
test_df['words_vs_unique_description'] = test_df['num_unique_words_description'] / test_df['num_words_description'] * 100
    
test_df['city'] = test_df['region'] + '_' + test_df['city']
test_df['num_desc_punct'] = test_df['description'].apply(lambda x: count(x, set(string.punctuation)))

train_df['words_vs_unique_title'] = train_df['num_unique_words_title'] / train_df['num_words_title'] * 100
train_df['words_vs_unique_description'] = train_df['num_unique_words_description'] / train_df['num_words_description'] * 100
    
train_df['city'] = train_df['region'] + '_' + train_df['city']
train_df['num_desc_punct'] = train_df['description'].apply(lambda x: count(x, set(string.punctuation)))

train_df['words_vs_unique_title'].value_counts()

In [16]:
### TFIDF Vectorizer ###
tfidf_vec = TfidfVectorizer(ngram_range=(1,2))
full_tfidf = tfidf_vec.fit_transform(train_df['title'].values.tolist() + test_df['title'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['title'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['title'].values.tolist())

### SVD Components ###
n_comp = 3
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
train_svd.columns = ['svd_title_'+str(i+1) for i in range(n_comp)]
test_svd.columns = ['svd_title_'+str(i+1) for i in range(n_comp)]
train_df = pd.concat([train_df, train_svd], axis=1)
test_df = pd.concat([test_df, test_svd], axis=1)
del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd

In [17]:
### TFIDF Vectorizer ###
tfidf_vec = TfidfVectorizer(ngram_range=(1,2), max_features=15000)
full_tfidf = tfidf_vec.fit_transform(train_df['description'].values.tolist() + test_df['description'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['description'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['description'].values.tolist())

### SVD Components ###
n_comp = 3
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
train_svd.columns = ['svd_desc_'+str(i+1) for i in range(n_comp)]
test_svd.columns = ['svd_desc_'+str(i+1) for i in range(n_comp)]
train_df = pd.concat([train_df, train_svd], axis=1)
test_df = pd.concat([test_df, test_svd], axis=1)
del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd

In [18]:
train_df.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,user_type,image,image_top_1,deal_probability,image_score,avg_days_up_user,avg_times_up_user,n_user_items,activation_Year,activation_Month,activation_Week,activation_Day,activation_Dayofweek,activation_Dayofyear,activation_Is_month_end,activation_Is_month_start,activation_Is_quarter_end,activation_Is_quarter_start,activation_Is_year_end,activation_Is_year_start,activation_Elapsed,num_words_description,num_unique_words_description,num_words_title,num_unique_words_title,words_vs_unique_title,words_vs_unique_description,num_desc_punct,svd_title_1,svd_title_2,svd_title_3,svd_desc_1,svd_desc_2,svd_desc_3
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Свердловская область_Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,кокоби(кокон для сна),"кокон для сна малыша,пользовались меньше месяц...",400.0,2,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789,3283.0,8.0,2.0,2,2017,3,13,28,1,87,False,False,False,False,False,False,1490659200,7,7,3,3,100.0,100.0,2,0.001775388,-1.3e-05,0.001292,8.242352e-07,0.030402,-0.046948
1,2dac0150717d,39aeb48f0017,Самарская область,Самарская область_Самара,Для дома и дачи,Мебель и интерьер,Другое,,,стойка для одежды,"стойка для одежды, под вешалки. с бутика.",3000.0,19,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0,3349.0,,,1,2017,3,12,26,6,85,False,False,False,False,False,False,1490486400,7,7,3,3,100.0,100.0,3,0.00280514,-3e-06,0.002211,6.871823e-07,0.028727,-0.051477
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростовская область_Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,philips bluray,"в хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177,432.0,4.428571,1.142857,9,2017,3,12,20,0,79,False,False,False,False,False,False,1489968000,17,17,2,2,100.0,100.0,5,3.381291e-06,8e-06,2.6e-05,6.005857e-07,0.212441,0.091467
3,02996f1dd2ea,bf5cccea572d,Татарстан,Татарстан_Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,автокресло,продам кресло от0-25кг,2200.0,286,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323,620.0,16.714286,2.642857,32,2017,3,12,25,5,84,False,False,False,False,False,False,1490400000,3,3,1,1,100.0,100.0,1,4.792249e-05,2.5e-05,4.9e-05,5.613274e-06,0.085606,-0.082542
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоградская область_Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ваз 2110, 2003",все вопросы по телефону.,40000.0,3,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797,5806.0,,,1,2017,3,11,16,3,75,False,False,False,False,False,False,1489622400,4,4,3,3,100.0,100.0,1,3.985457e-07,1e-06,7e-06,4.633977e-06,0.099686,-0.173768


Final Variables

In [19]:
cat_vars = ['region', 'city', 'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3', 'user_type', 'activation_Week', 'activation_Day', 'activation_Dayofweek', 'image_top_1']
cols_to_drop = ["item_id", "user_id", "title", "description", "image", 'activation_Year', 'activation_Month', 'activation_Is_year_end', 'activation_Is_year_start', 'activation_Elapsed', 'activation_Is_month_end', 'activation_Is_month_start','activation_Is_quarter_end', 'activation_Is_quarter_start', 'activation_Dayofyear']
contin_vars = ['price', 'item_seq_number', 'svd_title_1', 'svd_title_2', 'svd_title_3', 'svd_desc_1', 'svd_desc_2', 'svd_desc_3', 'avg_days_up_user', 'avg_times_up_user', 'n_user_items', 'image_score','num_desc_punct', 'words_vs_unique_description', 'words_vs_unique_title', 'num_unique_words_description', 'num_unique_words_title', 'num_words_description', 'num_words_title']
#deal_probability, 'image_score'
print(len(cat_vars)+len(cols_to_drop)+len(contin_vars))

46


In [20]:
train_df['image_score'].fillna(-1.0, inplace=True)
test_df['image_score'].fillna(-1.0, inplace=True)
#train_df['image_score'] = train_df['image_score'].fillna(train_df['image_score'].mean())
#test_df['image_score'] = test_df['image_score'].fillna(train_df['image_score'].mean())

In [21]:
#train_df['avg_days_up_user'] = train_df['avg_days_up_user'].fillna(train_df['avg_days_up_user'].mean())
#test_df['avg_days_up_user'] = test_df['avg_days_up_user'].fillna(train_df['avg_days_up_user'].mean())
train_df['avg_days_up_user'].fillna(-1.0, inplace=True)
test_df['avg_days_up_user'].fillna(-1.0, inplace=True)

In [22]:
#train_df['avg_times_up_user'] = train_df['avg_times_up_user'].fillna(train_df['avg_times_up_user'].mean())
#test_df['avg_times_up_user'] = test_df['avg_times_up_user'].fillna(train_df['avg_times_up_user'].mean())
train_df['avg_times_up_user'].fillna(-1.0, inplace=True)
test_df['avg_times_up_user'].fillna(-1.0, inplace=True)

In [23]:
train_df['image_top_1'].fillna(-1, inplace=True)
test_df['image_top_1'].fillna(-1, inplace=True)

In [24]:
train_df['param_1'].fillna('Empty Empty', inplace=True)
test_df['param_1'].fillna('Empty Empty', inplace=True)

In [25]:
train_df['param_2'].fillna('Empty Empty', inplace=True)
test_df['param_2'].fillna('Empty Empty', inplace=True)

In [26]:
train_df['param_3'].fillna('Empty Empty', inplace=True)
test_df['param_3'].fillna('Empty Empty', inplace=True)

In [27]:
train_df["price"] = np.log(train_df["price"]+0.001)
test_df["price"] = np.log(test_df["price"]+0.001)
train_df['price'].fillna(-1.0, inplace=True)
test_df['price'].fillna(-1.0, inplace=True)
#train_df['price'].fillna(train_df['price'].mean(), inplace=True)
#test_df['price'].fillna(train_df['price'].mean(), inplace=True)

In [28]:
#to_cat = ['region', 'city', 'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3', 'user_type']
for col in cat_vars:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

In [29]:
train_df.isna().sum()

item_id                              0
user_id                              0
region                               0
city                                 0
parent_category_name                 0
category_name                        0
param_1                              0
param_2                              0
param_3                              0
title                                0
description                          0
price                                0
item_seq_number                      0
user_type                            0
image                           112588
image_top_1                          0
deal_probability                     0
image_score                          0
avg_days_up_user                     0
avg_times_up_user                    0
n_user_items                         0
activation_Year                      0
activation_Month                     0
activation_Week                      0
activation_Day                       0
activation_Dayofweek     

In [30]:
test_df.isna().sum()

item_id                             0
user_id                             0
region                              0
city                                0
parent_category_name                0
category_name                       0
param_1                             0
param_2                             0
param_3                             0
title                               0
description                         0
price                               0
item_seq_number                     0
user_type                           0
image                           42609
image_top_1                         0
image_score                         0
avg_days_up_user                    0
avg_times_up_user                   0
n_user_items                        0
activation_Year                     0
activation_Month                    0
activation_Week                     0
activation_Day                      0
activation_Dayofweek                0
activation_Dayofyear                0
activation_I

#val_idx = get_cv_idxs(len(train_df))
val_idx = get_cv_idxs(len(train_df),val_pct=0.338186699161381)
#val_idx = [0]

In [31]:
train_y = train_df["deal_probability"].values
test_id = test_df["item_id"].values

In [32]:
train_X = train_df.drop(cols_to_drop, axis=1)

In [33]:
test_X = test_df.drop(cols_to_drop, axis=1)

In [34]:
train_X.isna().sum()

region                          0
city                            0
parent_category_name            0
category_name                   0
param_1                         0
param_2                         0
param_3                         0
price                           0
item_seq_number                 0
user_type                       0
image_top_1                     0
deal_probability                0
image_score                     0
avg_days_up_user                0
avg_times_up_user               0
n_user_items                    0
activation_Week                 0
activation_Day                  0
activation_Dayofweek            0
num_words_description           0
num_unique_words_description    0
num_words_title                 0
num_unique_words_title          0
words_vs_unique_title           0
words_vs_unique_description     0
num_desc_punct                  0
svd_title_1                     0
svd_title_2                     0
svd_title_3                     0
svd_desc_1    

In [35]:
df, y, nas, mapper = proc_df(train_X, 'deal_probability', do_scale=True)

In [36]:
df.head()

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,image_top_1,image_score,avg_days_up_user,avg_times_up_user,n_user_items,activation_Week,activation_Day,activation_Dayofweek,num_words_description,num_unique_words_description,num_words_title,num_unique_words_title,words_vs_unique_title,words_vs_unique_description,num_desc_punct,svd_title_1,svd_title_2,svd_title_3,svd_desc_1,svd_desc_2,svd_desc_3
0,20,1302,5,43,250,39,476,-0.392415,-0.133095,2,1007,0.37472,-0.057108,0.790397,-0.201991,3,18,2,-0.452388,-0.491648,-0.176178,-0.174049,0.069167,0.594889,-0.315861,-0.160363,-0.27229,-0.159423,-0.289578,-0.481421,0.06316
1,18,1223,3,23,123,39,476,0.198465,-0.130044,2,691,0.405252,-1.510731,-1.703963,-0.204342,2,16,7,-0.452388,-0.491648,-0.176178,-0.174049,0.069167,0.594889,-0.282152,-0.151042,-0.272196,-0.147596,-0.289578,-0.500349,0.002843
2,17,1162,1,3,85,39,476,0.282829,-0.131839,2,3029,-0.944164,-0.633943,0.077723,-0.185535,2,10,1,-0.203823,-0.161518,-0.717567,-0.718742,0.069167,0.594889,-0.214733,-0.176403,-0.272093,-0.175729,-0.289578,1.57641,1.906593
3,22,1542,5,43,39,39,476,0.107511,-0.082131,1,795,-0.857194,1.350369,1.324903,-0.131467,2,15,6,-0.551814,-0.6237,-1.258955,-1.263436,0.069167,0.594889,-0.34957,-0.176,-0.271933,-0.17543,-0.28956,0.142627,-0.410883
4,5,250,7,1,279,120,45,0.958074,-0.132915,2,2263,1.54187,-1.510731,-1.703963,-0.204342,1,6,4,-0.526958,-0.590687,-0.176178,-0.174049,0.069167,0.594889,-0.34957,-0.17643,-0.272155,-0.175971,-0.289563,0.301794,-1.625843


In [37]:
print ('Categorical Variables:\t\t',cat_vars,'\n\nContinuous Variables:\t\t',contin_vars,'\n\nTotal Length:\t\t',len(cat_vars)+len(contin_vars))

Categorical Variables:		 ['region', 'city', 'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3', 'user_type', 'activation_Week', 'activation_Day', 'activation_Dayofweek', 'image_top_1'] 

Continuous Variables:		 ['price', 'item_seq_number', 'svd_title_1', 'svd_title_2', 'svd_title_3', 'svd_desc_1', 'svd_desc_2', 'svd_desc_3', 'avg_days_up_user', 'avg_times_up_user', 'n_user_items', 'image_score', 'num_desc_punct', 'words_vs_unique_description', 'words_vs_unique_title', 'num_unique_words_description', 'num_unique_words_title', 'num_words_description', 'num_words_title'] 

Total Length:		 31


In [38]:
test_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508438 entries, 0 to 508437
Data columns (total 31 columns):
region                          508438 non-null category
city                            508438 non-null category
parent_category_name            508438 non-null category
category_name                   508438 non-null category
param_1                         508438 non-null category
param_2                         508438 non-null category
param_3                         508438 non-null category
price                           508438 non-null float64
item_seq_number                 508438 non-null int64
user_type                       508438 non-null category
image_top_1                     508438 non-null category
image_score                     508438 non-null float64
avg_days_up_user                508438 non-null float64
avg_times_up_user               508438 non-null float64
n_user_items                    508438 non-null int64
activation_Week                 508438 non-n

In [39]:
df_test, _, nas, mapper = proc_df(test_X, do_scale=True, mapper=mapper, na_dict=nas)

In [40]:
df_test.head()

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,image_top_1,image_score,avg_days_up_user,avg_times_up_user,n_user_items,activation_Week,activation_Day,activation_Dayofweek,num_words_description,num_unique_words_description,num_words_title,num_unique_words_title,words_vs_unique_title,words_vs_unique_description,num_desc_punct,svd_title_1,svd_title_2,svd_title_3,svd_desc_1,svd_desc_2,svd_desc_3
0,5,242,5,11,109,178,63,-2.442697,-0.12161,2,2012,-0.370535,-0.137865,0.374671,-0.192588,2,7,2,-0.576671,-0.656713,-0.717567,-0.718742,0.069167,0.594889,-0.383279,-0.175941,-0.272004,-0.173844,-0.289571,1.255885,-3.794499
1,20,1253,9,6,118,33,394,0.198465,-0.132736,2,1,-1.144471,-1.510731,-1.703963,-0.204342,1,5,7,-0.377819,-0.392609,-0.717567,-0.718742,0.069167,0.594889,-0.248442,-0.092866,-0.273198,-0.130361,-0.289564,1.210629,-0.731885
2,13,829,1,3,311,33,394,0.670441,-0.130762,2,2942,0.108723,-0.380135,-0.041056,-0.197289,2,6,1,-0.104397,-0.029466,-1.258955,-1.263436,0.069167,0.594889,-0.34957,-0.176422,-0.272154,-0.17598,-0.28957,-0.163694,-0.384347
3,19,1185,3,5,107,112,394,0.31737,-0.120892,2,1,-1.144471,1.084256,1.3447,-0.162027,2,6,1,-0.328106,-0.326583,-0.176178,-0.174049,0.069167,0.594889,-0.248442,-0.172792,-0.19618,-0.121216,-0.28957,-0.198597,-0.175597
4,15,913,5,43,101,33,394,0.342343,-0.130762,2,995,0.473717,-0.299378,-0.041056,-0.194939,1,4,6,-0.377819,-0.392609,-0.717567,-0.718742,0.069167,0.594889,-0.248442,-0.173672,-0.271326,-0.157382,-0.289561,0.317347,-1.29726


In [41]:
cat_sz = [(c, len(train_X[c].cat.categories)+1) for c in cat_vars]
cat_sz

[('region', 29),
 ('city', 1805),
 ('parent_category_name', 10),
 ('category_name', 48),
 ('param_1', 373),
 ('param_2', 273),
 ('param_3', 1221),
 ('user_type', 4),
 ('activation_Week', 5),
 ('activation_Day', 22),
 ('activation_Dayofweek', 8),
 ('image_top_1', 3064)]

In [42]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
emb_szs

[(29, 15),
 (1805, 50),
 (10, 5),
 (48, 24),
 (373, 50),
 (273, 50),
 (1221, 50),
 (4, 2),
 (5, 3),
 (22, 11),
 (8, 4),
 (3064, 50)]

In [43]:
def inv_y(a): return np.exp(a)

def exp_rmspe(y_pred, targ):
    targ = inv_y(targ)
    pct_var = (targ - inv_y(y_pred))/targ
    return math.sqrt((pct_var**2).mean())

#max_log_y = np.max(y)
#y_range = (0, max_log_y*1.2)
y_range = (0, 1)

In [44]:
y_range

(0, 1)

In [45]:
val_idxs = np.array_split(np.random.permutation(train_df.index),10)

In [46]:
predictions = []

In [47]:
lr = 0.5*(1e-4)

In [48]:
epoch_count = 1
for val_idx in val_idxs:
    print ('Epoch number: ', epoch_count)
    md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y, cat_flds=cat_vars, bs=128, test_df=df_test)
    m = md.get_learner(emb_szs, len(contin_vars), 0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
    m.fit(lr, 1, metrics=[exp_rmspe])
    m.fit(lr, 2, cycle_len=1, cycle_mult=2,metrics=[exp_rmspe])
    m.fit(lr, 2, cycle_len=1, cycle_mult=3,metrics=[exp_rmspe])
    predictions.append(m.predict(True))
    epoch_count += 1

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.05202    0.05153    0.200019  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.04921    0.050793   0.201578  
    1      0.050177   0.050439   0.19886                           
    2      0.049258   0.050236   0.20126                           



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.051862   0.051583   0.201688  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.050571   0.051075   0.202055  
    1      0.049185   0.050704   0.201319                          
    2      0.050367   0.050579   0.201389                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.050782   0.051719   0.199793  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.050221   0.051126   0.201874  
    1      0.051058   0.050842   0.200506                          
    2      0.049932   0.050648   0.201443                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.052165   0.051963   0.196899  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.050236   0.051228   0.201133  
    1      0.049718   0.050799   0.203272                          
    2      0.048881   0.050617   0.200662                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.052957   0.051439   0.197317  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.05043    0.050773   0.200544  
    1      0.049183   0.050464   0.197971                          
    2      0.051248   0.050235   0.200367                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.05107    0.05131    0.20038   



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.051434   0.050722   0.201084  
    1      0.049636   0.050448   0.19678                           
    2      0.050496   0.050183   0.200664                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.051085   0.051879   0.203306  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.051518   0.051412   0.201696  
    1      0.050996   0.050929   0.203874                          
    2      0.052929   0.050676   0.200607                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.053643   0.052039   0.202675  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.048368   0.051555   0.202072  
    1      0.049972   0.051088   0.200198                          
    2      0.049301   0.050905   0.201989                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.052904   0.052661   0.202013  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.051185   0.052295   0.202739  
    1      0.051466   0.052177   0.205967                          
    2      0.053219   0.052017   0.203348                          



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.052559   0.052248   0.204933  



HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe                         
    0      0.051637   0.051537   0.201572  
    1      0.049274   0.050996   0.199194                          
    2      0.049661   0.050788   0.200979                          



md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y, cat_flds=cat_vars, bs=128, test_df=df_test)

m = md.get_learner(emb_szs, len(contin_vars), 0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)

m.lr_find()

m.sched.plot(100)

lr = 0.5*(1e-4)

m.fit(lr, 1, metrics=[exp_rmspe])

m.fit(lr, 2, cycle_len=1, cycle_mult=2,metrics=[exp_rmspe])

m.fit(lr, 3, cycle_len=1, cycle_mult=2,metrics=[exp_rmspe])

pred_test=m.predict(True)

In [56]:
mean_predictions = np.mean(predictions, axis=0)

In [59]:
test_df['deal_probability']=mean_predictions

In [60]:
test_df.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,user_type,image,image_top_1,image_score,avg_days_up_user,avg_times_up_user,n_user_items,activation_Year,activation_Month,activation_Week,activation_Day,activation_Dayofweek,activation_Dayofyear,activation_Is_month_end,activation_Is_month_start,activation_Is_quarter_end,activation_Is_quarter_start,activation_Is_year_end,activation_Is_year_start,activation_Elapsed,num_words_description,num_unique_words_description,num_words_title,num_unique_words_title,words_vs_unique_title,words_vs_unique_description,num_desc_punct,svd_title_1,svd_title_2,svd_title_3,svd_desc_1,svd_desc_2,svd_desc_3,deal_probability
0,6544e41a8817,dbe73ad6e4b5,Волгоградская область,Волгоградская область_Волгоград,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,25,отдам бесплатно,на ангарском,-1.0,66,Private,a8b57acb5ab304f9c331ac7a074219aed4d349d8aef386...,2020.0,1672.0,7.5,1.5,6,2017,4,16,18,1,108,False,False,False,False,False,False,1492473600,2,2,2,2,100.0,100.0,0,5.4e-05,1.7e-05,0.000172,2e-06,0.184086,-0.336603,0.124695
1,65b9484d670f,2e11806abe57,Свердловская область,Свердловская область_Нижняя Тура,Хобби и отдых,Велосипеды,Дорожные,Empty Empty,Empty Empty,продам велосипед,"продам велосипед kama f200,в нормальном состо...",8.006368,4,Private,,-1.0,-1.0,-1.0,-1.0,1,2017,4,15,16,6,106,False,False,False,False,False,False,1492300800,10,10,2,2,100.0,100.0,4,0.009232,-0.000109,0.003549,4e-06,0.180083,-0.106644,0.075944
2,8bab230b2ecd,0b850bbebb10,Новосибирская область,Новосибирская область_Бердск,Бытовая электроника,Аудио и видео,Телевизоры и проекторы,Empty Empty,Empty Empty,bbk,продам новый телевизор bbk 32 диагональ смарт...,9.615806,15,Private,8c361112cb049745ef2d1b0ae73594fc5c107286b0c942...,2960.0,2708.0,6.0,1.0,4,2017,4,16,17,0,107,False,False,False,False,False,False,1492387200,21,21,1,1,100.0,100.0,1,1e-06,1e-06,7e-06,3e-06,0.058508,-0.080549,0.060955
3,8e348601fefc,5f1d5c3ce0da,Саратовская область,Саратовская область_Саратов,Для дома и дачи,Бытовая техника,Для кухни,Вытяжки,Empty Empty,вытяжка jetair 60,"продам новую вытяжку в упаковке,с документами....",8.411833,70,Private,,-1.0,-1.0,15.066667,2.666667,19,2017,4,16,17,0,107,False,False,False,False,False,False,1492387200,12,12,3,3,100.0,100.0,4,0.000402,0.008005,0.004259,3e-06,0.055421,-0.064875,0.086297
4,8bd2fe400b89,23e2d97bfc7f,Оренбургская область,Оренбургская область_Бузулук,Личные вещи,Товары для детей и игрушки,Детские коляски,Empty Empty,Empty Empty,коляска зима-лето,продам отличную коляску. б/у 1 год. все вопрос...,8.496991,15,Private,bc3cf6deef10840fc302e38eb48fa7748aa1e28d534f8f...,1002.0,3497.0,6.5,1.0,5,2017,4,15,15,5,105,False,False,False,False,False,False,1492214400,10,10,2,2,100.0,100.0,4,0.000305,8.9e-05,0.001451,5e-06,0.101062,-0.149096,0.057388


In [61]:
csv_fn=f'{PATH}sub.csv'

In [62]:
test_df[['item_id','deal_probability']].to_csv(csv_fn, index=False)

In [63]:
FileLink(csv_fn)