In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Any results you write to the current directory are saved as output.

train = pd.read_csv("D:/train.tsv", sep='\t')
test = pd.read_csv("D:/test_stg2.tsv", sep='\t')

In [4]:
# new features
# len of description

train['item_description'] = train['item_description'].astype(str)
test['item_description'] = test['item_description'].astype(str)

train['des_len'] = train['item_description'].apply(lambda x: len(x))
test['des_len'] = test['item_description'].apply(lambda x: len(x))

In [6]:
# words in description
train['word_count'] = train['item_description'].apply(lambda x: len(x.split()))
test['word_count'] = test['item_description'].apply(lambda x: len(x.split()))

In [7]:
# men len of words in description inversed and scaled
train['mean_des'] = train['item_description'].apply(lambda x: float(len(x.split())) / len(x))  * 10
test['mean_des'] = test['item_description'].apply(lambda x: float(len(x.split())) / len(x)) * 10 

In [8]:
# length of name
train['name_len'] = train['name'].apply(lambda x: len(x))
test['name_len'] = test['name'].apply(lambda x: len(x))

In [9]:
# words in name
train['word_name'] = train['name'].apply(lambda x: len(x.split()))
test['word_name'] = test['name'].apply(lambda x: len(x.split()))

In [10]:
# men len of words in name inversed and scaled
train['mean_name'] = train['name'].apply(lambda x: float(len(x.split())) / len(x))  * 10
test['mean_name'] = test['name'].apply(lambda x: float(len(x.split())) / len(x)) * 10 

In [11]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,des_len,word_count,mean_des,name_len,word_name,mean_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,18,3,1.666667,35,7,2.0
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,188,36,1.914894,32,4,1.25
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,124,29,2.33871,14,2,1.428571
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,173,32,1.849711,21,3,1.428571
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,41,5,1.219512,20,4,2.0


In [12]:
# missing value imputation
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          0
des_len                   0
word_count                0
mean_des                  0
name_len                  0
word_name                 0
mean_name                 0
dtype: int64

In [13]:
train['category_name'].value_counts()

Women/Athletic Apparel/Pants, Tights, Leggings                 60177
Women/Tops & Blouses/T-Shirts                                  46380
Beauty/Makeup/Face                                             34335
Beauty/Makeup/Lips                                             29910
Electronics/Video Games & Consoles/Games                       26557
Beauty/Makeup/Eyes                                             25215
Electronics/Cell Phones & Accessories/Cases, Covers & Skins    24676
Women/Underwear/Bras                                           21274
Women/Tops & Blouses/Tank, Cami                                20284
Women/Tops & Blouses/Blouse                                    20284
Women/Dresses/Above Knee, Mini                                 20082
Women/Jewelry/Necklaces                                        19758
Women/Athletic Apparel/Shorts                                  19528
Beauty/Makeup/Makeup Palettes                                  19103
Women/Shoes/Boots                 

In [14]:
train['category_name'].fillna('ppp/ppp/ppp', inplace=True)
test['category_name'].fillna('ppp/ppp/ppp', inplace=True)

In [15]:
train['elec'] = train['category_name'].apply(lambda x : int('electronics' in x.lower()))
test['elec'] = test['category_name'].apply(lambda x : int('electronics' in x.lower()))

In [17]:
train['brand_name'].fillna('ttttttt', inplace=True)
test['brand_name'].fillna('ttttttt', inplace=True)

In [19]:
train.isnull().sum()

train_id             0
name                 0
item_condition_id    0
category_name        0
brand_name           0
price                0
shipping             0
item_description     0
des_len              0
word_count           0
mean_des             0
name_len             0
word_name            0
mean_name            0
elec                 0
dtype: int64

In [20]:
test.isnull().sum()

test_id              0
name                 0
item_condition_id    0
category_name        0
brand_name           0
shipping             0
item_description     0
des_len              0
word_count           0
mean_des             0
name_len             0
word_name            0
mean_name            0
elec                 0
dtype: int64

In [21]:
#length of category words
train['cat_lennn'] = train['category_name'].apply(lambda x: len(x))
test['cat_lennn'] = test['category_name'].apply(lambda x: len(x))

In [22]:
def was_priced(x):
    return int('[rm]' in x)
               
train['rm'] = train['item_description'].apply( lambda x : was_priced(x))
test['rm'] = test['item_description'].apply( lambda x : was_priced(x))

In [23]:
train['was_described'] = 1
test['was_described'] = 1

train.loc[ train['item_description'] == 'No description yet','was_described'] = 0
test.loc[ test['item_description'] == 'No description yet','was_described'] = 0

In [24]:
# description containes 'new' word
train['new'] = train['item_description'].apply(lambda x : int('new' in x.lower()))
test['new'] = test['item_description'].apply(lambda x : int('new' in x.lower()))

In [25]:
# splitting subcategories of category_name
train_cat = pd.DataFrame(train.category_name.str.split('/',2).tolist(),
                                   columns = ['sub1','sub2', 'sub3'])
train['sub1'] = train_cat['sub1']
train['sub2'] = train_cat['sub2']
train['sub3'] = train_cat['sub3']

test_cat = pd.DataFrame(test.category_name.str.split('/',2).tolist(),
                                   columns = ['sub1','sub2', 'sub3'])

test['sub1'] = test_cat['sub1']
test['sub2'] = test_cat['sub2']
test['sub3'] = test_cat['sub3']

train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,des_len,word_count,...,word_name,mean_name,elec,cat_lennn,rm,was_described,new,sub1,sub2,sub3
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,ttttttt,10.0,1,No description yet,18,3,...,7,2.0,0,17,0,0,0,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,188,36,...,4,1.25,1,50,0,1,0,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,124,29,...,2,1.428571,0,27,0,1,0,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,ttttttt,35.0,1,New with tags. Leather horses. Retail for [rm]...,173,32,...,3,1.428571,0,34,1,1,1,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,ttttttt,44.0,0,Complete with certificate of authenticity,41,5,...,4,2.0,0,23,0,1,0,Women,Jewelry,Necklaces


In [26]:
train['hand'] = train['category_name'].apply(lambda x : int('handmade' in x.lower()))
test['hand'] = test['category_name'].apply(lambda x : int('handmade' in x.lower()))

In [27]:
train['men'] = train['category_name'].apply(lambda x : int('men' in x.lower()))
test['men'] = test['category_name'].apply(lambda x : int('men' in x.lower()))

In [28]:
# int in description
import re
train['int_desc'] = train['item_description'].apply(lambda x : int(bool(re.search(r'\d',x))))
test['int_desc'] = test['item_description'].apply(lambda x : int(bool(re.search(r'\d',x))))

In [29]:
# integer was present in name
train['int_name'] = train['name'].apply(lambda x : int(bool(re.search(r'\d',x))))
test['int_name'] = test['name'].apply(lambda x : int(bool(re.search(r'\d',x))))

In [30]:
# word condition was present in description
train['cond'] = train['item_description'].apply(lambda x : int('condition' in x.lower()))
test['cond'] = test['item_description'].apply(lambda x : int('condition' in x.lower()))

In [31]:
train['category_name'].value_counts()

Women/Athletic Apparel/Pants, Tights, Leggings                 60177
Women/Tops & Blouses/T-Shirts                                  46380
Beauty/Makeup/Face                                             34335
Beauty/Makeup/Lips                                             29910
Electronics/Video Games & Consoles/Games                       26557
Beauty/Makeup/Eyes                                             25215
Electronics/Cell Phones & Accessories/Cases, Covers & Skins    24676
Women/Underwear/Bras                                           21274
Women/Tops & Blouses/Blouse                                    20284
Women/Tops & Blouses/Tank, Cami                                20284
Women/Dresses/Above Knee, Mini                                 20082
Women/Jewelry/Necklaces                                        19758
Women/Athletic Apparel/Shorts                                  19528
Beauty/Makeup/Makeup Palettes                                  19103
Women/Shoes/Boots                 

In [32]:
# converting price to log scale
positive = train['price'].values > 0
negative = train['price'].values < 0
train['price'] = np.piecewise(train['price'], (positive, negative), (np.log, lambda x: -np.log(-x)))

features = ['int_name',  'cond','int_desc', 'new', 'was_described', 'men', 'rm', 'item_condition_id','cat_lennn',  'brand_name', 'shipping', 'des_len', 'name_len','mean_des', 'word_count', 'mean_name', 'word_name', 'sub1', 'sub2', 'hand', 'elec', 'category_name']

data = train[features]
data_sub = test[features]

y = train['price']

In [33]:
data_sub.head()

Unnamed: 0,int_name,cond,int_desc,new,was_described,men,rm,item_condition_id,cat_lennn,brand_name,...,name_len,mean_des,word_count,mean_name,word_name,sub1,sub2,hand,elec,category_name
0,0,0,1,0,1,1,0,1,19,ttttttt,...,40,3.333333,2,2.0,8,Women,Jewelry,0,0,Women/Jewelry/Rings
1,1,0,1,1,1,0,0,1,39,ttttttt,...,40,1.513944,38,1.75,7,Other,Office supplies,0,0,Other/Office supplies/Shipping Supplies
2,0,0,0,1,1,0,1,1,46,Coach,...,9,2.0,11,2.222222,2,Vintage & Collectibles,Bags and Purses,0,0,Vintage & Collectibles/Bags and Purses/Handbag
3,0,0,0,0,1,1,0,2,23,ttttttt,...,13,1.492537,10,1.538462,2,Women,Sweaters,0,0,Women/Sweaters/Cardigan
4,0,1,1,0,1,0,0,3,35,ttttttt,...,16,1.736527,29,1.875,3,Other,Books,0,0,Other/Books/Religion & Spirituality


In [34]:
#label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

frames = [ data, data_sub ]
xx = pd.concat(frames)


l = [ 'brand_name', 'sub1', 'sub2', 'category_name']
for x in l :
    le.fit(xx[x])
    data[x] = le.transform(data[x])
    data_sub[x] = le.transform(data_sub[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [35]:
data.head()

Unnamed: 0,int_name,cond,int_desc,new,was_described,men,rm,item_condition_id,cat_lennn,brand_name,...,name_len,mean_des,word_count,mean_name,word_name,sub1,sub2,hand,elec,category_name
0,0,0,0,0,0,1,0,3,17,6309,...,35,1.666667,3,2.0,7,5,102,0,0,864
1,0,1,0,0,1,0,0,3,50,4660,...,32,1.914894,36,1.25,4,1,30,0,1,87
2,0,0,1,0,1,1,0,1,27,5489,...,14,2.33871,29,1.428571,2,9,103,0,0,1320
3,0,0,0,1,1,0,1,1,34,6309,...,21,1.849711,32,1.428571,3,3,55,0,0,536
4,1,0,0,0,1,1,0,1,23,6309,...,20,1.219512,5,2.0,4,9,58,0,0,1247


In [36]:
from sklearn import ensemble
clf =  ensemble.GradientBoostingRegressor( learning_rate = 0.7, n_estimators=700, max_depth = 3,warm_start = True, verbose=1, random_state=45, max_features = 0.8)
clf.fit(data, y)

      Iter       Train Loss   Remaining Time 
         1           0.5617           30.19m
         2           0.5379           25.12m
         3           0.5214           25.43m
         4           0.5097           25.86m
         5           0.5012           26.06m
         6           0.4948           26.11m
         7           0.4906           26.76m
         8           0.4848           26.70m
         9           0.4769           26.71m
        10           0.4735           26.20m
        20           0.4459           24.11m
        30           0.4304           23.77m
        40           0.4168           23.64m
        50           0.4077           23.22m
        60           0.4014           22.85m
        70           0.3965           22.38m
        80           0.3918           21.99m
        90           0.3858           21.52m
       100           0.3829           21.21m
       200           0.3612           17.58m
       300           0.3521           14.05m
       40

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.7, loss='ls', max_depth=3, max_features=0.8,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=700, presort='auto', random_state=45,
             subsample=1.0, verbose=1, warm_start=True)

In [37]:
# predicting and saving to output file
predicted = clf.predict(data_sub) 

print(features)
print( clf.feature_importances_)

['int_name', 'cond', 'int_desc', 'new', 'was_described', 'men', 'rm', 'item_condition_id', 'cat_lennn', 'brand_name', 'shipping', 'des_len', 'name_len', 'mean_des', 'word_count', 'mean_name', 'word_name', 'sub1', 'sub2', 'hand', 'elec', 'category_name']
[ 0.01619746  0.00570796  0.01622108  0.01096618  0.00108968  0.00628054
  0.00936334  0.04015075  0.09132368  0.31128848  0.03036243  0.03771549
  0.02539165  0.0276034   0.02471134  0.0275539   0.01060858  0.00996127
  0.07815146  0.00426548  0.00933585  0.20575001]


In [38]:
out = pd.DataFrame()

In [39]:
out['test_id'] = test['test_id']
out['price'] = predicted
out['price'] = np.exp(out['price'])

In [40]:
out.head()

Unnamed: 0,test_id,price
0,0,9.859434
1,1,9.466358
2,2,41.026956
3,3,17.050238
4,4,10.612051


In [41]:
out.to_csv("d:/output_DynamicPricing.csv",index=False)