In [30]:
import os,sys,string,re
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt

to_space= '''&,()+'"/'''
trans=string.maketrans(to_space, ' '*len(to_space))
ps = PorterStemmer()

os.chdir('/data/arpit.goel/31_PricePrediction/03.Submissions')
pd.set_option('display.float_format', lambda x: '%.1f' % x)

df_raw_train=pd.read_csv('../input/train.tsv',delimiter='\t')
df_raw_test=pd.read_csv('../input/test.tsv',delimiter='\t')
df_raw_train['log_price']=np.clip(np.log(1+df_raw_train['price']),1,5)

ins=df_raw_train.sample(frac=0.8,random_state=200)
oos=df_raw_train.drop(ins.index)
oot=get_features(df_raw_test)

In [71]:
def clean_category(x):
    if len(x)==0:
        return ''
    x=re.sub(chr(195),'e',x)
    x=re.sub(chr(169),'',x)
    x=x.translate(trans)
    x=re.sub(' +',' ',x)
    x=x.lower()
    x=' '.join(list(set(map(ps.stem,x.split(' ')))))
    x=re.sub('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', '',x)
    return x

def get_group_vars(df,x,threshold):
    df_group=df.groupby(x)[['count','log_price']].sum().reset_index()
    df_group['new_grp']=np.where(df_group['count']>threshold,df_group.index,-999)
    df_group_map=df_group[x+['new_grp']]
    df_group=df_group.groupby('new_grp')[['count','log_price']].sum()
    df_group['freq']=df_group['count']/df_group['count'].sum()
    df_group['avg_log_price']=df_group['log_price']/df_group['count']
    df_group_map['%s_freq'%'_'.join(x)]=df_group_map['new_grp'].map(df_group['freq'])
    df_group_map['%s_avg_price'%'_'.join(x)]=df_group_map['new_grp'].map(df_group['avg_log_price'])
    df=pd.merge(df,df_group_map[x+['%s_freq'%'_'.join(x),'%s_avg_price'%'_'.join(x)]],on=x)
    return df
  
cat_price=ins.groupby('category_name')['log_price'].apply(lambda x: {'log_price':x.sum(),'count':x.size}).unstack().reset_index()
cat_price['category0']=cat_price['category_name'].map(lambda x: str(x).split('/')[0]).map(clean_category)
cat_price['category1']=cat_price['category_name'].map(lambda x: str(x).split('/')[1] if str(x).count('/')>=1 else 'na').map(clean_category)
cat_price['category2']=cat_price['category_name'].map(lambda x: str(x).split('/')[2] if str(x).count('/')>=2 else 'na').map(clean_category)    
cat_price=get_group_vars(cat_price,['category0'],500)
cat_price=get_group_vars(cat_price,['category1'],500)
cat_price=get_group_vars(cat_price,['category2'],500)
cat_price=get_group_vars(cat_price,['category0','category1'],500)
cat_price=get_group_vars(cat_price,['category1','category2'],500)
cat_price=get_group_vars(cat_price,['category0','category2'],500)
cat_price=get_group_vars(cat_price,['category0','category1','category2'],500)
cat_cols=['category%d'%x for x in range(3)]
cat_price['cat_1_max_avg_price']=cat_price[['%s_avg_price'%x for x in cat_cols]].apply(lambda x: np.max(x),axis=1)
cat_price['cat_1_min_avg_price']=cat_price[['%s_avg_price'%x for x in cat_cols]].apply(lambda x: np.min(x),axis=1)
cat_price['cat_1_med_avg_price']=cat_price[['%s_avg_price'%x for x in cat_cols]].apply(lambda x: np.median(x),axis=1)
cat_price['cat_2_max_avg_price']=cat_price[['%s_%s_avg_price'%(x,y) for x in cat_cols for y in cat_cols if x<y]].apply(lambda x: np.max(x),axis=1)
cat_price['cat_2_min_avg_price']=cat_price[['%s_%s_avg_price'%(x,y) for x in cat_cols for y in cat_cols if x<y]].apply(lambda x: np.min(x),axis=1)
cat_price['cat_2_med_avg_price']=cat_price[['%s_%s_avg_price'%(x,y) for x in cat_cols for y in cat_cols if x<y]].apply(lambda x: np.median(x),axis=1)
cat_price=cat_price[[x for x in cat_price.columns if 'category_name' in x or '_freq' in x or '_avg_price' in x]]

brand_price=ins.groupby('brand_name')['log_price'].apply(lambda x: {'log_price':x.sum(),'count':x.size}).unstack().reset_index()
brand_price=get_group_vars(brand_price,['brand_name'],500)
brand_price=brand_price[[x for x in brand_price.columns if 'brand_name' in x or '_freq' in x or '_avg_price' in x]]


In [32]:
def get_condition(x):
    x=str(x).lower()
    x=re.sub('[^0-9a-zA-Z]+',' ',x)
    x=' '.join(list(set(map(ps.stem,x.split(' ')))))
    if 'descript yet no' in x:
        return 'no_description'
    elif 'new' in x or 'never' in x or 'nwot' in x or 'nwt' in x or 'onc' in x or 'twice' in x:
        return 'new'
    elif 'use' in x or 'flaw no' in x:
        return 'used'
    elif 'ship free' in x:
        return 'free_shipping'
    elif 'condit' in x:
        return 'used_good_condition'
    else:
        return 'other'
    
all_descriptions=pd.concat([df_raw_train['item_description'],df_raw_test['item_description']]).value_counts()
all_descriptions=pd.DataFrame(all_descriptions[all_descriptions>=100])
all_descriptions['description']=all_descriptions.index
all_descriptions=all_descriptions['description'].map(get_condition)

In [74]:
def get_features(df):
    df['condition']=df['item_description'].map(all_descriptions).fillna('other')
    df=pd.concat([df,pd.get_dummies(df['condition'])],axis=1)
    df=pd.merge(df,cat_price,on='category_name',how='left')
    df=pd.merge(df,brand_price,on='brand_name',how='left')
    df['words_in_description']=df['item_description'].astype(np.str).map(lambda x: x.count(' '))
    df['chars_in_description']=df['item_description'].astype(np.str).map(lambda x: len(x))
    df['words_in_name']=df['name'].map(lambda x: x.count(' '))
    df['chars_in_name']=df['name'].map(lambda x: len(x))
    df['cost_in_description']=df['item_description'].astype(np.str).map(lambda x: 1 if '[rm]' in x else 0)
    return df

ins_w_features=get_features(ins)
oos_w_features=get_features(oos)
oot_w_features=get_features(oot)


In [75]:
features=['item_condition_id','shipping']+ins_w_features.columns.tolist()[10:]
means=ins_w_features[features].mean()
ins_w_features=ins_w_features.fillna(means)
oos_w_features=oos_w_features.fillna(means)
oot_w_features=oot_w_features.fillna(means)


In [45]:
from sklearn.linear_model import Lasso
pd.set_option('display.float_format', lambda x: '%.4f' % x)

performance=[]
for alpha in [0,0.001,0.003,0.01,0.03,0.1,0.3,1]:
    reg=Lasso(alpha=alpha)
    reg.fit(ins_w_features[features],ins_w_features['log_price'])
    performance.append([alpha,reg.score(ins_w_features[features],ins_w_features['log_price']), reg.score(oos_w_features[features],oos_w_features['log_price'])])
    
pd.DataFrame(performance,columns=['alpha','train','test'])



Unnamed: 0,alpha,train,test
0,0.0,0.2945,0.2952
1,0.001,0.2901,0.2903
2,0.003,0.2875,0.2876
3,0.01,0.2768,0.2771
4,0.03,0.2317,0.2317
5,0.1,0.0022,0.002
6,0.3,0.0007,0.0006
7,1.0,0.0007,0.0006


In [46]:
reg=Lasso(alpha=0.01)
reg.fit(ins_w_features[features],ins_w_features['log_price'])
print pd.Series(reg.coef_,index=features).sort_values()

shipping                                  -0.2624
item_condition_id                         -0.0466
brand_name_freq                           -0.0000
cat_2_med_avg_price                        0.0000
cat_2_min_avg_price                        0.0000
cat_1_med_avg_price                        0.0000
cat_1_min_avg_price                        0.0000
cat_1_max_avg_price                        0.0000
category0_category1_category2_avg_price    0.0000
category0_category1_category2_freq        -0.0000
category0_category2_avg_price              0.0000
category0_category2_freq                  -0.0000
category1_category2_freq                  -0.0000
category0_category1_avg_price              0.0000
category1_category2_avg_price              0.0000
other                                      0.0000
free_shipping                             -0.0000
new                                       -0.0000
no_description                            -0.0000
category0_category1_freq                  -0.0000


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

parameters = {'n_estimators':10,'max_depth':np.arange(3,10,2)}
rfr=RandomForestRegressor(min_samples_leaf=5000,n_jobs=1,oob_score=True)
reg = GridSearchCV(rfr, parameters, n_jobs=-1)
reg.fit(ins_w_features[features],ins_w_features['log_price'])
reg.grid_scores_

In [78]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

rf_features=['category1_category2_avg_price','brand_name_avg_price','shipping','cat_2_med_avg_price',\
             'item_condition_id','cat_2_max_avg_price','chars_in_description','cat_2_min_avg_price',\
             'cat_1_min_avg_price','category0_category1_category2_avg_price','brand_name_freq']

rfr=RandomForestRegressor(n_estimators=5,max_depth=10,min_samples_leaf=500,n_jobs=-1,oob_score=True)
rfr.fit(ins_w_features[rf_features],ins_w_features['log_price'])
print rfr.score(ins_w_features[rf_features],ins_w_features['log_price'])
print rfr.score(oos_w_features[rf_features],oos_w_features['log_price'])

pd.Series(rfr.feature_importances_,index=rf_features).sort_values(ascending=False).cumsum()

0.401115985616
0.399958014223


category1_category2_avg_price             0.2633
brand_name_avg_price                      0.5143
cat_2_max_avg_price                       0.6913
shipping                                  0.7943
cat_2_med_avg_price                       0.8763
item_condition_id                         0.9211
cat_2_min_avg_price                       0.9449
chars_in_description                      0.9676
cat_1_min_avg_price                       0.9839
brand_name_freq                           0.9941
category0_category1_category2_avg_price   1.0000
dtype: float64

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
#count_descp = TfidfVectorizer(max_features = 50000,ngram_range = (1,3),stop_words = "english")
#X=count_descp.fit_transform(ins_w_features['item_description'].fillna('No description yet'))
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.01)
print("Fitting Model")
model.fit(X, ins_w_features['log_price'])



Fitting Model


KeyboardInterrupt: 

In [86]:
count_descp.get_feature_names()

[u'00',
 u'00 00',
 u'00 advantage',
 u'00 advantage kindness',
 u'00 bundles',
 u'00 bundles 45',
 u'00 buy',
 u'00 buy pair',
 u'00 free',
 u'00 free shipping',
 u'00 pm',
 u'00 pm pacific',
 u'00 purchased',
 u'00 purchased free',
 u'00 remember',
 u'00 remember purchasing',
 u'00 shipping',
 u'00 sizing',
 u'00 sizing currently',
 u'000',
 u'001',
 u'002',
 u'003',
 u'007',
 u'00ct',
 u'00pm',
 u'01',
 u'01 oz',
 u'01 product',
 u'01 product details',
 u'01 product detials',
 u'010',
 u'02',
 u'02 authentic',
 u'02 authentic yeah',
 u'02 choose',
 u'02 choose color',
 u'02 fl',
 u'02 fl oz',
 u'02 long',
 u'02 long order',
 u'02 oz',
 u'03',
 u'03 2020',
 u'03 authentic',
 u'03 authentic new',
 u'03 authentic yeah',
 u'03 authentic yes',
 u'03 long',
 u'03 long order',
 u'03 oz',
 u'03 oz 0g',
 u'04',
 u'04 better',
 u'04 better price',
 u'04 buy',
 u'04 buy item',
 u'04 long',
 u'04 long order',
 u'04 oz',
 u'05',
 u'05 fl',
 u'05 fl oz',
 u'05 long',
 u'05 long order',
 u'05 oz',

In [95]:
def clean_description(x):
    if len(x)==0:
        return ''
    x=x.lower()
    return x

from collections import Counter
char_count=Counter([y for x in ins_w_features['item_description'].astype(np.str) for y in x])

In [100]:
pd.Series(char_count).sort_values(ascending=False).shape

(197,)